Index: head/sys/geom/bde/g_bde.c =================================================================== --- head/sys/geom/bde/g_bde.c (revision 326269) +++ head/sys/geom/bde/g_bde.c (revision 326270) @@ -1,292 +1,294 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define BDE_CLASS_NAME "BDE" FEATURE(geom_bde, "GEOM-based Disk Encryption"); static void g_bde_start(struct bio *bp) { switch (bp->bio_cmd) { case BIO_DELETE: case BIO_READ: case BIO_WRITE: g_bde_start1(bp); break; case BIO_GETATTR: g_io_deliver(bp, EOPNOTSUPP); break; default: g_io_deliver(bp, EOPNOTSUPP); return; } return; } static void g_bde_orphan(struct g_consumer *cp) { struct g_geom *gp; struct g_provider *pp; struct g_bde_softc *sc; g_trace(G_T_TOPOLOGY, "g_bde_orphan(%p/%s)", cp, cp->provider->name); g_topology_assert(); gp = cp->geom; sc = gp->softc; gp->flags |= G_GEOM_WITHER; LIST_FOREACH(pp, &gp->provider, provider) g_wither_provider(pp, ENXIO); bzero(sc, sizeof(struct g_bde_softc)); /* destroy evidence */ return; } static int g_bde_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0) { de++; dr++; } /* ... 
and let go of it on last close */ if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1) { de--; dr--; } return (g_access(cp, dr, dw, de)); } static void g_bde_create_geom(struct gctl_req *req, struct g_class *mp, struct g_provider *pp) { struct g_geom *gp; struct g_consumer *cp; struct g_bde_key *kp; int error, i; u_int sectorsize; off_t mediasize; struct g_bde_softc *sc; void *pass; void *key; g_trace(G_T_TOPOLOGY, "g_bde_create_geom(%s, %s)", mp->name, pp->name); g_topology_assert(); gp = NULL; gp = g_new_geomf(mp, "%s.bde", pp->name); cp = g_new_consumer(gp); g_attach(cp, pp); error = g_access(cp, 1, 1, 1); if (error) { g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); gctl_error(req, "could not access consumer"); return; } pass = NULL; key = NULL; do { pass = gctl_get_param(req, "pass", &i); if (pass == NULL || i != SHA512_DIGEST_LENGTH) { gctl_error(req, "No usable key presented"); break; } key = gctl_get_param(req, "key", &i); if (key != NULL && i != 16) { gctl_error(req, "Invalid key presented"); break; } sectorsize = cp->provider->sectorsize; mediasize = cp->provider->mediasize; sc = g_malloc(sizeof(struct g_bde_softc), M_WAITOK | M_ZERO); gp->softc = sc; sc->geom = gp; sc->consumer = cp; error = g_bde_decrypt_lock(sc, pass, key, mediasize, sectorsize, NULL); bzero(sc->sha2, sizeof sc->sha2); if (error) break; kp = &sc->key; /* Initialize helper-fields */ kp->keys_per_sector = kp->sectorsize / G_BDE_SKEYLEN; kp->zone_cont = kp->keys_per_sector * kp->sectorsize; kp->zone_width = kp->zone_cont + kp->sectorsize; kp->media_width = kp->sectorN - kp->sector0 - G_BDE_MAXKEYS * kp->sectorsize; /* Our external parameters */ sc->zone_cont = kp->zone_cont; sc->mediasize = g_bde_max_sector(kp); sc->sectorsize = kp->sectorsize; TAILQ_INIT(&sc->freelist); TAILQ_INIT(&sc->worklist); mtx_init(&sc->worklist_mutex, "g_bde_worklist", NULL, MTX_DEF); /* XXX: error check */ kproc_create(g_bde_worker, gp, &sc->thread, 0, 0, "g_bde %s", gp->name); pp = g_new_providerf(gp, "%s", gp->name); pp->stripesize = kp->zone_cont; pp->stripeoffset = 0; pp->mediasize = sc->mediasize; pp->sectorsize = sc->sectorsize; g_error_provider(pp, 0); break; } while (0); if (pass != NULL) bzero(pass, SHA512_DIGEST_LENGTH); if (key != NULL) bzero(key, 16); if (error == 0) return; g_access(cp, -1, -1, -1); g_detach(cp); g_destroy_consumer(cp); if (gp->softc != NULL) g_free(gp->softc); g_destroy_geom(gp); switch (error) { case ENOENT: gctl_error(req, "Lock was destroyed"); break; case ESRCH: gctl_error(req, "Lock was nuked"); break; case EINVAL: gctl_error(req, "Could not open lock"); break; case ENOTDIR: gctl_error(req, "Lock not found"); break; default: gctl_error(req, "Could not open lock (%d)", error); break; } return; } static int g_bde_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct g_consumer *cp; struct g_provider *pp; struct g_bde_softc *sc; g_trace(G_T_TOPOLOGY, "g_bde_destroy_geom(%s, %s)", mp->name, gp->name); g_topology_assert(); /* * Orderly detachment. 
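 *
 * For reference, this path is reached via the "destroy geom" verb (see
 * g_bde_ctlreq() below); a userland caller such as gbde(8) would issue
 * it roughly as follows -- a hedged libgeom(3) sketch, where
 * "ad0s1f.bde" is a hypothetical geom name:
 *
 *	struct gctl_req *r;
 *	const char *errstr;
 *
 *	r = gctl_get_handle();
 *	gctl_ro_param(r, "verb", -1, "destroy geom");
 *	gctl_ro_param(r, "class", -1, "BDE");
 *	gctl_ro_param(r, "geom", -1, "ad0s1f.bde");
 *	errstr = gctl_issue(r);		/* NULL on success */
 *	gctl_free(r);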
*/ KASSERT(gp != NULL, ("NULL geom")); pp = LIST_FIRST(&gp->provider); KASSERT(pp != NULL, ("NULL provider")); if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) return (EBUSY); sc = gp->softc; cp = LIST_FIRST(&gp->consumer); KASSERT(cp != NULL, ("NULL consumer")); sc->dead = 1; wakeup(sc); g_access(cp, -1, -1, -1); g_detach(cp); g_destroy_consumer(cp); while (sc->dead != 2 && !LIST_EMPTY(&pp->consumers)) tsleep(sc, PRIBIO, "g_bdedie", hz); mtx_destroy(&sc->worklist_mutex); bzero(&sc->key, sizeof sc->key); g_free(sc); g_wither_geom(gp, ENXIO); return (0); } static void g_bde_ctlreq(struct gctl_req *req, struct g_class *mp, char const *verb) { struct g_geom *gp; struct g_provider *pp; if (!strcmp(verb, "create geom")) { pp = gctl_get_provider(req, "provider"); if (pp != NULL) g_bde_create_geom(req, mp, pp); } else if (!strcmp(verb, "destroy geom")) { gp = gctl_get_geom(req, mp, "geom"); if (gp != NULL) g_bde_destroy_geom(req, mp, gp); } else { gctl_error(req, "unknown verb"); } } static struct g_class g_bde_class = { .name = BDE_CLASS_NAME, .version = G_VERSION, .destroy_geom = g_bde_destroy_geom, .ctlreq = g_bde_ctlreq, .start = g_bde_start, .orphan = g_bde_orphan, .access = g_bde_access, .spoiled = g_std_spoiled, }; DECLARE_GEOM_CLASS(g_bde_class, g_bde); Index: head/sys/geom/bde/g_bde.h =================================================================== --- head/sys/geom/bde/g_bde.h (revision 326269) +++ head/sys/geom/bde/g_bde.h (revision 326270) @@ -1,211 +1,213 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_GEOM_BDE_G_BDE_H_ #define _SYS_GEOM_BDE_G_BDE_H_ 1 /* * These are quite, but not entirely unlike constants. * * They are not commented in detail here, to prevent unadvisable * experimentation. Please consult the code where they are used before you * even think about modifying these.
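 *
 * A worked example may still help (hedged; it assumes a 512-byte sector,
 * and the helper-field formulas are those in g_bde_create_geom()):
 *
 *	keys_per_sector = 512 / G_BDE_SKEYLEN = 512 / 16 = 32
 *	zone_cont  = 32 * 512    = 16384	(payload bytes per zone)
 *	zone_width = 16384 + 512 = 16896	(on-disk bytes per zone)
 *
 * i.e. one sector of skeys serves 32 payload sectors, roughly 3% overhead.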
*/ #define G_BDE_MKEYLEN (2048/8) #define G_BDE_SKEYBITS 128 #define G_BDE_SKEYLEN (G_BDE_SKEYBITS/8) #define G_BDE_KKEYBITS 128 #define G_BDE_KKEYLEN (G_BDE_KKEYBITS/8) #define G_BDE_MAXKEYS 4 #define G_BDE_LOCKSIZE 384 #define NLOCK_FIELDS 13 /* This just needs to be "large enough" */ #define G_BDE_KEYBYTES 304 struct g_bde_work; struct g_bde_softc; struct g_bde_sector { struct g_bde_work *owner; struct g_bde_softc *softc; off_t offset; u_int size; u_int ref; void *data; TAILQ_ENTRY(g_bde_sector) list; u_char valid; u_char malloc; enum {JUNK, IO, VALID} state; int error; time_t used; }; struct g_bde_work { struct mtx mutex; off_t offset; off_t length; void *data; struct bio *bp; struct g_bde_softc *softc; off_t so; off_t kso; u_int ko; struct g_bde_sector *sp; struct g_bde_sector *ksp; TAILQ_ENTRY(g_bde_work) list; enum {SETUP, WAIT, FINISH} state; int error; }; /* * The decrypted contents of the lock sectors. Notice that this is not * the same as the on-disk layout. The on-disk layout is dynamic and * dependent on the pass-phrase. */ struct g_bde_key { uint64_t sector0; /* Physical byte offset of 1st byte used */ uint64_t sectorN; /* Physical byte offset of 1st byte not used */ uint64_t keyoffset; /* Number of bytes the disk image is skewed. */ uint64_t lsector[G_BDE_MAXKEYS]; /* Physical byte offsets of lock sectors */ uint32_t sectorsize; /* Our "logical" sector size */ uint32_t flags; #define GBDE_F_SECT0 1 uint8_t salt[16]; /* Used to frustrate the kkey generation */ uint8_t spare[32]; /* For future use, random contents */ uint8_t mkey[G_BDE_MKEYLEN]; /* Our masterkey. */ /* Non-stored help-fields */ uint64_t zone_width; /* On-disk width of zone */ uint64_t zone_cont; /* Payload width of zone */ uint64_t media_width; /* Non-magic width of zone */ u_int keys_per_sector; }; struct g_bde_softc { off_t mediasize; u_int sectorsize; uint64_t zone_cont; struct g_geom *geom; struct g_consumer *consumer; TAILQ_HEAD(, g_bde_sector) freelist; TAILQ_HEAD(, g_bde_work) worklist; struct mtx worklist_mutex; struct proc *thread; struct g_bde_key key; int dead; u_int nwork; u_int nsect; u_int ncache; u_char sha2[SHA512_DIGEST_LENGTH]; }; /* g_bde_crypt.c */ void g_bde_crypt_delete(struct g_bde_work *wp); void g_bde_crypt_read(struct g_bde_work *wp); void g_bde_crypt_write(struct g_bde_work *wp); /* g_bde_key.c */ void g_bde_zap_key(struct g_bde_softc *sc); int g_bde_get_key(struct g_bde_softc *sc, void *ptr, int len); int g_bde_init_keybytes(struct g_bde_softc *sc, char *passp, int len); /* g_bde_lock.c */ int g_bde_encode_lock(u_char *sha2, struct g_bde_key *gl, u_char *ptr); int g_bde_decode_lock(struct g_bde_softc *sc, struct g_bde_key *gl, u_char *ptr); int g_bde_keyloc_encrypt(u_char *sha2, uint64_t v0, uint64_t v1, void *output); int g_bde_keyloc_decrypt(u_char *sha2, void *input, uint64_t *output); int g_bde_decrypt_lock(struct g_bde_softc *sc, u_char *keymat, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey); void g_bde_hash_pass(struct g_bde_softc *sc, const void *input, u_int len); /* g_bde_math.c */ uint64_t g_bde_max_sector(struct g_bde_key *lp); void g_bde_map_sector(struct g_bde_work *wp); /* g_bde_work.c */ void g_bde_start1(struct bio *bp); void g_bde_worker(void *arg); /* * These four functions wrap the raw Rijndael functions and make sure we * explode if something fails which shouldn't.
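 *
 * Typical usage, mirroring the call sites in g_bde_crypt.c (a sketch
 * only; "skey" and "buf" are illustrative local buffers):
 *
 *	cipherInstance ci;
 *	keyInstance ki;
 *
 *	AES_init(&ci);
 *	AES_makekey(&ki, DIR_ENCRYPT, G_BDE_SKEYBITS, skey);
 *	AES_encrypt(&ci, &ki, buf, buf, sectorsize);
 *	bzero(&ki, sizeof ki);		/* never leave key schedules around */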
*/ static __inline void AES_init(cipherInstance *ci) { int error; error = rijndael_cipherInit(ci, MODE_CBC, NULL); KASSERT(error > 0, ("rijndael_cipherInit %d", error)); } static __inline void AES_makekey(keyInstance *ki, int dir, u_int len, const void *key) { int error; error = rijndael_makeKey(ki, dir, len, key); KASSERT(error > 0, ("rijndael_makeKey %d", error)); } static __inline void AES_encrypt(cipherInstance *ci, keyInstance *ki, const void *in, void *out, u_int len) { int error; error = rijndael_blockEncrypt(ci, ki, in, len * 8, out); KASSERT(error > 0, ("rijndael_blockEncrypt %d", error)); } static __inline void AES_decrypt(cipherInstance *ci, keyInstance *ki, const void *in, void *out, u_int len) { int error; error = rijndael_blockDecrypt(ci, ki, in, len * 8, out); KASSERT(error > 0, ("rijndael_blockDecrypt %d", error)); } #endif /* _SYS_GEOM_BDE_G_BDE_H_ */ Index: head/sys/geom/bde/g_bde_crypt.c =================================================================== --- head/sys/geom/bde/g_bde_crypt.c (revision 326269) +++ head/sys/geom/bde/g_bde_crypt.c (revision 326270) @@ -1,360 +1,362 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* This source file contains the functions responsible for the crypto, keying * and mapping operations on the I/O requests. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * XXX: Debugging DO NOT ENABLE */ #undef MD5_KEY /* * Derive kkey from mkey + sector offset. * * Security objective: Derive a potentially very large number of distinct skeys * from the comparatively small key material in our mkey, in such a way that * if one, more or even many of the kkeys are compromised, this does not * significantly help an attack on other kkeys and in particular does not * weaken or compromise the mkey. 
* * First we MD5 hash the sector number with the salt from the lock sector. * The salt prevents the precalculation and statistical analysis of the MD5 * output which would be possible if we only gave it the sector number. * * The MD5 hash is used to pick out 16 bytes from the masterkey, which * are then hashed with MD5 together with the sector number. * * The resulting MD5 hash is the kkey. */ static void g_bde_kkey(struct g_bde_softc *sc, keyInstance *ki, int dir, off_t sector) { u_int t; MD5_CTX ct; u_char buf[16]; u_char buf2[8]; /* We have to be architecture neutral */ le64enc(buf2, sector); MD5Init(&ct); MD5Update(&ct, sc->key.salt, 8); MD5Update(&ct, buf2, sizeof buf2); MD5Update(&ct, sc->key.salt + 8, 8); MD5Final(buf, &ct); MD5Init(&ct); for (t = 0; t < 16; t++) { MD5Update(&ct, &sc->key.mkey[buf[t]], 1); if (t == 8) MD5Update(&ct, buf2, sizeof buf2); } bzero(buf2, sizeof buf2); MD5Final(buf, &ct); bzero(&ct, sizeof ct); AES_makekey(ki, dir, G_BDE_KKEYBITS, buf); bzero(buf, sizeof buf); } /* * Encryption work for read operation. * * Security objective: Find the kkey, find the skey, decrypt the sector data. */ void g_bde_crypt_read(struct g_bde_work *wp) { struct g_bde_softc *sc; u_char *d; u_int n; off_t o; u_char skey[G_BDE_SKEYLEN]; keyInstance ki; cipherInstance ci; AES_init(&ci); sc = wp->softc; o = 0; for (n = 0; o < wp->length; n++, o += sc->sectorsize) { d = (u_char *)wp->ksp->data + wp->ko + n * G_BDE_SKEYLEN; g_bde_kkey(sc, &ki, DIR_DECRYPT, wp->offset + o); AES_decrypt(&ci, &ki, d, skey, sizeof skey); d = (u_char *)wp->data + o; AES_makekey(&ki, DIR_DECRYPT, G_BDE_SKEYBITS, skey); AES_decrypt(&ci, &ki, d, d, sc->sectorsize); } bzero(skey, sizeof skey); bzero(&ci, sizeof ci); bzero(&ki, sizeof ki); } /* * Encryption work for write operation. * * Security objective: Create random skey, encrypt sector data, * encrypt skey with the kkey. */ void g_bde_crypt_write(struct g_bde_work *wp) { u_char *s, *d; struct g_bde_softc *sc; u_int n; off_t o; u_char skey[G_BDE_SKEYLEN]; keyInstance ki; cipherInstance ci; sc = wp->softc; AES_init(&ci); o = 0; for (n = 0; o < wp->length; n++, o += sc->sectorsize) { s = (u_char *)wp->data + o; d = (u_char *)wp->sp->data + o; arc4rand(skey, sizeof skey, 0); AES_makekey(&ki, DIR_ENCRYPT, G_BDE_SKEYBITS, skey); AES_encrypt(&ci, &ki, s, d, sc->sectorsize); d = (u_char *)wp->ksp->data + wp->ko + n * G_BDE_SKEYLEN; g_bde_kkey(sc, &ki, DIR_ENCRYPT, wp->offset + o); AES_encrypt(&ci, &ki, skey, d, sizeof skey); bzero(skey, sizeof skey); } bzero(skey, sizeof skey); bzero(&ci, sizeof ci); bzero(&ki, sizeof ki); } /* * Encryption work for delete operation. * * Security objective: Write random data to the sectors. * * XXX: At a cost in performance we would trash the encrypted skey as well. * XXX: This would add frustration to the cleaning lady attack by making * XXX: deletes look like writes. */ void g_bde_crypt_delete(struct g_bde_work *wp) { struct g_bde_softc *sc; u_char *d; off_t o; u_char skey[G_BDE_SKEYLEN]; keyInstance ki; cipherInstance ci; sc = wp->softc; d = wp->sp->data; AES_init(&ci); /* * Do not unroll this loop! * Our zone may be significantly wider than the amount of random * bytes arc4rand likes to give in one reseeding, whereas our * sectorsize is far more likely to be in the same range.
*/ for (o = 0; o < wp->length; o += sc->sectorsize) { arc4rand(d, sc->sectorsize, 0); arc4rand(skey, sizeof skey, 0); AES_makekey(&ki, DIR_ENCRYPT, G_BDE_SKEYBITS, skey); AES_encrypt(&ci, &ki, d, d, sc->sectorsize); d += sc->sectorsize; } /* * Having written a long random sequence to disk here, we want to * force a reseed, to avoid weakening the next time we use random * data for something important. */ arc4rand(&o, sizeof o, 1); } /* * Calculate the total payload size of the encrypted device. * * Security objectives: none. * * This function needs to agree with g_bde_map_sector() about things. */ uint64_t g_bde_max_sector(struct g_bde_key *kp) { uint64_t maxsect; maxsect = kp->media_width; maxsect /= kp->zone_width; maxsect *= kp->zone_cont; return (maxsect); } /* * Convert an unencrypted side offset to offsets on the encrypted side. * * Security objective: Make it harder to identify what sectors contain what * on a "cold" disk image. * * We do this by adding the "keyoffset" from the lock to the physical sector * number modulo the available number of sectors. Since all physical sectors * presumably look the same cold, this will do. * * As part of the mapping we have to skip the lock sectors which we know * the physical address of. We also truncate the work packet, respecting * zone boundaries and lock sectors, so that we end up with a sequence of * sectors which are physically contiguous. * * Shuffling things further is an option, but the incremental frustration is * not currently deemed worth the run-time performance hit resulting from the * increased number of disk arm movements it would incur. * * This function offers nothing but a trivial diversion for an attacker able * to do "the cleaning lady attack" in its current static mapping form. */ void g_bde_map_sector(struct g_bde_work *wp) { u_int zone, zoff, u, len; uint64_t ko; struct g_bde_softc *sc; struct g_bde_key *kp; sc = wp->softc; kp = &sc->key; /* find which zone and the offset in it */ zone = wp->offset / kp->zone_cont; zoff = wp->offset % kp->zone_cont; /* Calculate the offset of the key in the key sector */ wp->ko = (zoff / kp->sectorsize) * G_BDE_SKEYLEN; /* restrict length to that zone */ len = kp->zone_cont - zoff; /* ... and in general */ if (len > DFLTPHYS) len = DFLTPHYS; if (len < wp->length) wp->length = len; /* Find physical sector address */ wp->so = zone * kp->zone_width + zoff; wp->so += kp->keyoffset; wp->so %= kp->media_width; if (wp->so + wp->length > kp->media_width) wp->length = kp->media_width - wp->so; wp->so += kp->sector0; /* The key sector is the last in this zone.
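 *
 * Worked example (hedged: it assumes the 512-byte-sector layout sketched
 * in g_bde.h, i.e. zone_cont = 16384 and zone_width = 16896, and ignores
 * keyoffset and sector0 for clarity). For wp->offset = 40448:
 *
 *	zone = 40448 / 16384 = 2	zoff = 40448 % 16384 = 7680
 *	wp->ko  = (7680 / 512) * G_BDE_SKEYLEN = 15 * 16 = 240
 *	wp->so  = 2 * 16896 + 7680  = 41472
 *	wp->kso = 2 * 16896 + 16384 = 50176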
*/ wp->kso = zone * kp->zone_width + kp->zone_cont; wp->kso += kp->keyoffset; wp->kso %= kp->media_width; wp->kso += kp->sector0; /* Compensate for lock sectors */ for (u = 0; u < G_BDE_MAXKEYS; u++) { /* Find the start of this lock sector */ ko = rounddown2(kp->lsector[u], (uint64_t)kp->sectorsize); if (wp->kso >= ko) wp->kso += kp->sectorsize; if (wp->so >= ko) { /* lock sector before work packet */ wp->so += kp->sectorsize; } else if ((wp->so + wp->length) > ko) { /* lock sector in work packet, truncate */ wp->length = ko - wp->so; } } #if 0 printf("off %jd len %jd so %jd ko %jd kso %u\n", (intmax_t)wp->offset, (intmax_t)wp->length, (intmax_t)wp->so, (intmax_t)wp->kso, wp->ko); #endif KASSERT(wp->so + wp->length <= kp->sectorN, ("wp->so (%jd) + wp->length (%jd) > EOM (%jd), offset = %jd", (intmax_t)wp->so, (intmax_t)wp->length, (intmax_t)kp->sectorN, (intmax_t)wp->offset)); KASSERT(wp->kso + kp->sectorsize <= kp->sectorN, ("wp->kso (%jd) + kp->sectorsize > EOM (%jd), offset = %jd", (intmax_t)wp->kso, (intmax_t)kp->sectorN, (intmax_t)wp->offset)); KASSERT(wp->so >= kp->sector0, ("wp->so (%jd) < BOM (%jd), offset = %jd", (intmax_t)wp->so, (intmax_t)kp->sector0, (intmax_t)wp->offset)); KASSERT(wp->kso >= kp->sector0, ("wp->kso (%jd) < BOM (%jd), offset = %jd", (intmax_t)wp->kso, (intmax_t)kp->sector0, (intmax_t)wp->offset)); } Index: head/sys/geom/bde/g_bde_lock.c =================================================================== --- head/sys/geom/bde/g_bde_lock.c (revision 326269) +++ head/sys/geom/bde/g_bde_lock.c (revision 326270) @@ -1,478 +1,480 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* This source file contains routines which operate on the lock sectors, both * for the kernel and the userland program gbde(1).
* */ #include #include #include #include #include #include #ifdef _KERNEL #include #include #else #include #define CTASSERT(foo) #define KASSERT(foo, bar) do { if(!(foo)) { warn bar ; exit (1); } } while (0) #include #include #include #include #define g_free(foo) free(foo) #endif #include #include #include #include /* * Hash the raw pass-phrase. * * Security objectives: produce from the pass-phrase a fixed-length * byte sequence with PRN-like properties in a reproducible way, retaining * as much entropy from the pass-phrase as possible. * * SHA2-512 makes this easy. */ void g_bde_hash_pass(struct g_bde_softc *sc, const void *input, u_int len) { SHA512_CTX cx; SHA512_Init(&cx); SHA512_Update(&cx, input, len); SHA512_Final(sc->sha2, &cx); } /* * Encode/Decode the lock structure in byte-sequence format. * * Security objectives: Store in pass-phrase dependent variant format. * * C-structure packing and byte-endianness depend on architecture, compiler * and compiler options. Writing raw structures to disk is therefore a bad * idea in these enlightened days. * * We spend a fraction of the key-material on shuffling the fields around * so they will be stored in an unpredictable sequence. * * For each byte of the key-material we derive two field indexes, and swap * the position of those two fields. * * I have not worked out the statistical properties of this shuffle, but * given that the key-material has PRN properties, the primary objective * of making it hard to figure out which bits are where in the lock sector * is sufficiently fulfilled. * * We include (and shuffle) an extra hash field in the stored version for * identification and versioning purposes. This field contains the MD5 hash * of a version identifier (currently "0000") followed by the stored lock * sector byte-sequence substituting zero bytes for the hash field. * * The stored key sequence is protected by AES/256/CBC elsewhere in the code * so the fact that the generated byte sequence has a much higher than * average density of zero bits (from the numeric fields) is not currently * a concern. * * Should this later become a concern, a simple software update and * pass-phrase change can remedy the situation. One possible solution * could be to XOR the numeric fields with a key-material derived PRN. * * The chosen shuffle algorithm only works as long as we have no more than 16 * fields in the stored part of the lock structure (hence the CTASSERT below).
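 *
 * For instance (illustrative values): if sha2[u] were 200, then
 * j = 200 % NLOCK_FIELDS = 5 and k = (200 / NLOCK_FIELDS) % NLOCK_FIELDS
 * = 15 % 13 = 2, so the fields currently in positions 5 and 2 trade
 * places; one such swap is done per remaining byte of key-material.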
*/ CTASSERT(NLOCK_FIELDS <= 16); static void g_bde_shuffle_lock(u_char *sha2, int *buf) { int j, k, l; u_int u; /* Assign the fields sequential positions */ for(u = 0; u < NLOCK_FIELDS; u++) buf[u] = u; /* Then mix it all up */ for(u = 48; u < SHA512_DIGEST_LENGTH; u++) { j = sha2[u] % NLOCK_FIELDS; k = (sha2[u] / NLOCK_FIELDS) % NLOCK_FIELDS; l = buf[j]; buf[j] = buf[k]; buf[k] = l; } } int g_bde_encode_lock(u_char *sha2, struct g_bde_key *gl, u_char *ptr) { int shuffle[NLOCK_FIELDS]; u_char *hash, *p; int i; MD5_CTX c; p = ptr; hash = NULL; g_bde_shuffle_lock(sha2, shuffle); for (i = 0; i < NLOCK_FIELDS; i++) { switch(shuffle[i]) { case 0: le64enc(p, gl->sector0); p += 8; break; case 1: le64enc(p, gl->sectorN); p += 8; break; case 2: le64enc(p, gl->keyoffset); p += 8; break; case 3: le32enc(p, gl->sectorsize); p += 4; break; case 4: le32enc(p, gl->flags); p += 4; break; case 5: case 6: case 7: case 8: le64enc(p, gl->lsector[shuffle[i] - 5]); p += 8; break; case 9: bcopy(gl->spare, p, sizeof gl->spare); p += sizeof gl->spare; break; case 10: bcopy(gl->salt, p, sizeof gl->salt); p += sizeof gl->salt; break; case 11: bcopy(gl->mkey, p, sizeof gl->mkey); p += sizeof gl->mkey; break; case 12: bzero(p, 16); hash = p; p += 16; break; } } if(ptr + G_BDE_LOCKSIZE != p) return(-1); if (hash == NULL) return(-1); MD5Init(&c); MD5Update(&c, "0000", 4); /* Versioning */ MD5Update(&c, ptr, G_BDE_LOCKSIZE); MD5Final(hash, &c); return(0); } int g_bde_decode_lock(struct g_bde_softc *sc, struct g_bde_key *gl, u_char *ptr) { int shuffle[NLOCK_FIELDS]; u_char *p; u_char hash[16], hash2[16]; MD5_CTX c; int i; p = ptr; g_bde_shuffle_lock(sc->sha2, shuffle); for (i = 0; i < NLOCK_FIELDS; i++) { switch(shuffle[i]) { case 0: gl->sector0 = le64dec(p); p += 8; break; case 1: gl->sectorN = le64dec(p); p += 8; break; case 2: gl->keyoffset = le64dec(p); p += 8; break; case 3: gl->sectorsize = le32dec(p); p += 4; break; case 4: gl->flags = le32dec(p); p += 4; break; case 5: case 6: case 7: case 8: gl->lsector[shuffle[i] - 5] = le64dec(p); p += 8; break; case 9: bcopy(p, gl->spare, sizeof gl->spare); p += sizeof gl->spare; break; case 10: bcopy(p, gl->salt, sizeof gl->salt); p += sizeof gl->salt; break; case 11: bcopy(p, gl->mkey, sizeof gl->mkey); p += sizeof gl->mkey; break; case 12: bcopy(p, hash2, sizeof hash2); bzero(p, sizeof hash2); p += sizeof hash2; break; } } if(ptr + G_BDE_LOCKSIZE != p) return(-1); MD5Init(&c); MD5Update(&c, "0000", 4); /* Versioning */ MD5Update(&c, ptr, G_BDE_LOCKSIZE); MD5Final(hash, &c); if (bcmp(hash, hash2, sizeof hash2)) return (1); return (0); } /* * Encode/Decode the locksector address ("metadata") with key-material. * * Security objectives: Encode/Decode the metadata encrypted by key-material. * * A simple AES/128/CBC will do. We take care to always store the metadata * in the same endianness to make it MI. * * In the typical case the metadata is stored in encrypted format in sector * zero on the media, but at the users discretion or if the piece of the * device used (sector0...sectorN) does not contain sector zero, it can * be stored in a filesystem or on a PostIt. * * The inability to easily locate the lock sectors makes an attack on a * cold disk much less attractive, without unduly inconveniencing the * legitimate user who can feasibly do a brute-force scan if the metadata * was lost. 
*/ int g_bde_keyloc_encrypt(u_char *sha2, uint64_t v0, uint64_t v1, void *output) { u_char buf[16]; keyInstance ki; cipherInstance ci; le64enc(buf, v0); le64enc(buf + 8, v1); AES_init(&ci); AES_makekey(&ki, DIR_ENCRYPT, G_BDE_KKEYBITS, sha2 + 0); AES_encrypt(&ci, &ki, buf, output, sizeof buf); bzero(buf, sizeof buf); bzero(&ci, sizeof ci); bzero(&ki, sizeof ki); return (0); } int g_bde_keyloc_decrypt(u_char *sha2, void *input, uint64_t *output) { keyInstance ki; cipherInstance ci; u_char buf[16]; AES_init(&ci); AES_makekey(&ki, DIR_DECRYPT, G_BDE_KKEYBITS, sha2 + 0); AES_decrypt(&ci, &ki, input, buf, sizeof buf); *output = le64dec(buf); bzero(buf, sizeof buf); bzero(&ci, sizeof ci); bzero(&ki, sizeof ki); return(0); } /* * Find and Encode/Decode lock sectors. * * Security objective: given the pass-phrase, find, decrypt, decode and * validate the lock sector contents. * * For ondisk metadata we cannot know beforehand which of the lock sectors * a given pass-phrase opens so we must try each of the metadata copies in * sector zero in turn. If metadata was passed as an argument, we don't * have this problem. * */ static int g_bde_decrypt_lockx(struct g_bde_softc *sc, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey) { u_char *buf, *q; struct g_bde_key *gl; uint64_t off, q1; int error, m, i; keyInstance ki; cipherInstance ci; gl = &sc->key; /* Try to decrypt the metadata */ error = g_bde_keyloc_decrypt(sc->sha2, meta, &off); if (error) return (error); /* If it points into thin blue air, forget it */ if (off + G_BDE_LOCKSIZE > (uint64_t)mediasize) { off = 0; return (EINVAL); } /* The lock data may span two physical sectors. */ m = 1; if (off % sectorsize > sectorsize - G_BDE_LOCKSIZE) m++; /* Read the suspected sector(s) */ buf = g_read_data(sc->consumer, off - (off % sectorsize), m * sectorsize, &error); if (buf == NULL) { off = 0; return(error); } /* Find the byte-offset of the stored byte sequence */ q = buf + off % sectorsize; /* If it is all zero, somebody nuked our lock sector */ q1 = 0; for (i = 0; i < G_BDE_LOCKSIZE; i++) q1 += q[i]; if (q1 == 0) { off = 0; g_free(buf); return (ESRCH); } /* Decrypt the byte-sequence in place */ AES_init(&ci); AES_makekey(&ki, DIR_DECRYPT, 256, sc->sha2 + 16); AES_decrypt(&ci, &ki, q, q, G_BDE_LOCKSIZE); /* Decode the byte-sequence */ i = g_bde_decode_lock(sc, gl, q); q = NULL; if (i < 0) { off = 0; return (EDOOFUS); /* Programming error */ } else if (i > 0) { off = 0; return (ENOTDIR); /* Hash didn't match */ } bzero(buf, sectorsize * m); g_free(buf); /* If the masterkey is all zeros, user destroyed it */ q1 = 0; for (i = 0; i < (int)sizeof(gl->mkey); i++) q1 += gl->mkey[i]; if (q1 == 0) return (ENOENT); /* If we have an unsorted lock-sequence, refuse */ for (i = 0; i < G_BDE_MAXKEYS - 1; i++) if (gl->lsector[i] >= gl->lsector[i + 1]) return (EINVAL); /* Finally, find out which key was used by matching the byte offset */ for (i = 0; i < G_BDE_MAXKEYS; i++) if (nkey != NULL && off == gl->lsector[i]) *nkey = i; off = 0; return (0); } int g_bde_decrypt_lock(struct g_bde_softc *sc, u_char *keymat, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey) { u_char *buf, buf1[16]; int error, e, i; /* set up the key-material */ bcopy(keymat, sc->sha2, SHA512_DIGEST_LENGTH); /* If passed-in metadata is non-zero, use it */ bzero(buf1, sizeof buf1); if (meta != NULL && bcmp(buf1, meta, sizeof buf1)) return (g_bde_decrypt_lockx(sc, meta, mediasize, sectorsize, nkey)); /* Read sector zero */ buf = g_read_data(sc->consumer, 0, sectorsize, &error); if 
(buf == NULL) return(error); /* Try each index in turn, save indicative errors for final result */ error = EINVAL; for (i = 0; i < G_BDE_MAXKEYS; i++) { e = g_bde_decrypt_lockx(sc, buf + i * 16, mediasize, sectorsize, nkey); /* Success or destroyed master key terminates */ if (e == 0 || e == ENOENT) { error = e; break; } if (e != 0 && error == EINVAL) error = e; } g_free(buf); return (error); } Index: head/sys/geom/bde/g_bde_work.c =================================================================== --- head/sys/geom/bde/g_bde_work.c (revision 326269) +++ head/sys/geom/bde/g_bde_work.c (revision 326270) @@ -1,764 +1,766 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * This source file contains the state-engine which makes things happen in the * right order. * * Outline: * 1) g_bde_start1() * Break the struct bio into multiple work packets, one per zone. * 2) g_bde_start2() * Set up the necessary sector buffers and start those read operations * which we can start at this time and put the item on the work-list. * 3) g_bde_worker() * Scan the work-list for items which are ready for crypto processing * and call the matching crypto function in g_bde_crypt.c and schedule * any writes needed. Read operations finish here by releasing the * sector buffers and delivering the original bio request. * 4) g_bde_write_done() * Release sector buffers and deliver the original bio request. * * Because of the C-scope rules, the functions are almost perfectly in the * opposite order in this source file. * * XXX: A switch to the hardware assisted crypto in src/sys/opencrypto will add * XXX: additional states to this state-engine. Since no hardware available * XXX: at this time has AES support, implementing this has been postponed * XXX: until such time as it would result in a benefit.
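 *
 * Worked example of step 1 (hedged; 512-byte sectors and zone_cont =
 * 16384 as in the g_bde.h example, DFLTPHYS = 64k): a 65536-byte bio at
 * offset 40448 (zone offset 7680) is chopped by g_bde_map_sector() into
 * work packets of 16384 - 7680 = 8704, then 16384, 16384, 16384 and
 * finally 7680 bytes, each confined to a single zone.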
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void g_bde_delete_sector(struct g_bde_softc *wp, struct g_bde_sector *sp); static struct g_bde_sector * g_bde_new_sector(struct g_bde_work *wp, u_int len); static void g_bde_release_keysector(struct g_bde_work *wp); static struct g_bde_sector *g_bde_get_keysector(struct g_bde_work *wp); static int g_bde_start_read(struct g_bde_sector *sp); static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction); /* * Work item allocation. * * C++ would call these constructors and destructors. */ static u_int g_bde_nwork; SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, ""); static MALLOC_DEFINE(M_GBDE, "gbde", "GBDE data structures"); static struct g_bde_work * g_bde_new_work(struct g_bde_softc *sc) { struct g_bde_work *wp; wp = malloc(sizeof *wp, M_GBDE, M_NOWAIT | M_ZERO); if (wp == NULL) return (wp); wp->state = SETUP; wp->softc = sc; g_bde_nwork++; sc->nwork++; TAILQ_INSERT_TAIL(&sc->worklist, wp, list); return (wp); } static void g_bde_delete_work(struct g_bde_work *wp) { struct g_bde_softc *sc; sc = wp->softc; g_bde_nwork--; sc->nwork--; TAILQ_REMOVE(&sc->worklist, wp, list); free(wp, M_GBDE); } /* * Sector buffer allocation * * These two functions allocate and free back variable sized sector buffers */ static u_int g_bde_nsect; SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, ""); static void g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp) { g_bde_nsect--; sc->nsect--; if (sp->malloc) free(sp->data, M_GBDE); free(sp, M_GBDE); } static struct g_bde_sector * g_bde_new_sector(struct g_bde_work *wp, u_int len) { struct g_bde_sector *sp; sp = malloc(sizeof *sp, M_GBDE, M_NOWAIT | M_ZERO); if (sp == NULL) return (sp); if (len > 0) { sp->data = malloc(len, M_GBDE, M_NOWAIT | M_ZERO); if (sp->data == NULL) { free(sp, M_GBDE); return (NULL); } sp->malloc = 1; } g_bde_nsect++; wp->softc->nsect++; sp->size = len; sp->softc = wp->softc; sp->ref = 1; sp->owner = wp; sp->offset = wp->so; sp->state = JUNK; return (sp); } /* * Skey sector cache. * * Nothing prevents two separate I/O requests from addressing the same zone * and thereby needing the same skey sector. We therefore need to sequence * I/O operations to the skey sectors. A certain amount of caching is also * desirable, although the extent of benefit from this is not at this point * determined. * * XXX: GEOM may be able to grow a generic caching facility at some point * XXX: to support such needs. 
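 *
 * The counters kept below are exported read-only, so the cache can be
 * observed at runtime with e.g. "sysctl debug.gbde_ncache" (likewise
 * gbde_nwork and gbde_nsect); an entry which has sat unreferenced for
 * more than 300 seconds is reclaimed opportunistically in
 * g_bde_get_keysector().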
*/ static u_int g_bde_ncache; SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, ""); static void g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp) { g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp); if (sp->ref != 0) return; TAILQ_REMOVE(&sc->freelist, sp, list); g_bde_ncache--; sc->ncache--; bzero(sp->data, sp->size); g_bde_delete_sector(sc, sp); } static struct g_bde_sector * g_bde_get_keysector(struct g_bde_work *wp) { struct g_bde_sector *sp; struct g_bde_softc *sc; off_t offset; offset = wp->kso; g_trace(G_T_TOPOLOGY, "g_bde_get_keysector(%p, %jd)", wp, (intmax_t)offset); sc = wp->softc; if (malloc_last_fail() < g_bde_ncache) g_bde_purge_sector(sc, -1); sp = TAILQ_FIRST(&sc->freelist); if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime) g_bde_purge_one_sector(sc, sp); TAILQ_FOREACH(sp, &sc->freelist, list) { if (sp->offset == offset) break; } if (sp != NULL) { sp->ref++; KASSERT(sp->offset == offset, ("wrong offset")); KASSERT(sp->softc == wp->softc, ("wrong softc")); if (sp->ref == 1) sp->owner = wp; } else { if (malloc_last_fail() < g_bde_ncache) { TAILQ_FOREACH(sp, &sc->freelist, list) if (sp->ref == 0) break; } if (sp == NULL && !TAILQ_EMPTY(&sc->freelist)) sp = TAILQ_FIRST(&sc->freelist); if (sp != NULL && sp->ref > 0) sp = NULL; if (sp == NULL) { sp = g_bde_new_sector(wp, sc->sectorsize); if (sp != NULL) { g_bde_ncache++; sc->ncache++; TAILQ_INSERT_TAIL(&sc->freelist, sp, list); sp->malloc = 2; } } if (sp != NULL) { sp->offset = offset; sp->softc = wp->softc; sp->ref = 1; sp->owner = wp; sp->state = JUNK; sp->error = 0; } } if (sp != NULL) { TAILQ_REMOVE(&sc->freelist, sp, list); TAILQ_INSERT_TAIL(&sc->freelist, sp, list); sp->used = time_uptime; } wp->ksp = sp; return(sp); } static void g_bde_release_keysector(struct g_bde_work *wp) { struct g_bde_softc *sc; struct g_bde_work *wp2; struct g_bde_sector *sp; sp = wp->ksp; g_trace(G_T_TOPOLOGY, "g_bde_release_keysector(%p)", sp); KASSERT(sp->malloc == 2, ("Wrong sector released")); sc = sp->softc; KASSERT(sc != NULL, ("NULL sp->softc")); KASSERT(wp == sp->owner, ("Releasing, not owner")); sp->owner = NULL; wp->ksp = NULL; sp->ref--; if (sp->ref > 0) { TAILQ_REMOVE(&sc->freelist, sp, list); TAILQ_INSERT_TAIL(&sc->freelist, sp, list); TAILQ_FOREACH(wp2, &sc->worklist, list) { if (wp2->ksp == sp) { KASSERT(wp2 != wp, ("Self-reowning")); sp->owner = wp2; wakeup(sp->softc); break; } } KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp)); } else if (sp->error != 0) { sp->offset = ~0; sp->error = 0; sp->state = JUNK; } TAILQ_REMOVE(&sc->freelist, sp, list); TAILQ_INSERT_HEAD(&sc->freelist, sp, list); } static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction) { struct g_bde_sector *sp; int n; g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc); if (fraction > 0) n = sc->ncache / fraction + 1; else n = g_bde_ncache - malloc_last_fail(); if (n < 0) return; if (n > sc->ncache) n = sc->ncache; while(n--) { TAILQ_FOREACH(sp, &sc->freelist, list) { if (sp->ref != 0) continue; TAILQ_REMOVE(&sc->freelist, sp, list); g_bde_ncache--; sc->ncache--; bzero(sp->data, sp->size); g_bde_delete_sector(sc, sp); break; } } } static struct g_bde_sector * g_bde_read_keysector(struct g_bde_softc *sc, struct g_bde_work *wp) { struct g_bde_sector *sp; g_trace(G_T_TOPOLOGY, "g_bde_read_keysector(%p)", wp); sp = g_bde_get_keysector(wp); if (sp == NULL) { g_bde_purge_sector(sc, -1); sp = g_bde_get_keysector(wp); } if (sp == NULL) return (sp); if (sp->owner != wp) return 
(sp); if (sp->state == VALID) return (sp); if (g_bde_start_read(sp) == 0) return (sp); g_bde_release_keysector(wp); return (NULL); } /* * Contribute to the completion of the original bio request. * * We have no simple way to tell how many bits the original bio request has * been segmented into, so the easiest way to determine when we can deliver * it is to keep track of the number of bytes we have completed. We keep * track of any errors underway and latch onto the first one. * * We always report "nothing done" in case of error, because random bits here * and there may be completed and returning a number of completed bytes does * not convey any useful information about which bytes they were. If some * piece of broken code somewhere interprets this to mean that nothing has * changed on the underlying media they deserve the lossage headed for them. * * A single mutex per g_bde instance is used to prevent contention. */ static void g_bde_contribute(struct bio *bp, off_t bytes, int error) { g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d", bp, (intmax_t)bytes, error); if (bp->bio_error == 0) bp->bio_error = error; bp->bio_completed += bytes; KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution")); if (bp->bio_completed == bp->bio_length) { if (bp->bio_error != 0) bp->bio_completed = 0; g_io_deliver(bp, bp->bio_error); } } /* * This is the common case "we're done with this work package" function */ static void g_bde_work_done(struct g_bde_work *wp, int error) { g_bde_contribute(wp->bp, wp->length, error); if (wp->sp != NULL) g_bde_delete_sector(wp->softc, wp->sp); if (wp->ksp != NULL) g_bde_release_keysector(wp); g_bde_delete_work(wp); } /* * A write operation has finished. When we have all expected cows in the * barn close the door and call it a day. */ static void g_bde_write_done(struct bio *bp) { struct g_bde_sector *sp; struct g_bde_work *wp; struct g_bde_softc *sc; sp = bp->bio_caller1; sc = bp->bio_caller2; mtx_lock(&sc->worklist_mutex); KASSERT(sp != NULL, ("NULL sp")); KASSERT(sc != NULL, ("NULL sc")); KASSERT(sp->owner != NULL, ("NULL sp->owner")); g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp); if (bp->bio_error == 0 && bp->bio_completed != sp->size) bp->bio_error = EIO; sp->error = bp->bio_error; g_destroy_bio(bp); wp = sp->owner; if (wp->error == 0) wp->error = sp->error; if (wp->bp->bio_cmd == BIO_DELETE) { KASSERT(sp == wp->sp, ("trashed delete op")); g_bde_work_done(wp, wp->error); mtx_unlock(&sc->worklist_mutex); return; } KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()")); KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op")); if (wp->sp == sp) { g_bde_delete_sector(sc, wp->sp); wp->sp = NULL; } else { sp->state = VALID; } if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID) g_bde_work_done(wp, wp->error); mtx_unlock(&sc->worklist_mutex); return; } /* * Send a write request for the given sector down the pipeline. 
*/ static int g_bde_start_write(struct g_bde_sector *sp) { struct bio *bp; struct g_bde_softc *sc; g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp); sc = sp->softc; KASSERT(sc != NULL, ("NULL sc in g_bde_start_write")); KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write")); bp = g_new_bio(); if (bp == NULL) return (ENOMEM); bp->bio_cmd = BIO_WRITE; bp->bio_offset = sp->offset; bp->bio_data = sp->data; bp->bio_length = sp->size; bp->bio_done = g_bde_write_done; bp->bio_caller1 = sp; bp->bio_caller2 = sc; sp->state = IO; g_io_request(bp, sc->consumer); return(0); } /* * A read operation has finished. Mark the sector no longer iobusy and * wake up the worker thread and let it do its thing. */ static void g_bde_read_done(struct bio *bp) { struct g_bde_sector *sp; struct g_bde_softc *sc; sp = bp->bio_caller1; g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp); sc = bp->bio_caller2; mtx_lock(&sc->worklist_mutex); if (bp->bio_error == 0 && bp->bio_completed != sp->size) bp->bio_error = EIO; sp->error = bp->bio_error; if (sp->error == 0) sp->state = VALID; else sp->state = JUNK; wakeup(sc); g_destroy_bio(bp); mtx_unlock(&sc->worklist_mutex); } /* * Send a read request for the given sector down the pipeline. */ static int g_bde_start_read(struct g_bde_sector *sp) { struct bio *bp; struct g_bde_softc *sc; g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp); sc = sp->softc; KASSERT(sc != NULL, ("Null softc in sp %p", sp)); bp = g_new_bio(); if (bp == NULL) return (ENOMEM); bp->bio_cmd = BIO_READ; bp->bio_offset = sp->offset; bp->bio_data = sp->data; bp->bio_length = sp->size; bp->bio_done = g_bde_read_done; bp->bio_caller1 = sp; bp->bio_caller2 = sc; sp->state = IO; g_io_request(bp, sc->consumer); return(0); } /* * The worker thread. * * The up/down path of GEOM is not allowed to sleep or do any major work * so we use this thread to do the actual crypto operations and to push * the state engine onwards. * * XXX: if we switch to the src/sys/opencrypt hardware assisted encryption * XXX: using a thread here is probably not needed. 
*/ void g_bde_worker(void *arg) { struct g_bde_softc *sc; struct g_bde_work *wp, *twp; struct g_geom *gp; int restart, error; gp = arg; sc = gp->softc; mtx_lock(&sc->worklist_mutex); for (;;) { restart = 0; g_trace(G_T_TOPOLOGY, "g_bde_worker scan"); TAILQ_FOREACH_SAFE(wp, &sc->worklist, list, twp) { KASSERT(wp != NULL, ("NULL wp")); KASSERT(wp->softc != NULL, ("NULL wp->softc")); if (wp->state != WAIT) continue; /* Not interesting here */ KASSERT(wp->bp != NULL, ("NULL wp->bp")); KASSERT(wp->sp != NULL, ("NULL wp->sp")); if (wp->ksp != NULL) { if (wp->ksp->owner != wp) continue; if (wp->ksp->state == IO) continue; KASSERT(wp->ksp->state == VALID, ("Illegal sector state (%d)", wp->ksp->state)); } if (wp->bp->bio_cmd == BIO_READ && wp->sp->state == IO) continue; if (wp->ksp != NULL && wp->ksp->error != 0) { g_bde_work_done(wp, wp->ksp->error); continue; } switch(wp->bp->bio_cmd) { case BIO_READ: if (wp->ksp == NULL) { KASSERT(wp->error != 0, ("BIO_READ, no ksp and no error")); g_bde_work_done(wp, wp->error); break; } if (wp->sp->error != 0) { g_bde_work_done(wp, wp->sp->error); break; } mtx_unlock(&sc->worklist_mutex); g_bde_crypt_read(wp); mtx_lock(&sc->worklist_mutex); restart++; g_bde_work_done(wp, wp->sp->error); break; case BIO_WRITE: wp->state = FINISH; KASSERT(wp->sp->owner == wp, ("Write not owner sp")); KASSERT(wp->ksp->owner == wp, ("Write not owner ksp")); mtx_unlock(&sc->worklist_mutex); g_bde_crypt_write(wp); mtx_lock(&sc->worklist_mutex); restart++; error = g_bde_start_write(wp->sp); if (error) { g_bde_work_done(wp, error); break; } error = g_bde_start_write(wp->ksp); if (wp->error != 0) wp->error = error; break; case BIO_DELETE: wp->state = FINISH; mtx_unlock(&sc->worklist_mutex); g_bde_crypt_delete(wp); mtx_lock(&sc->worklist_mutex); restart++; g_bde_start_write(wp->sp); break; } if (restart) break; } if (!restart) { /* * We don't look for our death-warrant until we are * idle. Shouldn't make a difference in practice. */ if (sc->dead) break; g_trace(G_T_TOPOLOGY, "g_bde_worker sleep"); error = msleep(sc, &sc->worklist_mutex, PRIBIO, "-", hz); if (error == EWOULDBLOCK) { /* * Lose our skey cache in an orderly fashion. * The exact rate can be tuned to be less * aggressive if this is desirable. 10% per * second means that the cache is gone in a * few minutes. */ g_bde_purge_sector(sc, 10); } } } g_trace(G_T_TOPOLOGY, "g_bde_worker die"); g_bde_purge_sector(sc, 1); KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork)); KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache)); KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect)); mtx_unlock(&sc->worklist_mutex); sc->dead = 2; wakeup(sc); kproc_exit(0); } /* * g_bde_start1 has chopped the incoming request up so all the requests * we see here are inside a single zone. Map the data and key locations * grab the buffers we need and fire off the first volley of read requests. 
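 *
 * In summary: BIO_READ allocates a zero-length data sector aliasing the
 * bio's own buffer plus a key sector, and starts both reads; BIO_WRITE
 * allocates a private data sector and a key sector (the payload is
 * encrypted into the private buffer later); BIO_DELETE needs only the
 * private data sector, which will be filled with random bytes.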
*/ static void g_bde_start2(struct g_bde_work *wp) { struct g_bde_softc *sc; KASSERT(wp != NULL, ("NULL wp in g_bde_start2")); KASSERT(wp->softc != NULL, ("NULL wp->softc")); g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp); sc = wp->softc; switch (wp->bp->bio_cmd) { case BIO_READ: wp->sp = g_bde_new_sector(wp, 0); if (wp->sp == NULL) { g_bde_work_done(wp, ENOMEM); return; } wp->sp->size = wp->length; wp->sp->data = wp->data; if (g_bde_start_read(wp->sp) != 0) { g_bde_work_done(wp, ENOMEM); return; } g_bde_read_keysector(sc, wp); if (wp->ksp == NULL) wp->error = ENOMEM; break; case BIO_DELETE: wp->sp = g_bde_new_sector(wp, wp->length); if (wp->sp == NULL) { g_bde_work_done(wp, ENOMEM); return; } break; case BIO_WRITE: wp->sp = g_bde_new_sector(wp, wp->length); if (wp->sp == NULL) { g_bde_work_done(wp, ENOMEM); return; } g_bde_read_keysector(sc, wp); if (wp->ksp == NULL) { g_bde_work_done(wp, ENOMEM); return; } break; default: KASSERT(0 == 1, ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd)); } wp->state = WAIT; wakeup(sc); } /* * Create a sequence of work structures, and have g_bde_map_sector() determine * how long they each can be. Feed them to g_bde_start2(). */ void g_bde_start1(struct bio *bp) { struct g_bde_softc *sc; struct g_bde_work *wp; off_t done; sc = bp->bio_to->geom->softc; bp->bio_driver1 = sc; mtx_lock(&sc->worklist_mutex); for(done = 0; done < bp->bio_length; ) { wp = g_bde_new_work(sc); if (wp != NULL) { wp->bp = bp; wp->offset = bp->bio_offset + done; wp->data = bp->bio_data + done; wp->length = bp->bio_length - done; g_bde_map_sector(wp); done += wp->length; g_bde_start2(wp); } if (wp == NULL || bp->bio_error != 0) { g_bde_contribute(bp, bp->bio_length - done, ENOMEM); break; } } mtx_unlock(&sc->worklist_mutex); return; } Index: head/sys/geom/cache/g_cache.c =================================================================== --- head/sys/geom/cache/g_cache.c (revision 326269) +++ head/sys/geom/cache/g_cache.c (revision 326270) @@ -1,1016 +1,1018 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006 Ruslan Ermilov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_cache, "GEOM cache module"); static MALLOC_DEFINE(M_GCACHE, "gcache_data", "GEOM_CACHE Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, cache, CTLFLAG_RW, 0, "GEOM_CACHE stuff"); static u_int g_cache_debug = 0; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, debug, CTLFLAG_RW, &g_cache_debug, 0, "Debug level"); static u_int g_cache_enable = 1; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, enable, CTLFLAG_RW, &g_cache_enable, 0, ""); static u_int g_cache_timeout = 10; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, timeout, CTLFLAG_RW, &g_cache_timeout, 0, ""); static u_int g_cache_idletime = 5; SYSCTL_UINT(_kern_geom_cache, OID_AUTO, idletime, CTLFLAG_RW, &g_cache_idletime, 0, ""); static u_int g_cache_used_lo = 5; static u_int g_cache_used_hi = 20; static int sysctl_handle_pct(SYSCTL_HANDLER_ARGS) { u_int val = *(u_int *)arg1; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val > 100) return (EINVAL); if ((arg1 == &g_cache_used_lo && val > g_cache_used_hi) || (arg1 == &g_cache_used_hi && g_cache_used_lo > val)) return (EINVAL); *(u_int *)arg1 = val; return (0); } SYSCTL_PROC(_kern_geom_cache, OID_AUTO, used_lo, CTLTYPE_UINT|CTLFLAG_RW, &g_cache_used_lo, 0, sysctl_handle_pct, "IU", ""); SYSCTL_PROC(_kern_geom_cache, OID_AUTO, used_hi, CTLTYPE_UINT|CTLFLAG_RW, &g_cache_used_hi, 0, sysctl_handle_pct, "IU", ""); static int g_cache_destroy(struct g_cache_softc *sc, boolean_t force); static g_ctl_destroy_geom_t g_cache_destroy_geom; static g_taste_t g_cache_taste; static g_ctl_req_t g_cache_config; static g_dumpconf_t g_cache_dumpconf; struct g_class g_cache_class = { .name = G_CACHE_CLASS_NAME, .version = G_VERSION, .ctlreq = g_cache_config, .taste = g_cache_taste, .destroy_geom = g_cache_destroy_geom }; #define OFF2BNO(off, sc) ((off) >> (sc)->sc_bshift) #define BNO2OFF(bno, sc) ((bno) << (sc)->sc_bshift) static struct g_cache_desc * g_cache_alloc(struct g_cache_softc *sc) { struct g_cache_desc *dp; mtx_assert(&sc->sc_mtx, MA_OWNED); if (!TAILQ_EMPTY(&sc->sc_usedlist)) { dp = TAILQ_FIRST(&sc->sc_usedlist); TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; dp->d_flags = 0; LIST_REMOVE(dp, d_next); return (dp); } if (sc->sc_nent > sc->sc_maxent) { sc->sc_cachefull++; return (NULL); } dp = malloc(sizeof(*dp), M_GCACHE, M_NOWAIT | M_ZERO); if (dp == NULL) return (NULL); dp->d_data = uma_zalloc(sc->sc_zone, M_NOWAIT); if (dp->d_data == NULL) { free(dp, M_GCACHE); return (NULL); } sc->sc_nent++; return (dp); } static void g_cache_free(struct g_cache_softc *sc, struct g_cache_desc *dp) { mtx_assert(&sc->sc_mtx, MA_OWNED); uma_zfree(sc->sc_zone, dp->d_data); free(dp, M_GCACHE); sc->sc_nent--; } static void g_cache_free_used(struct g_cache_softc *sc) { struct g_cache_desc *dp; u_int n; mtx_assert(&sc->sc_mtx, MA_OWNED); n = g_cache_used_lo * sc->sc_maxent / 100; while (sc->sc_nused > n) { KASSERT(!TAILQ_EMPTY(&sc->sc_usedlist), ("used list empty")); dp = TAILQ_FIRST(&sc->sc_usedlist); TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; LIST_REMOVE(dp, d_next); g_cache_free(sc, dp); } } static void g_cache_deliver(struct g_cache_softc *sc, struct bio *bp, struct g_cache_desc *dp, int error) { off_t off1, off, len; mtx_assert(&sc->sc_mtx, MA_OWNED); KASSERT(OFF2BNO(bp->bio_offset, sc) <= dp->d_bno, ("wrong entry")); KASSERT(OFF2BNO(bp->bio_offset 
+ bp->bio_length - 1, sc) >= dp->d_bno, ("wrong entry")); off1 = BNO2OFF(dp->d_bno, sc); off = MAX(bp->bio_offset, off1); len = MIN(bp->bio_offset + bp->bio_length, off1 + sc->sc_bsize) - off; if (bp->bio_error == 0) bp->bio_error = error; if (bp->bio_error == 0) { bcopy(dp->d_data + (off - off1), bp->bio_data + (off - bp->bio_offset), len); } bp->bio_completed += len; KASSERT(bp->bio_completed <= bp->bio_length, ("extra data")); if (bp->bio_completed == bp->bio_length) { if (bp->bio_error != 0) bp->bio_completed = 0; g_io_deliver(bp, bp->bio_error); } if (dp->d_flags & D_FLAG_USED) { TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); } else if (OFF2BNO(off + len, sc) > dp->d_bno) { TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); sc->sc_nused++; dp->d_flags |= D_FLAG_USED; } dp->d_atime = time_uptime; } static void g_cache_done(struct bio *bp) { struct g_cache_softc *sc; struct g_cache_desc *dp; struct bio *bp2, *tmpbp; sc = bp->bio_from->geom->softc; KASSERT(G_CACHE_DESC1(bp) == sc, ("corrupt bio_caller in g_cache_done()")); dp = G_CACHE_DESC2(bp); mtx_lock(&sc->sc_mtx); bp2 = dp->d_biolist; while (bp2 != NULL) { KASSERT(G_CACHE_NEXT_BIO1(bp2) == sc, ("corrupt bio_driver in g_cache_done()")); tmpbp = G_CACHE_NEXT_BIO2(bp2); g_cache_deliver(sc, bp2, dp, bp->bio_error); bp2 = tmpbp; } dp->d_biolist = NULL; if (dp->d_flags & D_FLAG_INVALID) { sc->sc_invalid--; g_cache_free(sc, dp); } else if (bp->bio_error) { LIST_REMOVE(dp, d_next); if (dp->d_flags & D_FLAG_USED) { TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; } g_cache_free(sc, dp); } mtx_unlock(&sc->sc_mtx); g_destroy_bio(bp); } static struct g_cache_desc * g_cache_lookup(struct g_cache_softc *sc, off_t bno) { struct g_cache_desc *dp; mtx_assert(&sc->sc_mtx, MA_OWNED); LIST_FOREACH(dp, &sc->sc_desclist[G_CACHE_BUCKET(bno)], d_next) if (dp->d_bno == bno) return (dp); return (NULL); } static int g_cache_read(struct g_cache_softc *sc, struct bio *bp) { struct bio *cbp; struct g_cache_desc *dp; mtx_lock(&sc->sc_mtx); dp = g_cache_lookup(sc, OFF2BNO(bp->bio_offset + bp->bio_completed, sc)); if (dp != NULL) { /* Add to waiters list or deliver. */ sc->sc_cachehits++; if (dp->d_biolist != NULL) { G_CACHE_NEXT_BIO1(bp) = sc; G_CACHE_NEXT_BIO2(bp) = dp->d_biolist; dp->d_biolist = bp; } else g_cache_deliver(sc, bp, dp, 0); mtx_unlock(&sc->sc_mtx); return (0); } /* Cache miss. Allocate entry and schedule bio. 
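*/

/*
 * [Editor's illustration, not part of the original source] A minimal
 * userland sketch of the clipping arithmetic used by g_cache_deliver()
 * above: the request [bio_offset, bio_offset + bio_length) is clipped
 * against one cache block [off1, off1 + bsize).  All names and numbers
 * below are hypothetical.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	long long bsize = 65536;	/* sc->sc_bsize */
	long long off1 = 2 * bsize;	/* BNO2OFF(d_bno, sc): block #2 */
	long long bio_offset = 100000;
	long long bio_length = 65536;
	long long off, len;

	off = bio_offset > off1 ? bio_offset : off1;		/* MAX() */
	len = ((bio_offset + bio_length < off1 + bsize) ?
	    bio_offset + bio_length : off1 + bsize) - off;	/* MIN() - off */

	/* g_cache_deliver() copies len bytes from d_data + (off - off1). */
	printf("copy %lld bytes starting at block offset %lld\n",
	    len, off - off1);
	return (0);
}
#endif

/*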
*/ sc->sc_cachemisses++; dp = g_cache_alloc(sc); if (dp == NULL) { mtx_unlock(&sc->sc_mtx); return (ENOMEM); } cbp = g_clone_bio(bp); if (cbp == NULL) { g_cache_free(sc, dp); mtx_unlock(&sc->sc_mtx); return (ENOMEM); } dp->d_bno = OFF2BNO(bp->bio_offset + bp->bio_completed, sc); G_CACHE_NEXT_BIO1(bp) = sc; G_CACHE_NEXT_BIO2(bp) = NULL; dp->d_biolist = bp; LIST_INSERT_HEAD(&sc->sc_desclist[G_CACHE_BUCKET(dp->d_bno)], dp, d_next); mtx_unlock(&sc->sc_mtx); G_CACHE_DESC1(cbp) = sc; G_CACHE_DESC2(cbp) = dp; cbp->bio_done = g_cache_done; cbp->bio_offset = BNO2OFF(dp->d_bno, sc); cbp->bio_data = dp->d_data; cbp->bio_length = sc->sc_bsize; g_io_request(cbp, LIST_FIRST(&bp->bio_to->geom->consumer)); return (0); } static void g_cache_invalidate(struct g_cache_softc *sc, struct bio *bp) { struct g_cache_desc *dp; off_t bno, lim; mtx_lock(&sc->sc_mtx); bno = OFF2BNO(bp->bio_offset, sc); lim = OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc); do { if ((dp = g_cache_lookup(sc, bno)) != NULL) { LIST_REMOVE(dp, d_next); if (dp->d_flags & D_FLAG_USED) { TAILQ_REMOVE(&sc->sc_usedlist, dp, d_used); sc->sc_nused--; } if (dp->d_biolist == NULL) g_cache_free(sc, dp); else { dp->d_flags = D_FLAG_INVALID; sc->sc_invalid++; } } bno++; } while (bno <= lim); mtx_unlock(&sc->sc_mtx); } static void g_cache_start(struct bio *bp) { struct g_cache_softc *sc; struct g_geom *gp; struct g_cache_desc *dp; struct bio *cbp; gp = bp->bio_to->geom; sc = gp->softc; G_CACHE_LOGREQ(bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: sc->sc_reads++; sc->sc_readbytes += bp->bio_length; if (!g_cache_enable) break; if (bp->bio_offset + bp->bio_length > sc->sc_tail) break; if (OFF2BNO(bp->bio_offset, sc) == OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc)) { sc->sc_cachereads++; sc->sc_cachereadbytes += bp->bio_length; if (g_cache_read(sc, bp) == 0) return; sc->sc_cachereads--; sc->sc_cachereadbytes -= bp->bio_length; break; } else if (OFF2BNO(bp->bio_offset, sc) + 1 == OFF2BNO(bp->bio_offset + bp->bio_length - 1, sc)) { mtx_lock(&sc->sc_mtx); dp = g_cache_lookup(sc, OFF2BNO(bp->bio_offset, sc)); if (dp == NULL || dp->d_biolist != NULL) { mtx_unlock(&sc->sc_mtx); break; } sc->sc_cachereads++; sc->sc_cachereadbytes += bp->bio_length; g_cache_deliver(sc, bp, dp, 0); mtx_unlock(&sc->sc_mtx); if (g_cache_read(sc, bp) == 0) return; sc->sc_cachereads--; sc->sc_cachereadbytes -= bp->bio_length; break; } break; case BIO_WRITE: sc->sc_writes++; sc->sc_wrotebytes += bp->bio_length; g_cache_invalidate(sc, bp); break; } cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; G_CACHE_LOGREQ(cbp, "Sending request."); g_io_request(cbp, LIST_FIRST(&gp->consumer)); } static void g_cache_go(void *arg) { struct g_cache_softc *sc = arg; struct g_cache_desc *dp; int i; mtx_assert(&sc->sc_mtx, MA_OWNED); /* Forcibly mark idle ready entries as used. */ for (i = 0; i < G_CACHE_BUCKETS; i++) { LIST_FOREACH(dp, &sc->sc_desclist[i], d_next) { if (dp->d_flags & D_FLAG_USED || dp->d_biolist != NULL || time_uptime - dp->d_atime < g_cache_idletime) continue; TAILQ_INSERT_TAIL(&sc->sc_usedlist, dp, d_used); sc->sc_nused++; dp->d_flags |= D_FLAG_USED; } } /* Keep the number of used entries low. 
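*/

/*
 * [Editor's note, not part of the original source] The lo/hi pair declared
 * above forms a simple hysteresis.  With the defaults (used_lo = 5,
 * used_hi = 20) and a hypothetical sc_maxent of 1000 entries, trimming
 * starts only once sc_nused exceeds 200, and g_cache_free_used() then
 * frees entries down to 50 -- so the timer does a meaningful batch of work
 * instead of shaving off a few entries on every tick.
 */

/*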
*/ if (sc->sc_nused > g_cache_used_hi * sc->sc_maxent / 100) g_cache_free_used(sc); callout_reset(&sc->sc_callout, g_cache_timeout * hz, g_cache_go, sc); } static int g_cache_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; int error; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); error = g_access(cp, dr, dw, de); return (error); } static void g_cache_orphan(struct g_consumer *cp) { g_topology_assert(); g_cache_destroy(cp->geom->softc, 1); } static struct g_cache_softc * g_cache_find_device(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp->softc); } return (NULL); } static struct g_geom * g_cache_create(struct g_class *mp, struct g_provider *pp, const struct g_cache_metadata *md, u_int type) { struct g_cache_softc *sc; struct g_geom *gp; struct g_provider *newpp; struct g_consumer *cp; u_int bshift; int i; g_topology_assert(); gp = NULL; newpp = NULL; cp = NULL; G_CACHE_DEBUG(1, "Creating device %s.", md->md_name); /* The cache size must be at least 100 blocks. */ if (md->md_size < 100) { G_CACHE_DEBUG(0, "Invalid size for device %s.", md->md_name); return (NULL); } /* Block size restrictions. */ bshift = ffs(md->md_bsize) - 1; if (md->md_bsize == 0 || md->md_bsize > MAXPHYS || md->md_bsize != 1 << bshift || (md->md_bsize % pp->sectorsize) != 0) { G_CACHE_DEBUG(0, "Invalid blocksize for provider %s.", pp->name); return (NULL); } /* Check for duplicate unit. */ if (g_cache_find_device(mp, (const char *)&md->md_name) != NULL) { G_CACHE_DEBUG(0, "Provider %s already exists.", md->md_name); return (NULL); } gp = g_new_geomf(mp, "%s", md->md_name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); sc->sc_type = type; sc->sc_bshift = bshift; sc->sc_bsize = 1 << bshift; sc->sc_zone = uma_zcreate("gcache", sc->sc_bsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&sc->sc_mtx, "GEOM CACHE mutex", NULL, MTX_DEF); for (i = 0; i < G_CACHE_BUCKETS; i++) LIST_INIT(&sc->sc_desclist[i]); TAILQ_INIT(&sc->sc_usedlist); sc->sc_maxent = md->md_size; callout_init_mtx(&sc->sc_callout, &sc->sc_mtx, 0); gp->softc = sc; sc->sc_geom = gp; gp->start = g_cache_start; gp->orphan = g_cache_orphan; gp->access = g_cache_access; gp->dumpconf = g_cache_dumpconf; newpp = g_new_providerf(gp, "cache/%s", gp->name); newpp->sectorsize = pp->sectorsize; newpp->mediasize = pp->mediasize; if (type == G_CACHE_TYPE_AUTOMATIC) newpp->mediasize -= pp->sectorsize; sc->sc_tail = BNO2OFF(OFF2BNO(newpp->mediasize, sc), sc); cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { G_CACHE_DEBUG(0, "Cannot attach to provider %s.", pp->name); g_destroy_consumer(cp); g_destroy_provider(newpp); mtx_destroy(&sc->sc_mtx); g_free(sc); g_destroy_geom(gp); return (NULL); } g_error_provider(newpp, 0); G_CACHE_DEBUG(0, "Device %s created.", gp->name); callout_reset(&sc->sc_callout, g_cache_timeout * hz, g_cache_go, sc); return (gp); } static int g_cache_destroy(struct g_cache_softc *sc, boolean_t force) { struct g_geom *gp; struct g_provider *pp; struct g_cache_desc *dp, *dp2; int i; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_CACHE_DEBUG(0, "Device %s is still open, so it " "can't be removed definitively.", pp->name); } else { G_CACHE_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } else { G_CACHE_DEBUG(0, "Device %s removed.",
gp->name); } callout_drain(&sc->sc_callout); mtx_lock(&sc->sc_mtx); for (i = 0; i < G_CACHE_BUCKETS; i++) { dp = LIST_FIRST(&sc->sc_desclist[i]); while (dp != NULL) { dp2 = LIST_NEXT(dp, d_next); g_cache_free(sc, dp); dp = dp2; } } mtx_unlock(&sc->sc_mtx); mtx_destroy(&sc->sc_mtx); uma_zdestroy(sc->sc_zone); g_free(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static int g_cache_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_cache_destroy(gp->softc, 0)); } static int g_cache_read_metadata(struct g_consumer *cp, struct g_cache_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ cache_metadata_decode(buf, md); g_free(buf); return (0); } static int g_cache_write_metadata(struct g_consumer *cp, struct g_cache_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 0, 1, 0); if (error != 0) return (error); pp = cp->provider; buf = malloc((size_t)pp->sectorsize, M_GCACHE, M_WAITOK | M_ZERO); cache_metadata_encode(md, buf); g_topology_unlock(); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, 0, -1, 0); free(buf, M_GCACHE); return (error); } static struct g_geom * g_cache_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_cache_metadata md; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); G_CACHE_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "cache:taste"); gp->start = g_cache_start; gp->orphan = g_cache_orphan; gp->access = g_cache_access; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_cache_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); if (strcmp(md.md_magic, G_CACHE_MAGIC) != 0) return (NULL); if (md.md_version > G_CACHE_VERSION) { printf("geom_cache.ko module is too old to handle %s.\n", pp->name); return (NULL); } if (md.md_provsize != pp->mediasize) return (NULL); gp = g_cache_create(mp, pp, &md, G_CACHE_TYPE_AUTOMATIC); if (gp == NULL) { G_CACHE_DEBUG(0, "Can't create %s.", md.md_name); return (NULL); } return (gp); } static void g_cache_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_cache_metadata md; struct g_provider *pp; struct g_geom *gp; intmax_t *bsize, *size; const char *name; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } strlcpy(md.md_magic, G_CACHE_MAGIC, sizeof(md.md_magic)); md.md_version = G_CACHE_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); size = gctl_get_paraml(req, "size", sizeof(*size)); if (size == NULL) { gctl_error(req, "No '%s' argument", "size"); return; } if ((u_int)*size < 100) { gctl_error(req, "Invalid '%s' argument", "size"); return; } md.md_size = (u_int)*size; bsize = gctl_get_paraml(req, "blocksize", sizeof(*bsize)); if (bsize == NULL) { gctl_error(req, "No 
'%s' argument", "blocksize"); return; } if (*bsize < 0) { gctl_error(req, "Invalid '%s' argument", "blocksize"); return; } md.md_bsize = (u_int)*bsize; /* This field is not important here. */ md.md_provsize = 0; name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_CACHE_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } gp = g_cache_create(mp, pp, &md, G_CACHE_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't create %s.", md.md_name); return; } } static void g_cache_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_cache_metadata md; struct g_cache_softc *sc; struct g_consumer *cp; intmax_t *bsize, *size; const char *name; int error, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Missing device."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } sc = g_cache_find_device(mp, name); if (sc == NULL) { G_CACHE_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } size = gctl_get_paraml(req, "size", sizeof(*size)); if (size == NULL) { gctl_error(req, "No '%s' argument", "size"); return; } if ((u_int)*size != 0 && (u_int)*size < 100) { gctl_error(req, "Invalid '%s' argument", "size"); return; } if ((u_int)*size != 0) sc->sc_maxent = (u_int)*size; bsize = gctl_get_paraml(req, "blocksize", sizeof(*bsize)); if (bsize == NULL) { gctl_error(req, "No '%s' argument", "blocksize"); return; } if (*bsize < 0) { gctl_error(req, "Invalid '%s' argument", "blocksize"); return; } if (sc->sc_type != G_CACHE_TYPE_AUTOMATIC) return; strlcpy(md.md_name, name, sizeof(md.md_name)); strlcpy(md.md_magic, G_CACHE_MAGIC, sizeof(md.md_magic)); md.md_version = G_CACHE_VERSION; if ((u_int)*size != 0) md.md_size = (u_int)*size; else md.md_size = sc->sc_maxent; if ((u_int)*bsize != 0) md.md_bsize = (u_int)*bsize; else md.md_bsize = sc->sc_bsize; cp = LIST_FIRST(&sc->sc_geom->consumer); md.md_provsize = cp->provider->mediasize; error = g_cache_write_metadata(cp, &md); if (error == 0) G_CACHE_DEBUG(2, "Metadata on %s updated.", cp->provider->name); else G_CACHE_DEBUG(0, "Cannot update metadata on %s (error=%d).", cp->provider->name, error); } static void g_cache_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_cache_softc *sc; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } sc = g_cache_find_device(mp, name); if (sc == NULL) { G_CACHE_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } error = g_cache_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, 
error); return; } } } static void g_cache_ctl_reset(struct gctl_req *req, struct g_class *mp) { struct g_cache_softc *sc; const char *name; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } sc = g_cache_find_device(mp, name); if (sc == NULL) { G_CACHE_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } sc->sc_reads = 0; sc->sc_readbytes = 0; sc->sc_cachereads = 0; sc->sc_cachereadbytes = 0; sc->sc_cachehits = 0; sc->sc_cachemisses = 0; sc->sc_cachefull = 0; sc->sc_writes = 0; sc->sc_wrotebytes = 0; } } static void g_cache_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_CACHE_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_cache_ctl_create(req, mp); return; } else if (strcmp(verb, "configure") == 0) { g_cache_ctl_configure(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_cache_ctl_destroy(req, mp); return; } else if (strcmp(verb, "reset") == 0) { g_cache_ctl_reset(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_cache_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_cache_softc *sc; if (pp != NULL || cp != NULL) return; sc = gp->softc; sbuf_printf(sb, "%s%u\n", indent, sc->sc_maxent); sbuf_printf(sb, "%s%u\n", indent, sc->sc_bsize); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)sc->sc_tail); sbuf_printf(sb, "%s%u\n", indent, sc->sc_nent); sbuf_printf(sb, "%s%u\n", indent, sc->sc_nused); sbuf_printf(sb, "%s%u\n", indent, sc->sc_invalid); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_reads); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_readbytes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachereads); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachereadbytes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachehits); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachemisses); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cachefull); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_writes); sbuf_printf(sb, "%s%ju\n", indent, sc->sc_wrotebytes); } DECLARE_GEOM_CLASS(g_cache_class, g_cache); Index: head/sys/geom/cache/g_cache.h =================================================================== --- head/sys/geom/cache/g_cache.h (revision 326269) +++ head/sys/geom/cache/g_cache.h (revision 326270) @@ -1,146 +1,148 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006 Ruslan Ermilov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_CACHE_H_ #define _G_CACHE_H_ #include #define G_CACHE_CLASS_NAME "CACHE" #define G_CACHE_MAGIC "GEOM::CACHE" #define G_CACHE_VERSION 1 #ifdef _KERNEL #define G_CACHE_TYPE_MANUAL 0 #define G_CACHE_TYPE_AUTOMATIC 1 #define G_CACHE_DEBUG(lvl, ...) do { \ if (g_cache_debug >= (lvl)) { \ printf("GEOM_CACHE"); \ if (g_cache_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_CACHE_LOGREQ(bp, ...) do { \ if (g_cache_debug >= 2) { \ printf("GEOM_CACHE[2]: "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) #define G_CACHE_BUCKETS (1 << 3) #define G_CACHE_BUCKET(bno) ((bno) & (G_CACHE_BUCKETS - 1)) struct g_cache_softc { struct g_geom *sc_geom; int sc_type; u_int sc_bshift; u_int sc_bsize; off_t sc_tail; struct mtx sc_mtx; struct callout sc_callout; LIST_HEAD(, g_cache_desc) sc_desclist[G_CACHE_BUCKETS]; TAILQ_HEAD(, g_cache_desc) sc_usedlist; uma_zone_t sc_zone; u_int sc_maxent; /* max entries */ u_int sc_nent; /* allocated entries */ u_int sc_nused; /* reusable entries */ u_int sc_invalid; /* invalid entries */ uintmax_t sc_reads; /* #reads */ uintmax_t sc_readbytes; /* bytes read */ uintmax_t sc_cachereads; /* #reads from cache */ uintmax_t sc_cachereadbytes; /* bytes read from cache */ uintmax_t sc_cachehits; /* cache hits */ uintmax_t sc_cachemisses; /* cache misses */ uintmax_t sc_cachefull; /* #times a cache was full */ uintmax_t sc_writes; /* #writes */ uintmax_t sc_wrotebytes; /* bytes written */ }; #define sc_name sc_geom->name struct g_cache_desc { off_t d_bno; /* block number */ caddr_t d_data; /* data area */ struct bio *d_biolist; /* waiters */ time_t d_atime; /* access time */ int d_flags; /* flags */ #define D_FLAG_USED (1 << 0) /* can be reused */ #define D_FLAG_INVALID (1 << 1) /* invalid */ LIST_ENTRY(g_cache_desc) d_next; /* list */ TAILQ_ENTRY(g_cache_desc) d_used; /* used list */ }; #define G_CACHE_NEXT_BIO1(bp) (bp)->bio_driver1 #define G_CACHE_NEXT_BIO2(bp) (bp)->bio_driver2 #define G_CACHE_DESC1(bp) (bp)->bio_caller1 #define G_CACHE_DESC2(bp) (bp)->bio_caller2 #endif /* _KERNEL */ struct g_cache_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Cache name. */ uint32_t md_bsize; /* Cache block size. */ uint32_t md_size; /* Cache size. */ uint64_t md_provsize; /* Provider's size.
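*/

/*
 * [Editor's note, not part of the original source] On-disk layout implied
 * by cache_metadata_encode() below -- little-endian, 52 bytes, stored in
 * the provider's last sector:
 *
 *	offset	size	field
 *	     0	  16	md_magic	("GEOM::CACHE")
 *	    16	   4	md_version
 *	    20	  16	md_name
 *	    36	   4	md_bsize
 *	    40	   4	md_size
 *	    44	   8	md_provsize
 */

/*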
*/ }; static __inline void cache_metadata_encode(const struct g_cache_metadata *md, u_char *data) { bcopy(md->md_magic, data, sizeof(md->md_magic)); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, sizeof(md->md_name)); le32enc(data + 36, md->md_bsize); le32enc(data + 40, md->md_size); le64enc(data + 44, md->md_provsize); } static __inline void cache_metadata_decode(const u_char *data, struct g_cache_metadata *md) { bcopy(data, md->md_magic, sizeof(md->md_magic)); md->md_version = le32dec(data + 16); bcopy(data + 20, md->md_name, sizeof(md->md_name)); md->md_bsize = le32dec(data + 36); md->md_size = le32dec(data + 40); md->md_provsize = le64dec(data + 44); } #endif /* _G_CACHE_H_ */ Index: head/sys/geom/concat/g_concat.c =================================================================== --- head/sys/geom/concat/g_concat.c (revision 326269) +++ head/sys/geom/concat/g_concat.c (revision 326270) @@ -1,993 +1,995 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_concat, "GEOM concatenation support"); static MALLOC_DEFINE(M_CONCAT, "concat_data", "GEOM_CONCAT Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, concat, CTLFLAG_RW, 0, "GEOM_CONCAT stuff"); static u_int g_concat_debug = 0; SYSCTL_UINT(_kern_geom_concat, OID_AUTO, debug, CTLFLAG_RWTUN, &g_concat_debug, 0, "Debug level"); static int g_concat_destroy(struct g_concat_softc *sc, boolean_t force); static int g_concat_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_concat_taste; static g_ctl_req_t g_concat_config; static g_dumpconf_t g_concat_dumpconf; struct g_class g_concat_class = { .name = G_CONCAT_CLASS_NAME, .version = G_VERSION, .ctlreq = g_concat_config, .taste = g_concat_taste, .destroy_geom = g_concat_destroy_geom }; /* * Greatest Common Divisor. */ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. 
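*/

/*
 * [Editor's note, not part of the original source] A worked example of why
 * lcm() below is the right combination rule for sector sizes: for
 * components with 512- and 4096-byte sectors, gcd(512, 4096) = 512, so
 * lcm(512, 4096) = 512 * 4096 / 512 = 4096 -- the smallest size every
 * component can serve whole.  Note that the intermediate product (a * b)
 * could overflow u_int for large coprime inputs; real sector sizes are
 * small powers of two, so it stays safe in practice.
 */

/*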
*/ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } /* * Return the number of valid disks. */ static u_int g_concat_nvalid(struct g_concat_softc *sc) { u_int i, no; no = 0; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i].d_consumer != NULL) no++; } return (no); } static void g_concat_remove_disk(struct g_concat_disk *disk) { struct g_consumer *cp; struct g_concat_softc *sc; g_topology_assert(); KASSERT(disk->d_consumer != NULL, ("Non-valid disk in %s.", __func__)); sc = disk->d_softc; cp = disk->d_consumer; if (!disk->d_removed) { G_CONCAT_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, sc->sc_name); disk->d_removed = 1; } if (sc->sc_provider != NULL) { G_CONCAT_DEBUG(0, "Device %s deactivated.", sc->sc_provider->name); g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) return; disk->d_consumer = NULL; g_detach(cp); g_destroy_consumer(cp); /* If there are no valid disks anymore, remove device. */ if (LIST_EMPTY(&sc->sc_geom->consumer)) g_concat_destroy(sc, 1); } static void g_concat_orphan(struct g_consumer *cp) { struct g_concat_softc *sc; struct g_concat_disk *disk; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; disk = cp->private; if (disk == NULL) /* Possible? */ return; g_concat_remove_disk(disk); } static int g_concat_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *cp1, *cp2, *tmp; struct g_concat_disk *disk; struct g_geom *gp; int error; g_topology_assert(); gp = pp->geom; /* On first open, grab an extra "exclusive" bit */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... and let go of it on last close */ if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0) de--; LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) { error = g_access(cp1, dr, dw, de); if (error != 0) goto fail; disk = cp1->private; if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 && disk->d_removed) { g_concat_remove_disk(disk); /* May destroy geom. 
*/ } } return (0); fail: LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) break; g_access(cp2, -dr, -dw, -de); } return (error); } static void g_concat_kernel_dump(struct bio *bp) { struct g_concat_softc *sc; struct g_concat_disk *disk; struct bio *cbp; struct g_kerneldump *gkd; u_int i; sc = bp->bio_to->geom->softc; gkd = (struct g_kerneldump *)bp->bio_data; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i].d_start <= gkd->offset && sc->sc_disks[i].d_end > gkd->offset) break; } if (i == sc->sc_ndisks) { /* Offset is beyond the last component; bail out here, or the code below would index past the end of sc_disks[]. */ g_io_deliver(bp, EOPNOTSUPP); return; } disk = &sc->sc_disks[i]; gkd->offset -= disk->d_start; if (gkd->length > disk->d_end - disk->d_start - gkd->offset) gkd->length = disk->d_end - disk->d_start - gkd->offset; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; g_io_request(cbp, disk->d_consumer); G_CONCAT_DEBUG(1, "Kernel dump will go to %s.", disk->d_consumer->provider->name); } static void g_concat_done(struct bio *bp) { struct g_concat_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; mtx_lock(&sc->sc_lock); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += bp->bio_completed; pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { mtx_unlock(&sc->sc_lock); g_io_deliver(pbp, pbp->bio_error); } else mtx_unlock(&sc->sc_lock); g_destroy_bio(bp); } static void g_concat_flush(struct g_concat_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; u_int no; bioq_init(&queue); for (no = 0; no < sc->sc_ndisks; no++) { cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_concat_done; cbp->bio_caller1 = sc->sc_disks[no].d_consumer; cbp->bio_to = sc->sc_disks[no].d_consumer->provider; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_CONCAT_LOGREQ(cbp, "Sending request."); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_io_request(cbp, cp); } } static void g_concat_start(struct bio *bp) { struct bio_queue_head queue; struct g_concat_softc *sc; struct g_concat_disk *disk; struct g_provider *pp; off_t offset, end, length, off, len; struct bio *cbp; char *addr; u_int no; pp = bp->bio_to; sc = pp->geom->softc; /* * If sc == NULL, provider's error should be set and g_concat_start() * should not be called at all. */ KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_CONCAT_LOGREQ(bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_concat_flush(sc, bp); return; case BIO_GETATTR: if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) { g_concat_kernel_dump(bp); return; } /* To which provider should it be delivered?
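*/

/*
 * [Editor's illustration, not part of the original source] How
 * g_concat_kernel_dump() above rebases a dump offset onto one component;
 * the component sizes are hypothetical.
 */
#if 0
	/* Two components: 1 GB (da0) and 2 GB (da1). */
	off_t d_start[2] = { 0, 1LL << 30 };
	off_t d_end[2] = { 1LL << 30, 3LL << 30 };
	off_t offset = 3LL << 29;	/* dump target at 1.5 GB */

	/* d_start[1] <= offset && offset < d_end[1], so disk #1 wins... */
	offset -= d_start[1];		/* ... and lands 0.5 GB into da1. */
#endif

/*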
*/ /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } offset = bp->bio_offset; length = bp->bio_length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; end = offset + length; bioq_init(&queue); for (no = 0; no < sc->sc_ndisks; no++) { disk = &sc->sc_disks[no]; if (disk->d_end <= offset) continue; if (disk->d_start >= end) break; off = offset - disk->d_start; len = MIN(length, disk->d_end - offset); length -= len; offset += len; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); /* * Fill in the component buf structure. */ if (len == bp->bio_length) cbp->bio_done = g_std_done; else cbp->bio_done = g_concat_done; cbp->bio_offset = off; cbp->bio_length = len; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; addr += len; cbp->bio_to = disk->d_consumer->provider; cbp->bio_caller1 = disk; if (length == 0) break; } KASSERT(length == 0, ("Length is still greater than 0 (class=%s, name=%s).", bp->bio_to->geom->class->name, bp->bio_to->geom->name)); while ((cbp = bioq_takefirst(&queue)) != NULL) { G_CONCAT_LOGREQ(cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_io_request(cbp, disk->d_consumer); } } static void g_concat_check_and_run(struct g_concat_softc *sc) { struct g_concat_disk *disk; struct g_provider *dp, *pp; u_int no, sectorsize = 0; off_t start; g_topology_assert(); if (g_concat_nvalid(sc) != sc->sc_ndisks) return; pp = g_new_providerf(sc->sc_geom, "concat/%s", sc->sc_name); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE | G_PF_ACCEPT_UNMAPPED; start = 0; for (no = 0; no < sc->sc_ndisks; no++) { disk = &sc->sc_disks[no]; dp = disk->d_consumer->provider; disk->d_start = start; disk->d_end = disk->d_start + dp->mediasize; if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC) disk->d_end -= dp->sectorsize; start = disk->d_end; if (no == 0) sectorsize = dp->sectorsize; else sectorsize = lcm(sectorsize, dp->sectorsize); /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_CONCAT_DEBUG(1, "Cancelling unmapped " "because of %s.", dp->name); pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } pp->sectorsize = sectorsize; /* We have sc->sc_disks[sc->sc_ndisks - 1].d_end in 'start'. */ pp->mediasize = start; pp->stripesize = sc->sc_disks[0].d_consumer->provider->stripesize; pp->stripeoffset = sc->sc_disks[0].d_consumer->provider->stripeoffset; sc->sc_provider = pp; g_error_provider(pp, 0); G_CONCAT_DEBUG(0, "Device %s activated.", sc->sc_provider->name); } static int g_concat_read_metadata(struct g_consumer *cp, struct g_concat_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ concat_metadata_decode(buf, md); g_free(buf); return (0); } /* * Add disk to given device. 
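*/

/*
 * [Editor's note, not part of the original source] How
 * g_concat_check_and_run() above lays out three hypothetical components
 * with 512-byte sectors in automatic mode, where the last sector of each
 * component is reserved for metadata:
 *
 *	disk	mediasize	d_start		d_end
 *	da0	 1048576	      0		1048064
 *	da1	 2097152	1048064		3144704
 *	da2	 1048576	3144704		4192768
 *
 * The resulting provider's mediasize is 4192768 and its sectorsize is the
 * lcm of the component sector sizes.
 */

/*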
*/ static int g_concat_add_disk(struct g_concat_softc *sc, struct g_provider *pp, u_int no) { struct g_concat_disk *disk; struct g_consumer *cp, *fcp; struct g_geom *gp; int error; g_topology_assert(); /* Metadata corrupted? */ if (no >= sc->sc_ndisks) return (EINVAL); disk = &sc->sc_disks[no]; /* Check if disk is not already attached. */ if (disk->d_consumer != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC) { struct g_concat_metadata md; /* Re-read metadata. */ error = g_concat_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_CONCAT_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } } cp->private = disk; disk->d_consumer = cp; disk->d_softc = sc; disk->d_start = 0; /* not yet */ disk->d_end = 0; /* not yet */ disk->d_removed = 0; G_CONCAT_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_concat_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_concat_create(struct g_class *mp, const struct g_concat_metadata *md, u_int type) { struct g_concat_softc *sc; struct g_geom *gp; u_int no; G_CONCAT_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* At least one disk is required. */ if (md->md_all < 1) return (NULL); /* Check for duplicate unit. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) { G_CONCAT_DEBUG(0, "Device %s already configured.", gp->name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_CONCAT, M_WAITOK | M_ZERO); gp->start = g_concat_start; gp->spoiled = g_concat_orphan; gp->orphan = g_concat_orphan; gp->access = g_concat_access; gp->dumpconf = g_concat_dumpconf; sc->sc_id = md->md_id; sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_concat_disk) * sc->sc_ndisks, M_CONCAT, M_WAITOK | M_ZERO); for (no = 0; no < sc->sc_ndisks; no++) sc->sc_disks[no].d_consumer = NULL; sc->sc_type = type; mtx_init(&sc->sc_lock, "gconcat lock", NULL, MTX_DEF); gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; G_CONCAT_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); return (gp); } static int g_concat_destroy(struct g_concat_softc *sc, boolean_t force) { struct g_provider *pp; struct g_consumer *cp, *cp1; struct g_geom *gp; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_CONCAT_DEBUG(0, "Device %s is still open, so it " "can't be removed definitively.", pp->name); } else { G_CONCAT_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } gp = sc->sc_geom; LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { g_concat_remove_disk(cp->private); if (cp1 == NULL) return (0); /* Recursion happened.
*/ } if (!LIST_EMPTY(&gp->consumer)) return (EINPROGRESS); gp->softc = NULL; KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)", gp->name)); free(sc->sc_disks, M_CONCAT); mtx_destroy(&sc->sc_lock); free(sc, M_CONCAT); G_CONCAT_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); return (0); } static int g_concat_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_concat_softc *sc; sc = gp->softc; return (g_concat_destroy(sc, 0)); } static struct g_geom * g_concat_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_concat_metadata md; struct g_concat_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); G_CONCAT_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "concat:taste"); gp->start = g_concat_start; gp->access = g_concat_access; gp->orphan = g_concat_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_concat_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0) return (NULL); if (md.md_version > G_CONCAT_VERSION) { printf("geom_concat.ko module is too old to handle %s.\n", pp->name); return (NULL); } /* * Backward compatibility: */ /* There was no md_provider field in earlier versions of metadata. */ if (md.md_version < 3) bzero(md.md_provider, sizeof(md.md_provider)); /* There was no md_provsize field in earlier versions of metadata. */ if (md.md_version < 4) md.md_provsize = pp->mediasize; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != pp->mediasize) return (NULL); /* * Let's check if device already exists. 
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_CONCAT_TYPE_AUTOMATIC) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) continue; break; } if (gp != NULL) { G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_concat_add_disk(sc, pp, md.md_no); if (error != 0) { G_CONCAT_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); return (NULL); } } else { gp = g_concat_create(mp, &md, G_CONCAT_TYPE_AUTOMATIC); if (gp == NULL) { G_CONCAT_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_concat_add_disk(sc, pp, md.md_no); if (error != 0) { G_CONCAT_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); g_concat_destroy(sc, 1); return (NULL); } } return (gp); } static void g_concat_ctl_create(struct gctl_req *req, struct g_class *mp) { u_int attached, no; struct g_concat_metadata md; struct g_provider *pp; struct g_concat_softc *sc; struct g_geom *gp; struct sbuf *sb; const char *name; char param[16]; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } strlcpy(md.md_magic, G_CONCAT_MAGIC, sizeof(md.md_magic)); md.md_version = G_CONCAT_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_id = arc4random(); md.md_no = 0; md.md_all = *nargs - 1; bzero(md.md_provider, sizeof(md.md_provider)); /* This field is not important here. 
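*/

/*
 * [Editor's note, not part of the original source] This handler is the
 * kernel side of "gconcat create <name> <prov> ...": a manual device with
 * no on-disk metadata, which therefore does not survive a reboot.  The
 * persistent variant is written by "gconcat label" in userland and is
 * picked up through g_concat_taste() instead.
 */

/*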
*/ md.md_provsize = 0; /* Check all providers are valid */ for (no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", no); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_CONCAT_DEBUG(1, "Disk %s is invalid.", name); gctl_error(req, "Disk %s is invalid.", name); return; } } gp = g_concat_create(mp, &md, G_CONCAT_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't configure %s.", md.md_name); return; } sc = gp->softc; sb = sbuf_new_auto(); sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name); for (attached = 0, no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument.", no); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); KASSERT(pp != NULL, ("Provider %s disappear?!", name)); if (g_concat_add_disk(sc, pp, no - 1) != 0) { G_CONCAT_DEBUG(1, "Disk %u (%s) not attached to %s.", no, pp->name, gp->name); sbuf_printf(sb, " %s", pp->name); continue; } attached++; } sbuf_finish(sb); if (md.md_all != attached) { g_concat_destroy(gp->softc, 1); gctl_error(req, "%s", sbuf_data(sb)); } sbuf_delete(sb); } static struct g_concat_softc * g_concat_find_device(struct g_class *mp, const char *name) { struct g_concat_softc *sc; struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(sc->sc_name, name) == 0) return (sc); } return (NULL); } static void g_concat_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_concat_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_concat_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_concat_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } static void g_concat_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_CONCAT_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_concat_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_concat_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_concat_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_concat_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { struct g_concat_disk *disk; disk = cp->private; if (disk == NULL) return; sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)disk->d_end); sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)disk->d_start); } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s", indent); switch (sc->sc_type) { case G_CONCAT_TYPE_AUTOMATIC: sbuf_printf(sb, "AUTOMATIC"); break; case G_CONCAT_TYPE_MANUAL: sbuf_printf(sb, "MANUAL"); break; default: sbuf_printf(sb, "UNKNOWN"); break; } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%sTotal=%u, Online=%u\n", indent, sc->sc_ndisks, g_concat_nvalid(sc)); sbuf_printf(sb, "%s", indent); if (sc->sc_provider != NULL && sc->sc_provider->error == 0) sbuf_printf(sb, "UP"); else sbuf_printf(sb, "DOWN"); sbuf_printf(sb, "\n"); } } DECLARE_GEOM_CLASS(g_concat_class, g_concat); Index: head/sys/geom/concat/g_concat.h =================================================================== --- head/sys/geom/concat/g_concat.h (revision 326269) +++ head/sys/geom/concat/g_concat.h (revision 326270) @@ -1,127 +1,129 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_CONCAT_H_ #define _G_CONCAT_H_ #include #define G_CONCAT_CLASS_NAME "CONCAT" #define G_CONCAT_MAGIC "GEOM::CONCAT" /* * Version history: * 1 - Initial version number. * 2 - Added 'stop' command to gconcat(8). * 3 - Added md_provider field to metadata and '-h' option to gconcat(8). * 4 - Added md_provsize field to metadata. */ #define G_CONCAT_VERSION 4 #ifdef _KERNEL #define G_CONCAT_TYPE_MANUAL 0 #define G_CONCAT_TYPE_AUTOMATIC 1 #define G_CONCAT_DEBUG(lvl, ...) do { \ if (g_concat_debug >= (lvl)) { \ printf("GEOM_CONCAT"); \ if (g_concat_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_CONCAT_LOGREQ(bp, ...) 
do { \ if (g_concat_debug >= 2) { \ printf("GEOM_CONCAT[2]: "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) struct g_concat_disk { struct g_consumer *d_consumer; struct g_concat_softc *d_softc; off_t d_start; off_t d_end; int d_removed; }; struct g_concat_softc { u_int sc_type; /* provider type */ struct g_geom *sc_geom; struct g_provider *sc_provider; uint32_t sc_id; /* concat unique ID */ struct g_concat_disk *sc_disks; uint16_t sc_ndisks; struct mtx sc_lock; }; #define sc_name sc_geom->name #endif /* _KERNEL */ struct g_concat_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Concat name. */ uint32_t md_id; /* Unique ID. */ uint16_t md_no; /* Disk number. */ uint16_t md_all; /* Number of all disks. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ }; static __inline void concat_metadata_encode(const struct g_concat_metadata *md, u_char *data) { bcopy(md->md_magic, data, sizeof(md->md_magic)); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, sizeof(md->md_name)); le32enc(data + 36, md->md_id); le16enc(data + 40, md->md_no); le16enc(data + 42, md->md_all); bcopy(md->md_provider, data + 44, sizeof(md->md_provider)); le64enc(data + 60, md->md_provsize); } static __inline void concat_metadata_decode(const u_char *data, struct g_concat_metadata *md) { bcopy(data, md->md_magic, sizeof(md->md_magic)); md->md_version = le32dec(data + 16); bcopy(data + 20, md->md_name, sizeof(md->md_name)); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); bcopy(data + 44, md->md_provider, sizeof(md->md_provider)); md->md_provsize = le64dec(data + 60); } #endif /* _G_CONCAT_H_ */ Index: head/sys/geom/eli/g_eli.c =================================================================== --- head/sys/geom/eli/g_eli.c (revision 326269) +++ head/sys/geom/eli/g_eli.c (revision 326270) @@ -1,1320 +1,1322 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2011 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_eli, "GEOM crypto module"); MALLOC_DEFINE(M_ELI, "eli data", "GEOM_ELI Data"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, eli, CTLFLAG_RW, 0, "GEOM_ELI stuff"); static int g_eli_version = G_ELI_VERSION; SYSCTL_INT(_kern_geom_eli, OID_AUTO, version, CTLFLAG_RD, &g_eli_version, 0, "GELI version"); int g_eli_debug = 0; SYSCTL_INT(_kern_geom_eli, OID_AUTO, debug, CTLFLAG_RWTUN, &g_eli_debug, 0, "Debug level"); static u_int g_eli_tries = 3; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, tries, CTLFLAG_RWTUN, &g_eli_tries, 0, "Number of tries for entering the passphrase"); static u_int g_eli_visible_passphrase = GETS_NOECHO; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, visible_passphrase, CTLFLAG_RWTUN, &g_eli_visible_passphrase, 0, "Visibility of passphrase prompt (0 = invisible, 1 = visible, 2 = asterisk)"); u_int g_eli_overwrites = G_ELI_OVERWRITES; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, overwrites, CTLFLAG_RWTUN, &g_eli_overwrites, 0, "Number of times on-disk keys should be overwritten when destroying them"); static u_int g_eli_threads = 0; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, threads, CTLFLAG_RWTUN, &g_eli_threads, 0, "Number of threads doing crypto work"); u_int g_eli_batch = 0; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, batch, CTLFLAG_RWTUN, &g_eli_batch, 0, "Use crypto operations batching"); /* * Passphrase cached during boot, in order to be more user-friendly if * there are multiple providers using the same passphrase. */ static char cached_passphrase[256]; static u_int g_eli_boot_passcache = 1; TUNABLE_INT("kern.geom.eli.boot_passcache", &g_eli_boot_passcache); SYSCTL_UINT(_kern_geom_eli, OID_AUTO, boot_passcache, CTLFLAG_RD, &g_eli_boot_passcache, 0, "Passphrases are cached during boot process for possible reuse"); static void fetch_loader_passphrase(void * dummy) { char * env_passphrase; KASSERT(dynamic_kenv, ("need dynamic kenv")); if ((env_passphrase = kern_getenv("kern.geom.eli.passphrase")) != NULL) { /* Extract passphrase from the environment. */ strlcpy(cached_passphrase, env_passphrase, sizeof(cached_passphrase)); freeenv(env_passphrase); /* Wipe the passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); } } SYSINIT(geli_fetch_loader_passphrase, SI_SUB_KMEM + 1, SI_ORDER_ANY, fetch_loader_passphrase, NULL); static void zero_boot_passcache(void) { explicit_bzero(cached_passphrase, sizeof(cached_passphrase)); } static void zero_geli_intake_keys(void) { struct keybuf *keybuf; int i; if ((keybuf = get_keybuf()) != NULL) { /* Scan the key buffer, clear all GELI keys. 
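*/

/*
 * [Editor's note, not part of the original source] The keybuf scanned here
 * is the loader-to-kernel key intake: the boot loader can hand GELI keys
 * to the kernel as KEYBUF_TYPE_GELI entries, and this routine wipes them
 * (together with the cached passphrase, via zero_intake_passcache() below)
 * once the root file system is mounted, so key material does not linger in
 * memory longer than the boot actually needs it.
 */

/*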
*/ for (i = 0; i < keybuf->kb_nents; i++) { if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) { explicit_bzero(keybuf->kb_ents[i].ke_data, sizeof(keybuf->kb_ents[i].ke_data)); keybuf->kb_ents[i].ke_type = KEYBUF_TYPE_NONE; } } } } static void zero_intake_passcache(void *dummy) { zero_boot_passcache(); zero_geli_intake_keys(); } EVENTHANDLER_DEFINE(mountroot, zero_intake_passcache, NULL, 0); static eventhandler_tag g_eli_pre_sync = NULL; static int g_eli_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_eli_init(struct g_class *mp); static void g_eli_fini(struct g_class *mp); static g_taste_t g_eli_taste; static g_dumpconf_t g_eli_dumpconf; struct g_class g_eli_class = { .name = G_ELI_CLASS_NAME, .version = G_VERSION, .ctlreq = g_eli_config, .taste = g_eli_taste, .destroy_geom = g_eli_destroy_geom, .init = g_eli_init, .fini = g_eli_fini }; /* * Code paths: * BIO_READ: * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ /* * EAGAIN from crypto(9) means that the request was probably moved to another * crypto accelerator or something similar. * The function updates the SID and reruns the operation. */ int g_eli_crypto_rerun(struct cryptop *crp) { struct g_eli_softc *sc; struct g_eli_worker *wr; struct bio *bp; int error; bp = (struct bio *)crp->crp_opaque; sc = bp->bio_to->geom->softc; LIST_FOREACH(wr, &sc->sc_workers, w_next) { if (wr->w_number == bp->bio_pflags) break; } KASSERT(wr != NULL, ("Invalid worker (%u).", bp->bio_pflags)); G_ELI_DEBUG(1, "Rerunning crypto %s request (sid: %ju -> %ju).", bp->bio_cmd == BIO_READ ? "READ" : "WRITE", (uintmax_t)wr->w_sid, (uintmax_t)crp->crp_sid); wr->w_sid = crp->crp_sid; crp->crp_etype = 0; error = crypto_dispatch(crp); if (error == 0) return (0); G_ELI_DEBUG(1, "%s: crypto_dispatch() returned %d.", __func__, error); crp->crp_etype = error; return (error); } /* * The function is called after reading encrypted data from the provider. * * g_eli_start -> g_eli_crypto_read -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver */ void g_eli_read_done(struct bio *bp) { struct g_eli_softc *sc; struct bio *pbp; G_ELI_LOGREQ(2, bp, "Request done."); pbp = bp->bio_parent; if (pbp->bio_error == 0 && bp->bio_error != 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); /* * Do we have all sectors already? */ pbp->bio_inbed++; if (pbp->bio_inbed < pbp->bio_children) return; sc = pbp->bio_to->geom->softc; if (pbp->bio_error != 0) { G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__, pbp->bio_error); pbp->bio_completed = 0; if (pbp->bio_driver2 != NULL) { free(pbp->bio_driver2, M_ELI); pbp->bio_driver2 = NULL; } g_io_deliver(pbp, pbp->bio_error); atomic_subtract_int(&sc->sc_inflight, 1); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, pbp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } /* * The function is called after we encrypt and write data. * * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> G_ELI_WRITE_DONE -> g_io_deliver */ void g_eli_write_done(struct bio *bp) { struct g_eli_softc *sc; struct bio *pbp; G_ELI_LOGREQ(2, bp, "Request done."); pbp = bp->bio_parent; if (pbp->bio_error == 0 && bp->bio_error != 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); /* * Do we have all sectors already?
*/ pbp->bio_inbed++; if (pbp->bio_inbed < pbp->bio_children) return; free(pbp->bio_driver2, M_ELI); pbp->bio_driver2 = NULL; if (pbp->bio_error != 0) { G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__, pbp->bio_error); pbp->bio_completed = 0; } else pbp->bio_completed = pbp->bio_length; /* * Write is finished, send it up. */ sc = pbp->bio_to->geom->softc; g_io_deliver(pbp, pbp->bio_error); atomic_subtract_int(&sc->sc_inflight, 1); } /* * This function should never be called; it exists only because GEOM requires * an ->orphan() method to be set for every geom. */ static void g_eli_orphan_spoil_assert(struct g_consumer *cp) { panic("Function %s() called for %s.", __func__, cp->geom->name); } static void g_eli_orphan(struct g_consumer *cp) { struct g_eli_softc *sc; g_topology_assert(); sc = cp->geom->softc; if (sc == NULL) return; g_eli_destroy(sc, TRUE); } /* * BIO_READ: * G_ELI_START -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ static void g_eli_start(struct bio *bp) { struct g_eli_softc *sc; struct g_consumer *cp; struct bio *cbp; sc = bp->bio_to->geom->softc; KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_ELI_LOGREQ(2, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_GETATTR: case BIO_FLUSH: case BIO_ZONE: break; case BIO_DELETE: /* * If the user hasn't set the NODELETE flag, we just pass * the request down the stack and let the layers beneath us do * (or not) whatever they do with it. If the flag is set, we * reject the request. A possible extension would be an * additional flag to treat it as a hint to shred the data * with [multiple?] overwrites.
*/ if (!(sc->sc_flags & G_ELI_FLAG_NODELETE)) break; default: g_io_deliver(bp, EOPNOTSUPP); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } bp->bio_driver1 = cbp; bp->bio_pflags = G_ELI_NEW_BIO; switch (bp->bio_cmd) { case BIO_READ: if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) { g_eli_crypto_read(sc, bp, 0); break; } /* FALLTHROUGH */ case BIO_WRITE: mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); break; case BIO_GETATTR: case BIO_FLUSH: case BIO_DELETE: case BIO_ZONE: cbp->bio_done = g_std_done; cp = LIST_FIRST(&sc->sc_geom->consumer); cbp->bio_to = cp->provider; G_ELI_LOGREQ(2, cbp, "Sending request."); g_io_request(cbp, cp); break; } } static int g_eli_newsession(struct g_eli_worker *wr) { struct g_eli_softc *sc; struct cryptoini crie, cria; int error; sc = wr->w_softc; bzero(&crie, sizeof(crie)); crie.cri_alg = sc->sc_ealgo; crie.cri_klen = sc->sc_ekeylen; if (sc->sc_ealgo == CRYPTO_AES_XTS) crie.cri_klen <<= 1; if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0) { crie.cri_key = g_eli_key_hold(sc, 0, LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize); } else { crie.cri_key = sc->sc_ekey; } if (sc->sc_flags & G_ELI_FLAG_AUTH) { bzero(&cria, sizeof(cria)); cria.cri_alg = sc->sc_aalgo; cria.cri_klen = sc->sc_akeylen; cria.cri_key = sc->sc_akey; crie.cri_next = &cria; } switch (sc->sc_crypto) { case G_ELI_CRYPTO_SW: error = crypto_newsession(&wr->w_sid, &crie, CRYPTOCAP_F_SOFTWARE); break; case G_ELI_CRYPTO_HW: error = crypto_newsession(&wr->w_sid, &crie, CRYPTOCAP_F_HARDWARE); break; case G_ELI_CRYPTO_UNKNOWN: error = crypto_newsession(&wr->w_sid, &crie, CRYPTOCAP_F_HARDWARE); if (error == 0) { mtx_lock(&sc->sc_queue_mtx); if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN) sc->sc_crypto = G_ELI_CRYPTO_HW; mtx_unlock(&sc->sc_queue_mtx); } else { error = crypto_newsession(&wr->w_sid, &crie, CRYPTOCAP_F_SOFTWARE); mtx_lock(&sc->sc_queue_mtx); if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN) sc->sc_crypto = G_ELI_CRYPTO_SW; mtx_unlock(&sc->sc_queue_mtx); } break; default: panic("%s: invalid condition", __func__); } if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0) g_eli_key_drop(sc, crie.cri_key); return (error); } static void g_eli_freesession(struct g_eli_worker *wr) { crypto_freesession(wr->w_sid); } static void g_eli_cancel(struct g_eli_softc *sc) { struct bio *bp; mtx_assert(&sc->sc_queue_mtx, MA_OWNED); while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) { KASSERT(bp->bio_pflags == G_ELI_NEW_BIO, ("Not new bio when canceling (bp=%p).", bp)); g_io_deliver(bp, ENXIO); } } static struct bio * g_eli_takefirst(struct g_eli_softc *sc) { struct bio *bp; mtx_assert(&sc->sc_queue_mtx, MA_OWNED); if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND)) return (bioq_takefirst(&sc->sc_queue)); /* * Device suspended, so we skip new I/O requests. */ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_pflags != G_ELI_NEW_BIO) break; } if (bp != NULL) bioq_remove(&sc->sc_queue, bp); return (bp); } /* * This is the main function for kernel worker thread when we don't have * hardware acceleration and we have to do cryptography in software. * Dedicated thread is needed, so we don't slow down g_up/g_down GEOM * threads with crypto work. */ static void g_eli_worker(void *arg) { struct g_eli_softc *sc; struct g_eli_worker *wr; struct bio *bp; int error; wr = arg; sc = wr->w_softc; #ifdef EARLY_AP_STARTUP MPASS(!sc->sc_cpubind || smp_started); #elif defined(SMP) /* Before sched_bind() to a CPU, wait for all CPUs to go on-line. 
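 */

/*
 * (Two details of g_eli_newsession() above are easy to miss: cri_klen
 * is expressed in bits and is doubled for CRYPTO_AES_XTS, since XTS
 * consumes two keys of the nominal size; and in the
 * G_ELI_CRYPTO_UNKNOWN case the driver probes CRYPTOCAP_F_HARDWARE
 * first, falls back to CRYPTOCAP_F_SOFTWARE, and latches the result in
 * sc_crypto under sc_queue_mtx, so all workers converge on the same
 * backend.)
 */

/*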
*/ if (sc->sc_cpubind) { while (!smp_started) tsleep(wr, 0, "geli:smp", hz / 4); } #endif thread_lock(curthread); sched_prio(curthread, PUSER); if (sc->sc_cpubind) sched_bind(curthread, wr->w_number % mp_ncpus); thread_unlock(curthread); G_ELI_DEBUG(1, "Thread %s started.", curthread->td_proc->p_comm); for (;;) { mtx_lock(&sc->sc_queue_mtx); again: bp = g_eli_takefirst(sc); if (bp == NULL) { if (sc->sc_flags & G_ELI_FLAG_DESTROY) { g_eli_cancel(sc); LIST_REMOVE(wr, w_next); g_eli_freesession(wr); free(wr, M_ELI); G_ELI_DEBUG(1, "Thread %s exiting.", curthread->td_proc->p_comm); wakeup(&sc->sc_workers); mtx_unlock(&sc->sc_queue_mtx); kproc_exit(0); } while (sc->sc_flags & G_ELI_FLAG_SUSPEND) { if (sc->sc_inflight > 0) { G_ELI_DEBUG(0, "inflight=%d", sc->sc_inflight); /* * We still have inflight BIOs, so * sleep and retry. */ msleep(sc, &sc->sc_queue_mtx, PRIBIO, "geli:inf", hz / 5); goto again; } /* * Suspend requested, mark the worker as * suspended and go to sleep. */ if (wr->w_active) { g_eli_freesession(wr); wr->w_active = FALSE; } wakeup(&sc->sc_workers); msleep(sc, &sc->sc_queue_mtx, PRIBIO, "geli:suspend", 0); if (!wr->w_active && !(sc->sc_flags & G_ELI_FLAG_SUSPEND)) { error = g_eli_newsession(wr); KASSERT(error == 0, ("g_eli_newsession() failed on resume (error=%d)", error)); wr->w_active = TRUE; } goto again; } msleep(sc, &sc->sc_queue_mtx, PDROP, "geli:w", 0); continue; } if (bp->bio_pflags == G_ELI_NEW_BIO) atomic_add_int(&sc->sc_inflight, 1); mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_pflags == G_ELI_NEW_BIO) { bp->bio_pflags = 0; if (sc->sc_flags & G_ELI_FLAG_AUTH) { if (bp->bio_cmd == BIO_READ) g_eli_auth_read(sc, bp); else g_eli_auth_run(wr, bp); } else { if (bp->bio_cmd == BIO_READ) g_eli_crypto_read(sc, bp, 1); else g_eli_crypto_run(wr, bp); } } else { if (sc->sc_flags & G_ELI_FLAG_AUTH) g_eli_auth_run(wr, bp); else g_eli_crypto_run(wr, bp); } } } int g_eli_read_metadata(struct g_class *mp, struct g_provider *pp, struct g_eli_metadata *md) { struct g_geom *gp; struct g_consumer *cp; u_char *buf = NULL; int error; g_topology_assert(); gp = g_new_geomf(mp, "eli:taste"); gp->start = g_eli_start; gp->access = g_std_access; /* * g_eli_read_metadata() is always called from the event thread. * Our geom is created and destroyed in the same event, so no orphan * or spoil event can occur in the meantime. */ gp->orphan = g_eli_orphan_spoil_assert; gp->spoiled = g_eli_orphan_spoil_assert; cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) goto end; error = g_access(cp, 1, 0, 0); if (error != 0) goto end; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); if (buf == NULL) goto end; error = eli_metadata_decode(buf, md); if (error != 0) goto end; /* Metadata was read and decoded successfully. */ end: if (buf != NULL) g_free(buf); if (cp->provider != NULL) { if (cp->acr == 1) g_access(cp, -1, 0, 0); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); return (error); } /* * This function is called after the last close of the provider, when the user * has requested that the device be detached in that situation.
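 */

/*
 * A user-space illustration of the metadata read performed by
 * g_eli_read_metadata() above (a sketch: read_meta_sector() is
 * hypothetical, but DIOCGMEDIASIZE and DIOCGSECTORSIZE are the stock
 * disk(4) ioctls).  The resulting buffer could then be handed to
 * eli_metadata_decode() from g_eli.h.
 */
#include <sys/types.h>
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>

static int
read_meta_sector(const char *dev, unsigned char *buf, size_t bufsize)
{
	off_t mediasize;
	u_int secsize;
	int fd;

	if ((fd = open(dev, O_RDONLY)) == -1)
		return (-1);
	/* GELI metadata lives in the very last sector of the provider. */
	if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) == -1 ||
	    ioctl(fd, DIOCGSECTORSIZE, &secsize) == -1 ||
	    (size_t)secsize > bufsize ||
	    pread(fd, buf, secsize, mediasize - secsize) !=
	    (ssize_t)secsize) {
		close(fd);
		return (-1);
	}
	close(fd);
	return (0);
}

/*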
*/ static void g_eli_last_close(void *arg, int flags __unused) { struct g_geom *gp; char gpname[64]; int error; g_topology_assert(); gp = arg; strlcpy(gpname, gp->name, sizeof(gpname)); error = g_eli_destroy(gp->softc, TRUE); KASSERT(error == 0, ("Cannot detach %s on last close (error=%d).", gpname, error)); G_ELI_DEBUG(0, "Detached %s on last close.", gpname); } int g_eli_access(struct g_provider *pp, int dr, int dw, int de) { struct g_eli_softc *sc; struct g_geom *gp; gp = pp->geom; sc = gp->softc; if (dw > 0) { if (sc->sc_flags & G_ELI_FLAG_RO) { /* Deny write attempts. */ return (EROFS); } /* Someone is opening us for write, we need to remember that. */ sc->sc_flags |= G_ELI_FLAG_WOPEN; return (0); } /* Is this the last close? */ if (pp->acr + dr > 0 || pp->acw + dw > 0 || pp->ace + de > 0) return (0); /* * Automatically detach on last close if requested. */ if ((sc->sc_flags & G_ELI_FLAG_RW_DETACH) || (sc->sc_flags & G_ELI_FLAG_WOPEN)) { g_post_event(g_eli_last_close, gp, M_WAITOK, NULL); } return (0); } static int g_eli_cpu_is_disabled(int cpu) { #ifdef SMP return (CPU_ISSET(cpu, &hlt_cpus_mask)); #else return (0); #endif } struct g_geom * g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, const struct g_eli_metadata *md, const u_char *mkey, int nkey) { struct g_eli_softc *sc; struct g_eli_worker *wr; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; u_int i, threads; int error; G_ELI_DEBUG(1, "Creating device %s%s.", bpp->name, G_ELI_SUFFIX); gp = g_new_geomf(mp, "%s%s", bpp->name, G_ELI_SUFFIX); sc = malloc(sizeof(*sc), M_ELI, M_WAITOK | M_ZERO); gp->start = g_eli_start; /* * Spoiling can happen even though we have the provider open * exclusively, e.g. through media change events. */ gp->spoiled = g_eli_orphan; gp->orphan = g_eli_orphan; gp->dumpconf = g_eli_dumpconf; /* * If detach-on-last-close feature is not enabled and we don't operate * on read-only provider, we can simply use g_std_access(). */ if (md->md_flags & (G_ELI_FLAG_WO_DETACH | G_ELI_FLAG_RO)) gp->access = g_eli_access; else gp->access = g_std_access; eli_metadata_softc(sc, md, bpp->sectorsize, bpp->mediasize); sc->sc_nkey = nkey; gp->softc = sc; sc->sc_geom = gp; bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "geli:queue", NULL, MTX_DEF); mtx_init(&sc->sc_ekeys_lock, "geli:ekeys", NULL, MTX_DEF); pp = NULL; cp = g_new_consumer(gp); error = g_attach(cp, bpp); if (error != 0) { if (req != NULL) { gctl_error(req, "Cannot attach to %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot attach to %s (error=%d).", bpp->name, error); } goto failed; } /* * Keep provider open all the time, so we can run critical tasks, * like Master Keys deletion, without wondering if we can open * provider or not. * We don't open provider for writing only when user requested read-only * access. */ if (sc->sc_flags & G_ELI_FLAG_RO) error = g_access(cp, 1, 0, 1); else error = g_access(cp, 1, 1, 1); if (error != 0) { if (req != NULL) { gctl_error(req, "Cannot access %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot access %s (error=%d).", bpp->name, error); } goto failed; } /* * Remember the keys in our softc structure. 
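 * (A note on the access arithmetic in g_eli_access() above:
 * pp->acr/acw/ace are the provider's current read/write/exclusive
 * reference counts and dr/dw/de are the deltas being requested, so
 * "last close" is exactly acr+dr == 0 && acw+dw == 0 && ace+de == 0;
 * the RW_DETACH and WOPEN flags then decide whether that event posts
 * g_eli_last_close().)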
g_eli_mkey_propagate(sc, mkey); LIST_INIT(&sc->sc_workers); threads = g_eli_threads; if (threads == 0) threads = mp_ncpus; sc->sc_cpubind = (mp_ncpus > 1 && threads == mp_ncpus); for (i = 0; i < threads; i++) { if (g_eli_cpu_is_disabled(i)) { G_ELI_DEBUG(1, "%s: CPU %u disabled, skipping.", bpp->name, i); continue; } wr = malloc(sizeof(*wr), M_ELI, M_WAITOK | M_ZERO); wr->w_softc = sc; wr->w_number = i; wr->w_active = TRUE; error = g_eli_newsession(wr); if (error != 0) { free(wr, M_ELI); if (req != NULL) { gctl_error(req, "Cannot set up crypto session " "for %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot set up crypto session " "for %s (error=%d).", bpp->name, error); } goto failed; } error = kproc_create(g_eli_worker, wr, &wr->w_proc, 0, 0, "g_eli[%u] %s", i, bpp->name); if (error != 0) { g_eli_freesession(wr); free(wr, M_ELI); if (req != NULL) { gctl_error(req, "Cannot create kernel thread " "for %s (error=%d).", bpp->name, error); } else { G_ELI_DEBUG(1, "Cannot create kernel thread " "for %s (error=%d).", bpp->name, error); } goto failed; } LIST_INSERT_HEAD(&sc->sc_workers, wr, w_next); } /* * Create the decrypted provider. */ pp = g_new_providerf(gp, "%s%s", bpp->name, G_ELI_SUFFIX); pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; g_error_provider(pp, 0); G_ELI_DEBUG(0, "Device %s created.", pp->name); G_ELI_DEBUG(0, "Encryption: %s %u", g_eli_algo2str(sc->sc_ealgo), sc->sc_ekeylen); if (sc->sc_flags & G_ELI_FLAG_AUTH) G_ELI_DEBUG(0, " Integrity: %s", g_eli_algo2str(sc->sc_aalgo)); G_ELI_DEBUG(0, " Crypto: %s", sc->sc_crypto == G_ELI_CRYPTO_SW ? "software" : "hardware"); return (gp); failed: mtx_lock(&sc->sc_queue_mtx); sc->sc_flags |= G_ELI_FLAG_DESTROY; wakeup(sc); /* * Wait for the kernel threads to self-destruct.
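 * (The handshake here: setting G_ELI_FLAG_DESTROY and calling
 * wakeup(sc) above makes each worker cancel any queued bios, remove
 * itself from sc_workers, call wakeup(&sc->sc_workers) and exit; the
 * msleep() loop below therefore terminates once the list drains.)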
*/ while (!LIST_EMPTY(&sc->sc_workers)) { msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO, "geli:destroy", 0); } mtx_destroy(&sc->sc_queue_mtx); if (cp->provider != NULL) { if (cp->acr == 1) g_access(cp, -1, -1, -1); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); g_eli_key_destroy(sc); bzero(sc, sizeof(*sc)); free(sc, M_ELI); return (NULL); } int g_eli_destroy(struct g_eli_softc *sc, boolean_t force) { struct g_geom *gp; struct g_provider *pp; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_ELI_DEBUG(1, "Device %s is still open, so it " "cannot be definitely removed.", pp->name); sc->sc_flags |= G_ELI_FLAG_RW_DETACH; gp->access = g_eli_access; g_wither_provider(pp, ENXIO); return (EBUSY); } else { G_ELI_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } mtx_lock(&sc->sc_queue_mtx); sc->sc_flags |= G_ELI_FLAG_DESTROY; wakeup(sc); while (!LIST_EMPTY(&sc->sc_workers)) { msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO, "geli:destroy", 0); } mtx_destroy(&sc->sc_queue_mtx); gp->softc = NULL; g_eli_key_destroy(sc); bzero(sc, sizeof(*sc)); free(sc, M_ELI); if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)) G_ELI_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom_close(gp, ENXIO); return (0); } static int g_eli_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_eli_softc *sc; sc = gp->softc; return (g_eli_destroy(sc, FALSE)); } static int g_eli_keyfiles_load(struct hmac_ctx *ctx, const char *provider) { u_char *keyfile, *data; char *file, name[64]; size_t size; int i; for (i = 0; ; i++) { snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i); keyfile = preload_search_by_type(name); if (keyfile == NULL && i == 0) { /* * If there is only one keyfile, allow simpler name. */ snprintf(name, sizeof(name), "%s:geli_keyfile", provider); keyfile = preload_search_by_type(name); } if (keyfile == NULL) return (i); /* Return number of loaded keyfiles. */ data = preload_fetch_addr(keyfile); if (data == NULL) { G_ELI_DEBUG(0, "Cannot find key file data for %s.", name); return (0); } size = preload_fetch_size(keyfile); if (size == 0) { G_ELI_DEBUG(0, "Cannot find key file size for %s.", name); return (0); } file = preload_search_info(keyfile, MODINFO_NAME); if (file == NULL) { G_ELI_DEBUG(0, "Cannot find key file name for %s.", name); return (0); } G_ELI_DEBUG(1, "Loaded keyfile %s for %s (type: %s).", file, provider, name); g_eli_crypto_hmac_update(ctx, data, size); } } static void g_eli_keyfiles_clear(const char *provider) { u_char *keyfile, *data; char name[64]; size_t size; int i; for (i = 0; ; i++) { snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i); keyfile = preload_search_by_type(name); if (keyfile == NULL) return; data = preload_fetch_addr(keyfile); size = preload_fetch_size(keyfile); if (data != NULL && size != 0) bzero(data, size); } } /* * Tasting is only made on boot. * We detect providers which should be attached before root is mounted. 
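 * For reference, the preloaded keyfiles consumed by
 * g_eli_keyfiles_load() above are staged by loader(8); a loader.conf(5)
 * fragment along these lines (names illustrative, see geli(8) for the
 * authoritative syntax) produces MODINFO entries of type
 * "da0:geli_keyfile0":
 *
 *	geli_da0_keyfile0_load="YES"
 *	geli_da0_keyfile0_type="da0:geli_keyfile0"
 *	geli_da0_keyfile0_name="/boot/keys/da0.key"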
*/ static struct g_geom * g_eli_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_eli_metadata md; struct g_geom *gp; struct hmac_ctx ctx; char passphrase[256]; u_char key[G_ELI_USERKEYLEN], mkey[G_ELI_DATAIVKEYLEN]; u_int i, nkey, nkeyfiles, tries, showpass; int error; struct keybuf *keybuf; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); if (root_mounted() || g_eli_tries == 0) return (NULL); G_ELI_DEBUG(3, "Tasting %s.", pp->name); error = g_eli_read_metadata(mp, pp, &md); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_ELI_MAGIC) != 0) return (NULL); if (md.md_version > G_ELI_VERSION) { printf("geom_eli.ko module is too old to handle %s.\n", pp->name); return (NULL); } if (md.md_provsize != pp->mediasize) return (NULL); /* Should we attach it on boot? */ if (!(md.md_flags & G_ELI_FLAG_BOOT)) return (NULL); if (md.md_keys == 0x00) { G_ELI_DEBUG(0, "No valid keys on %s.", pp->name); return (NULL); } if (md.md_iterations == -1) { /* If there is no passphrase, we try only once. */ tries = 1; } else { /* Ask for the passphrase no more than g_eli_tries times. */ tries = g_eli_tries; } if ((keybuf = get_keybuf()) != NULL) { /* Scan the key buffer, try all GELI keys. */ for (i = 0; i < keybuf->kb_nents; i++) { if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) { memcpy(key, keybuf->kb_ents[i].ke_data, sizeof(key)); if (g_eli_mkey_decrypt(&md, key, mkey, &nkey) == 0) { explicit_bzero(key, sizeof(key)); goto have_key; } } } } for (i = 0; i <= tries; i++) { g_eli_crypto_hmac_init(&ctx, NULL, 0); /* * Load all key files. */ nkeyfiles = g_eli_keyfiles_load(&ctx, pp->name); if (nkeyfiles == 0 && md.md_iterations == -1) { /* * No key files and no passphrase, something is * definitely wrong here. * geli(8) doesn't allow for such a situation, so assume * that there really was no passphrase and that the * key files are not properly defined in loader.conf. */ G_ELI_DEBUG(0, "Found no key files in loader.conf for %s.", pp->name); return (NULL); } /* Ask for the passphrase if defined. */ if (md.md_iterations >= 0) { /* Try first with cached passphrase. */ if (i == 0) { if (!g_eli_boot_passcache) continue; memcpy(passphrase, cached_passphrase, sizeof(passphrase)); } else { printf("Enter passphrase for %s: ", pp->name); showpass = g_eli_visible_passphrase; if ((md.md_flags & G_ELI_FLAG_GELIDISPLAYPASS) != 0) showpass = GETS_ECHOPASS; cngets(passphrase, sizeof(passphrase), showpass); memcpy(cached_passphrase, passphrase, sizeof(passphrase)); } } /* * Prepare Derived-Key from the user passphrase. */ if (md.md_iterations == 0) { g_eli_crypto_hmac_update(&ctx, md.md_salt, sizeof(md.md_salt)); g_eli_crypto_hmac_update(&ctx, passphrase, strlen(passphrase)); explicit_bzero(passphrase, sizeof(passphrase)); } else if (md.md_iterations > 0) { u_char dkey[G_ELI_USERKEYLEN]; pkcs5v2_genkey(dkey, sizeof(dkey), md.md_salt, sizeof(md.md_salt), passphrase, md.md_iterations); bzero(passphrase, sizeof(passphrase)); g_eli_crypto_hmac_update(&ctx, dkey, sizeof(dkey)); explicit_bzero(dkey, sizeof(dkey)); } g_eli_crypto_hmac_final(&ctx, key, 0); /* * Decrypt Master-Key. */ error = g_eli_mkey_decrypt(&md, key, mkey, &nkey); bzero(key, sizeof(key)); if (error == -1) { if (i == tries) { G_ELI_DEBUG(0, "Wrong key for %s. No tries left.", pp->name); g_eli_keyfiles_clear(pp->name); return (NULL); } if (i > 0) { G_ELI_DEBUG(0, "Wrong key for %s. Tries left: %u.", pp->name, tries - i); } /* Try again.
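 * (g_eli_mkey_decrypt() uses a three-way return, visible in the
 * branches here and below: -1 means the derived user key matched no
 * master-key slot, a positive value is a hard errno-style failure, and
 * 0 is success with *nkeyp set to the slot, 0 or 1, that decrypted.)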
*/ continue; } else if (error > 0) { G_ELI_DEBUG(0, "Cannot decrypt Master Key for %s (error=%d).", pp->name, error); g_eli_keyfiles_clear(pp->name); return (NULL); } g_eli_keyfiles_clear(pp->name); G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name); break; } have_key: /* * We have correct key, let's attach provider. */ gp = g_eli_create(NULL, mp, pp, &md, mkey, nkey); bzero(mkey, sizeof(mkey)); bzero(&md, sizeof(md)); if (gp == NULL) { G_ELI_DEBUG(0, "Cannot create device %s%s.", pp->name, G_ELI_SUFFIX); return (NULL); } return (gp); } static void g_eli_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_eli_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL || cp != NULL) return; /* Nothing here. */ sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)sc->sc_ekeys_total); sbuf_printf(sb, "%s%ju\n", indent, (uintmax_t)sc->sc_ekeys_allocated); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if (sc->sc_flags & (flag)) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_ELI_FLAG_SUSPEND, "SUSPEND"); ADD_FLAG(G_ELI_FLAG_SINGLE_KEY, "SINGLE-KEY"); ADD_FLAG(G_ELI_FLAG_NATIVE_BYTE_ORDER, "NATIVE-BYTE-ORDER"); ADD_FLAG(G_ELI_FLAG_ONETIME, "ONETIME"); ADD_FLAG(G_ELI_FLAG_BOOT, "BOOT"); ADD_FLAG(G_ELI_FLAG_WO_DETACH, "W-DETACH"); ADD_FLAG(G_ELI_FLAG_RW_DETACH, "RW-DETACH"); ADD_FLAG(G_ELI_FLAG_AUTH, "AUTH"); ADD_FLAG(G_ELI_FLAG_WOPEN, "W-OPEN"); ADD_FLAG(G_ELI_FLAG_DESTROY, "DESTROY"); ADD_FLAG(G_ELI_FLAG_RO, "READ-ONLY"); ADD_FLAG(G_ELI_FLAG_NODELETE, "NODELETE"); ADD_FLAG(G_ELI_FLAG_GELIBOOT, "GELIBOOT"); ADD_FLAG(G_ELI_FLAG_GELIDISPLAYPASS, "GELIDISPLAYPASS"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) { sbuf_printf(sb, "%s%u\n", indent, sc->sc_nkey); } sbuf_printf(sb, "%s%u\n", indent, sc->sc_version); sbuf_printf(sb, "%s", indent); switch (sc->sc_crypto) { case G_ELI_CRYPTO_HW: sbuf_printf(sb, "hardware"); break; case G_ELI_CRYPTO_SW: sbuf_printf(sb, "software"); break; default: sbuf_printf(sb, "UNKNOWN"); break; } sbuf_printf(sb, "\n"); if (sc->sc_flags & G_ELI_FLAG_AUTH) { sbuf_printf(sb, "%s%s\n", indent, g_eli_algo2str(sc->sc_aalgo)); } sbuf_printf(sb, "%s%u\n", indent, sc->sc_ekeylen); sbuf_printf(sb, "%s%s\n", indent, g_eli_algo2str(sc->sc_ealgo)); sbuf_printf(sb, "%s%s\n", indent, (sc->sc_flags & G_ELI_FLAG_SUSPEND) ? "SUSPENDED" : "ACTIVE"); } static void g_eli_shutdown_pre_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_provider *pp; struct g_eli_softc *sc; int error; mp = arg; g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { sc = gp->softc; if (sc == NULL) continue; pp = LIST_FIRST(&gp->provider); KASSERT(pp != NULL, ("No provider? gp=%p (%s)", gp, gp->name)); if (pp->acr + pp->acw + pp->ace == 0) error = g_eli_destroy(sc, TRUE); else { sc->sc_flags |= G_ELI_FLAG_RW_DETACH; gp->access = g_eli_access; } } g_topology_unlock(); } static void g_eli_init(struct g_class *mp) { g_eli_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, g_eli_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); if (g_eli_pre_sync == NULL) G_ELI_DEBUG(0, "Warning! 
Cannot register shutdown event."); } static void g_eli_fini(struct g_class *mp) { if (g_eli_pre_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_eli_pre_sync); } DECLARE_GEOM_CLASS(g_eli_class, g_eli); MODULE_DEPEND(g_eli, crypto, 1, 1, 1); Index: head/sys/geom/eli/g_eli.h =================================================================== --- head/sys/geom/eli/g_eli.h (revision 326269) +++ head/sys/geom/eli/g_eli.h (revision 326270) @@ -1,722 +1,724 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2011 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_ELI_H_ #define _G_ELI_H_ #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #include #else #include #include #include #include #endif #include #include #ifndef _OpenSSL_ #include #endif #define G_ELI_CLASS_NAME "ELI" #define G_ELI_MAGIC "GEOM::ELI" #define G_ELI_SUFFIX ".eli" /* * Version history: * 0 - Initial version number. * 1 - Added data authentication support (md_aalgo field and * G_ELI_FLAG_AUTH flag). * 2 - Added G_ELI_FLAG_READONLY. * 3 - Added 'configure' subcommand. * 4 - IV is generated from offset converted to little-endian * (the G_ELI_FLAG_NATIVE_BYTE_ORDER flag will be set for older versions). * 5 - Added multiple encrypton keys and AES-XTS support. * 6 - Fixed usage of multiple keys for authenticated providers (the * G_ELI_FLAG_FIRST_KEY flag will be set for older versions). * 7 - Encryption keys are now generated from the Data Key and not from the * IV Key (the G_ELI_FLAG_ENC_IVKEY flag will be set for older versions). */ #define G_ELI_VERSION_00 0 #define G_ELI_VERSION_01 1 #define G_ELI_VERSION_02 2 #define G_ELI_VERSION_03 3 #define G_ELI_VERSION_04 4 #define G_ELI_VERSION_05 5 #define G_ELI_VERSION_06 6 #define G_ELI_VERSION_07 7 #define G_ELI_VERSION G_ELI_VERSION_07 /* ON DISK FLAGS. */ /* Use random, onetime keys. */ #define G_ELI_FLAG_ONETIME 0x00000001 /* Ask for the passphrase from the kernel, before mounting root. */ #define G_ELI_FLAG_BOOT 0x00000002 /* Detach on last close, if we were open for writing. */ #define G_ELI_FLAG_WO_DETACH 0x00000004 /* Detach on last close. */ #define G_ELI_FLAG_RW_DETACH 0x00000008 /* Provide data authentication. 
*/ #define G_ELI_FLAG_AUTH 0x00000010 /* Provider is read-only, we should deny all write attempts. */ #define G_ELI_FLAG_RO 0x00000020 /* Don't pass through BIO_DELETE requests. */ #define G_ELI_FLAG_NODELETE 0x00000040 /* This GELI supports GELIBoot */ #define G_ELI_FLAG_GELIBOOT 0x00000080 /* Hide passphrase length in GELIboot. */ #define G_ELI_FLAG_GELIDISPLAYPASS 0x00000100 /* RUNTIME FLAGS. */ /* Provider was open for writing. */ #define G_ELI_FLAG_WOPEN 0x00010000 /* Destroy device. */ #define G_ELI_FLAG_DESTROY 0x00020000 /* Provider uses native byte-order for IV generation. */ #define G_ELI_FLAG_NATIVE_BYTE_ORDER 0x00040000 /* Provider uses single encryption key. */ #define G_ELI_FLAG_SINGLE_KEY 0x00080000 /* Device suspended. */ #define G_ELI_FLAG_SUSPEND 0x00100000 /* Provider uses first encryption key. */ #define G_ELI_FLAG_FIRST_KEY 0x00200000 /* Provider uses IV-Key for encryption key generation. */ #define G_ELI_FLAG_ENC_IVKEY 0x00400000 #define G_ELI_NEW_BIO 255 #define SHA512_MDLEN 64 #define G_ELI_AUTH_SECKEYLEN SHA256_DIGEST_LENGTH #define G_ELI_MAXMKEYS 2 #define G_ELI_MAXKEYLEN 64 #define G_ELI_USERKEYLEN G_ELI_MAXKEYLEN #define G_ELI_DATAKEYLEN G_ELI_MAXKEYLEN #define G_ELI_AUTHKEYLEN G_ELI_MAXKEYLEN #define G_ELI_IVKEYLEN G_ELI_MAXKEYLEN #define G_ELI_SALTLEN 64 #define G_ELI_DATAIVKEYLEN (G_ELI_DATAKEYLEN + G_ELI_IVKEYLEN) /* Data-Key, IV-Key, HMAC_SHA512(Derived-Key, Data-Key+IV-Key) */ #define G_ELI_MKEYLEN (G_ELI_DATAIVKEYLEN + SHA512_MDLEN) #define G_ELI_OVERWRITES 5 /* Switch data encryption key every 2^20 blocks. */ #define G_ELI_KEY_SHIFT 20 #define G_ELI_CRYPTO_UNKNOWN 0 #define G_ELI_CRYPTO_HW 1 #define G_ELI_CRYPTO_SW 2 #ifdef _KERNEL #if (MAX_KEY_BYTES < G_ELI_DATAIVKEYLEN) #error "MAX_KEY_BYTES is less than G_ELI_DATAKEYLEN" #endif extern int g_eli_debug; extern u_int g_eli_overwrites; extern u_int g_eli_batch; #define G_ELI_DEBUG(lvl, ...) do { \ if (g_eli_debug >= (lvl)) { \ printf("GEOM_ELI"); \ if (g_eli_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_ELI_LOGREQ(lvl, bp, ...) do { \ if (g_eli_debug >= (lvl)) { \ printf("GEOM_ELI"); \ if (g_eli_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) struct g_eli_worker { struct g_eli_softc *w_softc; struct proc *w_proc; u_int w_number; uint64_t w_sid; boolean_t w_active; LIST_ENTRY(g_eli_worker) w_next; }; #endif /* _KERNEL */ struct g_eli_softc { struct g_geom *sc_geom; u_int sc_version; u_int sc_crypto; uint8_t sc_mkey[G_ELI_DATAIVKEYLEN]; uint8_t sc_ekey[G_ELI_DATAKEYLEN]; TAILQ_HEAD(, g_eli_key) sc_ekeys_queue; RB_HEAD(g_eli_key_tree, g_eli_key) sc_ekeys_tree; struct mtx sc_ekeys_lock; uint64_t sc_ekeys_total; uint64_t sc_ekeys_allocated; u_int sc_ealgo; u_int sc_ekeylen; uint8_t sc_akey[G_ELI_AUTHKEYLEN]; u_int sc_aalgo; u_int sc_akeylen; u_int sc_alen; SHA256_CTX sc_akeyctx; uint8_t sc_ivkey[G_ELI_IVKEYLEN]; SHA256_CTX sc_ivctx; int sc_nkey; uint32_t sc_flags; int sc_inflight; off_t sc_mediasize; size_t sc_sectorsize; u_int sc_bytes_per_sector; u_int sc_data_per_sector; #ifndef _KERNEL int sc_cpubind; #else /* _KERNEL */ boolean_t sc_cpubind; /* Only for software cryptography. */ struct bio_queue_head sc_queue; struct mtx sc_queue_mtx; LIST_HEAD(, g_eli_worker) sc_workers; #endif /* _KERNEL */ }; #define sc_name sc_geom->name #define G_ELI_KEY_MAGIC 0xe11341c struct g_eli_key { /* Key value, must be first in the structure. 
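 * (An aside on the schedule these cached keys implement: with
 * G_ELI_KEY_SHIFT == 20, the data key for a request is, in effect,
 *	gek_keyno = (byte_offset / blocksize) >> G_ELI_KEY_SHIFT,
 * i.e. a fresh key every 2^20 blocks; the authoritative computation
 * lives in g_eli_key.c, which is not part of this diff.)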
*/ uint8_t gek_key[G_ELI_DATAKEYLEN]; /* Magic. */ int gek_magic; /* Key number. */ uint64_t gek_keyno; /* Reference counter. */ int gek_count; /* Keeps keys sorted by most recent use. */ TAILQ_ENTRY(g_eli_key) gek_next; /* Keeps keys sorted by number. */ RB_ENTRY(g_eli_key) gek_link; }; struct g_eli_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ uint32_t md_flags; /* Additional flags. */ uint16_t md_ealgo; /* Encryption algorithm. */ uint16_t md_keylen; /* Key length. */ uint16_t md_aalgo; /* Authentication algorithm. */ uint64_t md_provsize; /* Provider's size. */ uint32_t md_sectorsize; /* Sector size. */ uint8_t md_keys; /* Available keys. */ int32_t md_iterations; /* Number of iterations for PKCS#5v2. */ uint8_t md_salt[G_ELI_SALTLEN]; /* Salt. */ /* Encrypted master key (IV-key, Data-key, HMAC). */ uint8_t md_mkeys[G_ELI_MAXMKEYS * G_ELI_MKEYLEN]; u_char md_hash[16]; /* MD5 hash. */ } __packed; #ifndef _OpenSSL_ static __inline void eli_metadata_encode_v0(struct g_eli_metadata *md, u_char **datap) { u_char *p; p = *datap; le32enc(p, md->md_flags); p += sizeof(md->md_flags); le16enc(p, md->md_ealgo); p += sizeof(md->md_ealgo); le16enc(p, md->md_keylen); p += sizeof(md->md_keylen); le64enc(p, md->md_provsize); p += sizeof(md->md_provsize); le32enc(p, md->md_sectorsize); p += sizeof(md->md_sectorsize); *p = md->md_keys; p += sizeof(md->md_keys); le32enc(p, md->md_iterations); p += sizeof(md->md_iterations); bcopy(md->md_salt, p, sizeof(md->md_salt)); p += sizeof(md->md_salt); bcopy(md->md_mkeys, p, sizeof(md->md_mkeys)); p += sizeof(md->md_mkeys); *datap = p; } static __inline void eli_metadata_encode_v1v2v3v4v5v6v7(struct g_eli_metadata *md, u_char **datap) { u_char *p; p = *datap; le32enc(p, md->md_flags); p += sizeof(md->md_flags); le16enc(p, md->md_ealgo); p += sizeof(md->md_ealgo); le16enc(p, md->md_keylen); p += sizeof(md->md_keylen); le16enc(p, md->md_aalgo); p += sizeof(md->md_aalgo); le64enc(p, md->md_provsize); p += sizeof(md->md_provsize); le32enc(p, md->md_sectorsize); p += sizeof(md->md_sectorsize); *p = md->md_keys; p += sizeof(md->md_keys); le32enc(p, md->md_iterations); p += sizeof(md->md_iterations); bcopy(md->md_salt, p, sizeof(md->md_salt)); p += sizeof(md->md_salt); bcopy(md->md_mkeys, p, sizeof(md->md_mkeys)); p += sizeof(md->md_mkeys); *datap = p; } static __inline void eli_metadata_encode(struct g_eli_metadata *md, u_char *data) { uint32_t hash[4]; MD5_CTX ctx; u_char *p; p = data; bcopy(md->md_magic, p, sizeof(md->md_magic)); p += sizeof(md->md_magic); le32enc(p, md->md_version); p += sizeof(md->md_version); switch (md->md_version) { case G_ELI_VERSION_00: eli_metadata_encode_v0(md, &p); break; case G_ELI_VERSION_01: case G_ELI_VERSION_02: case G_ELI_VERSION_03: case G_ELI_VERSION_04: case G_ELI_VERSION_05: case G_ELI_VERSION_06: case G_ELI_VERSION_07: eli_metadata_encode_v1v2v3v4v5v6v7(md, &p); break; default: #ifdef _KERNEL panic("%s: Unsupported version %u.", __func__, (u_int)md->md_version); #else assert(!"Unsupported metadata version."); #endif } MD5Init(&ctx); MD5Update(&ctx, data, p - data); MD5Final((void *)hash, &ctx); bcopy(hash, md->md_hash, sizeof(md->md_hash)); bcopy(md->md_hash, p, sizeof(md->md_hash)); } static __inline int eli_metadata_decode_v0(const u_char *data, struct g_eli_metadata *md) { uint32_t hash[4]; MD5_CTX ctx; const u_char *p; p = data + sizeof(md->md_magic) + sizeof(md->md_version); md->md_flags = le32dec(p); p += sizeof(md->md_flags); md->md_ealgo = le16dec(p); p += sizeof(md->md_ealgo); 
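/*
 * (Size check for the packed struct g_eli_metadata defined above:
 * 16 magic + 4 version + 4 flags + 2 ealgo + 2 keylen + 2 aalgo +
 * 8 provsize + 4 sectorsize + 1 keys + 4 iterations + 64 salt +
 * 384 mkeys (G_ELI_MAXMKEYS * G_ELI_MKEYLEN) + 16 hash = 511 bytes,
 * so the metadata always fits in the provider's last sector.  Note
 * that version 0 streams, decoded here, omit the 2-byte aalgo field,
 * giving 509 bytes on disk.)
 */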
md->md_keylen = le16dec(p); p += sizeof(md->md_keylen); md->md_provsize = le64dec(p); p += sizeof(md->md_provsize); md->md_sectorsize = le32dec(p); p += sizeof(md->md_sectorsize); md->md_keys = *p; p += sizeof(md->md_keys); md->md_iterations = le32dec(p); p += sizeof(md->md_iterations); bcopy(p, md->md_salt, sizeof(md->md_salt)); p += sizeof(md->md_salt); bcopy(p, md->md_mkeys, sizeof(md->md_mkeys)); p += sizeof(md->md_mkeys); MD5Init(&ctx); MD5Update(&ctx, data, p - data); MD5Final((void *)hash, &ctx); bcopy(hash, md->md_hash, sizeof(md->md_hash)); if (bcmp(md->md_hash, p, 16) != 0) return (EINVAL); return (0); } static __inline int eli_metadata_decode_v1v2v3v4v5v6v7(const u_char *data, struct g_eli_metadata *md) { uint32_t hash[4]; MD5_CTX ctx; const u_char *p; p = data + sizeof(md->md_magic) + sizeof(md->md_version); md->md_flags = le32dec(p); p += sizeof(md->md_flags); md->md_ealgo = le16dec(p); p += sizeof(md->md_ealgo); md->md_keylen = le16dec(p); p += sizeof(md->md_keylen); md->md_aalgo = le16dec(p); p += sizeof(md->md_aalgo); md->md_provsize = le64dec(p); p += sizeof(md->md_provsize); md->md_sectorsize = le32dec(p); p += sizeof(md->md_sectorsize); md->md_keys = *p; p += sizeof(md->md_keys); md->md_iterations = le32dec(p); p += sizeof(md->md_iterations); bcopy(p, md->md_salt, sizeof(md->md_salt)); p += sizeof(md->md_salt); bcopy(p, md->md_mkeys, sizeof(md->md_mkeys)); p += sizeof(md->md_mkeys); MD5Init(&ctx); MD5Update(&ctx, data, p - data); MD5Final((void *)hash, &ctx); bcopy(hash, md->md_hash, sizeof(md->md_hash)); if (bcmp(md->md_hash, p, 16) != 0) return (EINVAL); return (0); } static __inline int eli_metadata_decode(const u_char *data, struct g_eli_metadata *md) { int error; bcopy(data, md->md_magic, sizeof(md->md_magic)); if (strcmp(md->md_magic, G_ELI_MAGIC) != 0) return (EINVAL); md->md_version = le32dec(data + sizeof(md->md_magic)); switch (md->md_version) { case G_ELI_VERSION_00: error = eli_metadata_decode_v0(data, md); break; case G_ELI_VERSION_01: case G_ELI_VERSION_02: case G_ELI_VERSION_03: case G_ELI_VERSION_04: case G_ELI_VERSION_05: case G_ELI_VERSION_06: case G_ELI_VERSION_07: error = eli_metadata_decode_v1v2v3v4v5v6v7(data, md); break; default: error = EOPNOTSUPP; break; } return (error); } #endif /* !_OpenSSL */ static __inline u_int g_eli_str2ealgo(const char *name) { if (strcasecmp("null", name) == 0) return (CRYPTO_NULL_CBC); else if (strcasecmp("null-cbc", name) == 0) return (CRYPTO_NULL_CBC); else if (strcasecmp("aes", name) == 0) return (CRYPTO_AES_XTS); else if (strcasecmp("aes-cbc", name) == 0) return (CRYPTO_AES_CBC); else if (strcasecmp("aes-xts", name) == 0) return (CRYPTO_AES_XTS); else if (strcasecmp("blowfish", name) == 0) return (CRYPTO_BLF_CBC); else if (strcasecmp("blowfish-cbc", name) == 0) return (CRYPTO_BLF_CBC); else if (strcasecmp("camellia", name) == 0) return (CRYPTO_CAMELLIA_CBC); else if (strcasecmp("camellia-cbc", name) == 0) return (CRYPTO_CAMELLIA_CBC); else if (strcasecmp("3des", name) == 0) return (CRYPTO_3DES_CBC); else if (strcasecmp("3des-cbc", name) == 0) return (CRYPTO_3DES_CBC); return (CRYPTO_ALGORITHM_MIN - 1); } static __inline u_int g_eli_str2aalgo(const char *name) { if (strcasecmp("hmac/md5", name) == 0) return (CRYPTO_MD5_HMAC); else if (strcasecmp("hmac/sha1", name) == 0) return (CRYPTO_SHA1_HMAC); else if (strcasecmp("hmac/ripemd160", name) == 0) return (CRYPTO_RIPEMD160_HMAC); else if (strcasecmp("hmac/sha256", name) == 0) return (CRYPTO_SHA2_256_HMAC); else if (strcasecmp("hmac/sha384", name) == 0) return 
(CRYPTO_SHA2_384_HMAC); else if (strcasecmp("hmac/sha512", name) == 0) return (CRYPTO_SHA2_512_HMAC); return (CRYPTO_ALGORITHM_MIN - 1); } static __inline const char * g_eli_algo2str(u_int algo) { switch (algo) { case CRYPTO_NULL_CBC: return ("NULL"); case CRYPTO_AES_CBC: return ("AES-CBC"); case CRYPTO_AES_XTS: return ("AES-XTS"); case CRYPTO_BLF_CBC: return ("Blowfish-CBC"); case CRYPTO_CAMELLIA_CBC: return ("CAMELLIA-CBC"); case CRYPTO_3DES_CBC: return ("3DES-CBC"); case CRYPTO_MD5_HMAC: return ("HMAC/MD5"); case CRYPTO_SHA1_HMAC: return ("HMAC/SHA1"); case CRYPTO_RIPEMD160_HMAC: return ("HMAC/RIPEMD160"); case CRYPTO_SHA2_256_HMAC: return ("HMAC/SHA256"); case CRYPTO_SHA2_384_HMAC: return ("HMAC/SHA384"); case CRYPTO_SHA2_512_HMAC: return ("HMAC/SHA512"); } return ("unknown"); } static __inline void eli_metadata_dump(const struct g_eli_metadata *md) { static const char hex[] = "0123456789abcdef"; char str[sizeof(md->md_mkeys) * 2 + 1]; u_int i; printf(" magic: %s\n", md->md_magic); printf(" version: %u\n", (u_int)md->md_version); printf(" flags: 0x%x\n", (u_int)md->md_flags); printf(" ealgo: %s\n", g_eli_algo2str(md->md_ealgo)); printf(" keylen: %u\n", (u_int)md->md_keylen); if (md->md_flags & G_ELI_FLAG_AUTH) printf(" aalgo: %s\n", g_eli_algo2str(md->md_aalgo)); printf(" provsize: %ju\n", (uintmax_t)md->md_provsize); printf("sectorsize: %u\n", (u_int)md->md_sectorsize); printf(" keys: 0x%02x\n", (u_int)md->md_keys); printf("iterations: %d\n", (int)md->md_iterations); bzero(str, sizeof(str)); for (i = 0; i < sizeof(md->md_salt); i++) { str[i * 2] = hex[md->md_salt[i] >> 4]; str[i * 2 + 1] = hex[md->md_salt[i] & 0x0f]; } printf(" Salt: %s\n", str); bzero(str, sizeof(str)); for (i = 0; i < sizeof(md->md_mkeys); i++) { str[i * 2] = hex[md->md_mkeys[i] >> 4]; str[i * 2 + 1] = hex[md->md_mkeys[i] & 0x0f]; } printf("Master Key: %s\n", str); bzero(str, sizeof(str)); for (i = 0; i < 16; i++) { str[i * 2] = hex[md->md_hash[i] >> 4]; str[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; } printf(" MD5 hash: %s\n", str); } static __inline u_int g_eli_keylen(u_int algo, u_int keylen) { switch (algo) { case CRYPTO_NULL_CBC: if (keylen == 0) keylen = 64 * 8; else { if (keylen > 64 * 8) keylen = 0; } return (keylen); case CRYPTO_AES_CBC: case CRYPTO_CAMELLIA_CBC: switch (keylen) { case 0: return (128); case 128: case 192: case 256: return (keylen); default: return (0); } case CRYPTO_AES_XTS: switch (keylen) { case 0: return (128); case 128: case 256: return (keylen); default: return (0); } case CRYPTO_BLF_CBC: if (keylen == 0) return (128); if (keylen < 128 || keylen > 448) return (0); if ((keylen % 32) != 0) return (0); return (keylen); case CRYPTO_3DES_CBC: if (keylen == 0 || keylen == 192) return (192); return (0); default: return (0); } } static __inline u_int g_eli_hashlen(u_int algo) { switch (algo) { case CRYPTO_MD5_HMAC: return (16); case CRYPTO_SHA1_HMAC: return (20); case CRYPTO_RIPEMD160_HMAC: return (20); case CRYPTO_SHA2_256_HMAC: return (32); case CRYPTO_SHA2_384_HMAC: return (48); case CRYPTO_SHA2_512_HMAC: return (64); } return (0); } static __inline void eli_metadata_softc(struct g_eli_softc *sc, const struct g_eli_metadata *md, u_int sectorsize, off_t mediasize) { sc->sc_version = md->md_version; sc->sc_inflight = 0; sc->sc_crypto = G_ELI_CRYPTO_UNKNOWN; sc->sc_flags = md->md_flags; /* Backward compatibility. 
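 * Further down in this function the AUTH branch also derives the
 * on-disk sector geometry; a worked example: with HMAC/SHA256
 * (32-byte tag) on a 512-byte provider sector, sc_data_per_sector =
 * 512 - 32 = 480, which is already a multiple of 16, and a 4096-byte
 * logical sector then costs sc_bytes_per_sector =
 * ((4096 - 1) / 480 + 1) * 512 = 9 * 512 = 4608 bytes of provider
 * space.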
*/ if (md->md_version < G_ELI_VERSION_04) sc->sc_flags |= G_ELI_FLAG_NATIVE_BYTE_ORDER; if (md->md_version < G_ELI_VERSION_05) sc->sc_flags |= G_ELI_FLAG_SINGLE_KEY; if (md->md_version < G_ELI_VERSION_06 && (sc->sc_flags & G_ELI_FLAG_AUTH) != 0) { sc->sc_flags |= G_ELI_FLAG_FIRST_KEY; } if (md->md_version < G_ELI_VERSION_07) sc->sc_flags |= G_ELI_FLAG_ENC_IVKEY; sc->sc_ealgo = md->md_ealgo; if (sc->sc_flags & G_ELI_FLAG_AUTH) { sc->sc_akeylen = sizeof(sc->sc_akey) * 8; sc->sc_aalgo = md->md_aalgo; sc->sc_alen = g_eli_hashlen(sc->sc_aalgo); sc->sc_data_per_sector = sectorsize - sc->sc_alen; /* * Some hash functions (like SHA1 and RIPEMD160) generate hashes * whose length is not a multiple of 128 bits, but we want the * data length to be a multiple of 128 bits so we can encrypt * without padding. The line below rounds the data length down * to a multiple of 128 bits (16 bytes). */ sc->sc_data_per_sector -= sc->sc_data_per_sector % 16; sc->sc_bytes_per_sector = (md->md_sectorsize - 1) / sc->sc_data_per_sector + 1; sc->sc_bytes_per_sector *= sectorsize; } sc->sc_sectorsize = md->md_sectorsize; sc->sc_mediasize = mediasize; if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) sc->sc_mediasize -= sectorsize; if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) sc->sc_mediasize -= (sc->sc_mediasize % sc->sc_sectorsize); else { sc->sc_mediasize /= sc->sc_bytes_per_sector; sc->sc_mediasize *= sc->sc_sectorsize; } sc->sc_ekeylen = md->md_keylen; } #ifdef _KERNEL int g_eli_read_metadata(struct g_class *mp, struct g_provider *pp, struct g_eli_metadata *md); struct g_geom *g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp, const struct g_eli_metadata *md, const u_char *mkey, int nkey); int g_eli_destroy(struct g_eli_softc *sc, boolean_t force); int g_eli_access(struct g_provider *pp, int dr, int dw, int de); void g_eli_config(struct gctl_req *req, struct g_class *mp, const char *verb); void g_eli_read_done(struct bio *bp); void g_eli_write_done(struct bio *bp); int g_eli_crypto_rerun(struct cryptop *crp); void g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp, boolean_t fromworker); void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp); void g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp); void g_eli_auth_run(struct g_eli_worker *wr, struct bio *bp); #endif void g_eli_crypto_ivgen(struct g_eli_softc *sc, off_t offset, u_char *iv, size_t size); void g_eli_mkey_hmac(unsigned char *mkey, const unsigned char *key); int g_eli_mkey_decrypt(const struct g_eli_metadata *md, const unsigned char *key, unsigned char *mkey, unsigned *nkeyp); int g_eli_mkey_encrypt(unsigned algo, const unsigned char *key, unsigned keylen, unsigned char *mkey); #ifdef _KERNEL void g_eli_mkey_propagate(struct g_eli_softc *sc, const unsigned char *mkey); #endif int g_eli_crypto_encrypt(u_int algo, u_char *data, size_t datasize, const u_char *key, size_t keysize); int g_eli_crypto_decrypt(u_int algo, u_char *data, size_t datasize, const u_char *key, size_t keysize); struct hmac_ctx { SHA512_CTX innerctx; SHA512_CTX outerctx; }; void g_eli_crypto_hmac_init(struct hmac_ctx *ctx, const uint8_t *hkey, size_t hkeylen); void g_eli_crypto_hmac_update(struct hmac_ctx *ctx, const uint8_t *data, size_t datasize); void g_eli_crypto_hmac_final(struct hmac_ctx *ctx, uint8_t *md, size_t mdsize); void g_eli_crypto_hmac(const uint8_t *hkey, size_t hkeysize, const uint8_t *data, size_t datasize, uint8_t *md, size_t mdsize); void g_eli_key_fill(struct g_eli_softc *sc, struct g_eli_key *key, uint64_t keyno); #ifdef _KERNEL void g_eli_key_init(struct
g_eli_softc *sc); void g_eli_key_destroy(struct g_eli_softc *sc); uint8_t *g_eli_key_hold(struct g_eli_softc *sc, off_t offset, size_t blocksize); void g_eli_key_drop(struct g_eli_softc *sc, uint8_t *rawkey); #endif #endif /* !_G_ELI_H_ */ Index: head/sys/geom/eli/g_eli_crypto.c =================================================================== --- head/sys/geom/eli/g_eli_crypto.c (revision 326269) +++ head/sys/geom/eli/g_eli_crypto.c (revision 326270) @@ -1,223 +1,225 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2010 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #include #else #include #include #include #include #include #include #define _OpenSSL_ #endif #include #ifdef _KERNEL MALLOC_DECLARE(M_ELI); static int g_eli_crypto_done(struct cryptop *crp) { crp->crp_opaque = (void *)crp; wakeup(crp); return (0); } static int g_eli_crypto_cipher(u_int algo, int enc, u_char *data, size_t datasize, const u_char *key, size_t keysize) { struct cryptoini cri; struct cryptop *crp; struct cryptodesc *crd; uint64_t sid; u_char *p; int error; KASSERT(algo != CRYPTO_AES_XTS, ("%s: CRYPTO_AES_XTS unexpected here", __func__)); bzero(&cri, sizeof(cri)); cri.cri_alg = algo; cri.cri_key = __DECONST(void *, key); cri.cri_klen = keysize; error = crypto_newsession(&sid, &cri, CRYPTOCAP_F_SOFTWARE); if (error != 0) return (error); p = malloc(sizeof(*crp) + sizeof(*crd), M_ELI, M_NOWAIT | M_ZERO); if (p == NULL) { crypto_freesession(sid); return (ENOMEM); } crp = (struct cryptop *)p; p += sizeof(*crp); crd = (struct cryptodesc *)p; p += sizeof(*crd); crd->crd_skip = 0; crd->crd_len = datasize; crd->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT; if (enc) crd->crd_flags |= CRD_F_ENCRYPT; crd->crd_alg = algo; crd->crd_key = __DECONST(void *, key); crd->crd_klen = keysize; bzero(crd->crd_iv, sizeof(crd->crd_iv)); crd->crd_next = NULL; crp->crp_sid = sid; crp->crp_ilen = datasize; crp->crp_olen = datasize; crp->crp_opaque = NULL; crp->crp_callback = g_eli_crypto_done; crp->crp_buf = (void *)data; crp->crp_flags = CRYPTO_F_CBIFSYNC; crp->crp_desc = crd; error = crypto_dispatch(crp); if (error == 0) { while (crp->crp_opaque == NULL) tsleep(crp, PRIBIO, "geli", hz / 5); error = crp->crp_etype; } free(crp, M_ELI); crypto_freesession(sid); return (error); } #else /* !_KERNEL */ static int g_eli_crypto_cipher(u_int algo, int enc, u_char *data, size_t datasize, const u_char *key, size_t keysize) { EVP_CIPHER_CTX ctx; const EVP_CIPHER *type; u_char iv[keysize]; int outsize; assert(algo != CRYPTO_AES_XTS); switch (algo) { case CRYPTO_NULL_CBC: type = EVP_enc_null(); break; case CRYPTO_AES_CBC: switch (keysize) { case 128: type = EVP_aes_128_cbc(); break; case 192: type = EVP_aes_192_cbc(); break; case 256: type = EVP_aes_256_cbc(); break; default: return (EINVAL); } break; case CRYPTO_BLF_CBC: type = EVP_bf_cbc(); break; #ifndef OPENSSL_NO_CAMELLIA case CRYPTO_CAMELLIA_CBC: switch (keysize) { case 128: type = EVP_camellia_128_cbc(); break; case 192: type = EVP_camellia_192_cbc(); break; case 256: type = EVP_camellia_256_cbc(); break; default: return (EINVAL); } break; #endif case CRYPTO_3DES_CBC: type = EVP_des_ede3_cbc(); break; default: return (EINVAL); } EVP_CIPHER_CTX_init(&ctx); EVP_CipherInit_ex(&ctx, type, NULL, NULL, NULL, enc); EVP_CIPHER_CTX_set_key_length(&ctx, keysize / 8); EVP_CIPHER_CTX_set_padding(&ctx, 0); bzero(iv, sizeof(iv)); EVP_CipherInit_ex(&ctx, NULL, NULL, key, iv, enc); if (EVP_CipherUpdate(&ctx, data, &outsize, data, datasize) == 0) { EVP_CIPHER_CTX_cleanup(&ctx); return (EINVAL); } assert(outsize == (int)datasize); if (EVP_CipherFinal_ex(&ctx, data + outsize, &outsize) == 0) { EVP_CIPHER_CTX_cleanup(&ctx); return (EINVAL); } assert(outsize == 0); EVP_CIPHER_CTX_cleanup(&ctx); return (0); } #endif /* !_KERNEL */ int g_eli_crypto_encrypt(u_int algo, u_char *data, size_t datasize, const u_char *key, size_t keysize) { /* We prefer AES-CBC for metadata protection. 
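 * (Usage note: keysize is given in bits, feeding cri_klen/crd_klen
 * directly, so a typical in-place encryption of a blob with a 256-bit
 * key is
 *	error = g_eli_crypto_encrypt(CRYPTO_AES_CBC, buf, sizeof(buf),
 *	    key, 256);
 * The XTS-to-CBC rewrite below exists because, presumably, XTS is
 * sector-oriented and has no natural meaning for a single standalone
 * blob like the encrypted Master Key.)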
*/ if (algo == CRYPTO_AES_XTS) algo = CRYPTO_AES_CBC; return (g_eli_crypto_cipher(algo, 1, data, datasize, key, keysize)); } int g_eli_crypto_decrypt(u_int algo, u_char *data, size_t datasize, const u_char *key, size_t keysize) { /* We prefer AES-CBC for metadata protection. */ if (algo == CRYPTO_AES_XTS) algo = CRYPTO_AES_CBC; return (g_eli_crypto_cipher(algo, 0, data, datasize, key, keysize)); } Index: head/sys/geom/eli/g_eli_ctl.c =================================================================== --- head/sys/geom/eli/g_eli_ctl.c (revision 326269) +++ head/sys/geom/eli/g_eli_ctl.c (revision 326270) @@ -1,1163 +1,1165 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2011 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DECLARE(M_ELI); static void g_eli_ctl_attach(struct gctl_req *req, struct g_class *mp) { struct g_eli_metadata md; struct g_provider *pp; const char *name; u_char *key, mkey[G_ELI_DATAIVKEYLEN]; int *nargs, *detach, *readonly; int keysize, error; u_int nkey; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } detach = gctl_get_paraml(req, "detach", sizeof(*detach)); if (detach == NULL) { gctl_error(req, "No '%s' argument.", "detach"); return; } readonly = gctl_get_paraml(req, "readonly", sizeof(*readonly)); if (readonly == NULL) { gctl_error(req, "No '%s' argument.", "readonly"); return; } if (*detach && *readonly) { gctl_error(req, "Options -d and -r are mutually exclusive."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", name, error); return; } if (md.md_keys == 0x00) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "No valid keys on %s.", pp->name); return; } key = gctl_get_param(req, "key", &keysize); if (key == NULL || keysize != G_ELI_USERKEYLEN) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "No '%s' argument.", "key"); return; } error = g_eli_mkey_decrypt(&md, key, mkey, &nkey); explicit_bzero(key, keysize); if (error == -1) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Wrong key for %s.", pp->name); return; } else if (error > 0) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Cannot decrypt Master Key for %s (error=%d).", pp->name, error); return; } G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name); if (*detach) md.md_flags |= G_ELI_FLAG_WO_DETACH; if (*readonly) md.md_flags |= G_ELI_FLAG_RO; g_eli_create(req, mp, pp, &md, mkey, nkey); explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(&md, sizeof(md)); } static struct g_eli_softc * g_eli_find_device(struct g_class *mp, const char *prov) { struct g_eli_softc *sc; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; if (strncmp(prov, "/dev/", strlen("/dev/")) == 0) prov += strlen("/dev/"); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; pp = LIST_FIRST(&gp->provider); if (pp != NULL && strcmp(pp->name, prov) == 0) return (sc); cp = LIST_FIRST(&gp->consumer); if (cp != NULL && cp->provider != NULL && strcmp(cp->provider->name, prov) == 0) { return (sc); } } return (NULL); } static void g_eli_ctl_detach(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; int *force, *last, *nargs, error; const char *prov; char param[16]; int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } last = gctl_get_paraml(req, "last", 
sizeof(*last)); if (last == NULL) { gctl_error(req, "No '%s' argument.", "last"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { gctl_error(req, "No 'arg%d' argument.", i); return; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { gctl_error(req, "No such device: %s.", prov); return; } if (*last) { sc->sc_flags |= G_ELI_FLAG_RW_DETACH; sc->sc_geom->access = g_eli_access; } else { error = g_eli_destroy(sc, *force ? TRUE : FALSE); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } } static void g_eli_ctl_onetime(struct gctl_req *req, struct g_class *mp) { struct g_eli_metadata md; struct g_provider *pp; const char *name; intmax_t *keylen, *sectorsize; u_char mkey[G_ELI_DATAIVKEYLEN]; int *nargs, *detach, *notrim; g_topology_assert(); bzero(&md, sizeof(md)); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } strlcpy(md.md_magic, G_ELI_MAGIC, sizeof(md.md_magic)); md.md_version = G_ELI_VERSION; md.md_flags |= G_ELI_FLAG_ONETIME; detach = gctl_get_paraml(req, "detach", sizeof(*detach)); if (detach != NULL && *detach) md.md_flags |= G_ELI_FLAG_WO_DETACH; notrim = gctl_get_paraml(req, "notrim", sizeof(*notrim)); if (notrim != NULL && *notrim) md.md_flags |= G_ELI_FLAG_NODELETE; md.md_ealgo = CRYPTO_ALGORITHM_MIN - 1; name = gctl_get_asciiparam(req, "aalgo"); if (name == NULL) { gctl_error(req, "No '%s' argument.", "aalgo"); return; } if (*name != '\0') { md.md_aalgo = g_eli_str2aalgo(name); if (md.md_aalgo >= CRYPTO_ALGORITHM_MIN && md.md_aalgo <= CRYPTO_ALGORITHM_MAX) { md.md_flags |= G_ELI_FLAG_AUTH; } else { /* * For backward compatibility, check if the -a option * was used to provide encryption algorithm. */ md.md_ealgo = g_eli_str2ealgo(name); if (md.md_ealgo < CRYPTO_ALGORITHM_MIN || md.md_ealgo > CRYPTO_ALGORITHM_MAX) { gctl_error(req, "Invalid authentication algorithm."); return; } else { gctl_error(req, "warning: The -e option, not " "the -a option is now used to specify " "encryption algorithm to use."); } } } if (md.md_ealgo < CRYPTO_ALGORITHM_MIN || md.md_ealgo > CRYPTO_ALGORITHM_MAX) { name = gctl_get_asciiparam(req, "ealgo"); if (name == NULL) { gctl_error(req, "No '%s' argument.", "ealgo"); return; } md.md_ealgo = g_eli_str2ealgo(name); if (md.md_ealgo < CRYPTO_ALGORITHM_MIN || md.md_ealgo > CRYPTO_ALGORITHM_MAX) { gctl_error(req, "Invalid encryption algorithm."); return; } } keylen = gctl_get_paraml(req, "keylen", sizeof(*keylen)); if (keylen == NULL) { gctl_error(req, "No '%s' argument.", "keylen"); return; } md.md_keylen = g_eli_keylen(md.md_ealgo, *keylen); if (md.md_keylen == 0) { gctl_error(req, "Invalid '%s' argument.", "keylen"); return; } /* Not important here. */ md.md_provsize = 0; /* Not important here. */ bzero(md.md_salt, sizeof(md.md_salt)); md.md_keys = 0x01; arc4rand(mkey, sizeof(mkey), 0); /* Not important here. 
*/ bzero(md.md_hash, sizeof(md.md_hash)); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } sectorsize = gctl_get_paraml(req, "sectorsize", sizeof(*sectorsize)); if (sectorsize == NULL) { gctl_error(req, "No '%s' argument.", "sectorsize"); return; } if (*sectorsize == 0) md.md_sectorsize = pp->sectorsize; else { if (*sectorsize < 0 || (*sectorsize % pp->sectorsize) != 0) { gctl_error(req, "Invalid sector size."); return; } if (*sectorsize > PAGE_SIZE) { gctl_error(req, "warning: Using sectorsize bigger than " "the page size!"); } md.md_sectorsize = *sectorsize; } g_eli_create(req, mp, pp, &md, mkey, -1); explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(&md, sizeof(md)); } static void g_eli_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; struct g_eli_metadata md; struct g_provider *pp; struct g_consumer *cp; char param[16]; const char *prov; u_char *sector; int *nargs, *boot, *noboot, *trim, *notrim, *geliboot, *nogeliboot; int *displaypass, *nodisplaypass; int zero, error, changed; u_int i; g_topology_assert(); changed = 0; zero = 0; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } boot = gctl_get_paraml(req, "boot", sizeof(*boot)); if (boot == NULL) boot = &zero; noboot = gctl_get_paraml(req, "noboot", sizeof(*noboot)); if (noboot == NULL) noboot = &zero; if (*boot && *noboot) { gctl_error(req, "Options -b and -B are mutually exclusive."); return; } if (*boot || *noboot) changed = 1; trim = gctl_get_paraml(req, "trim", sizeof(*trim)); if (trim == NULL) trim = &zero; notrim = gctl_get_paraml(req, "notrim", sizeof(*notrim)); if (notrim == NULL) notrim = &zero; if (*trim && *notrim) { gctl_error(req, "Options -t and -T are mutually exclusive."); return; } if (*trim || *notrim) changed = 1; geliboot = gctl_get_paraml(req, "geliboot", sizeof(*geliboot)); if (geliboot == NULL) geliboot = &zero; nogeliboot = gctl_get_paraml(req, "nogeliboot", sizeof(*nogeliboot)); if (nogeliboot == NULL) nogeliboot = &zero; if (*geliboot && *nogeliboot) { gctl_error(req, "Options -g and -G are mutually exclusive."); return; } if (*geliboot || *nogeliboot) changed = 1; displaypass = gctl_get_paraml(req, "displaypass", sizeof(*displaypass)); if (displaypass == NULL) displaypass = &zero; nodisplaypass = gctl_get_paraml(req, "nodisplaypass", sizeof(*nodisplaypass)); if (nodisplaypass == NULL) nodisplaypass = &zero; if (*displaypass && *nodisplaypass) { gctl_error(req, "Options -d and -D are mutually exclusive."); return; } if (*displaypass || *nodisplaypass) changed = 1; if (!changed) { gctl_error(req, "No option given."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { gctl_error(req, "No 'arg%d' argument.", i); return; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { /* * We ignore not attached providers, userland part will * take care of them. 
*/ G_ELI_DEBUG(1, "Skipping configuration of not attached " "provider %s.", prov); continue; } if (sc->sc_flags & G_ELI_FLAG_RO) { gctl_error(req, "Cannot change configuration of " "read-only provider %s.", prov); continue; } if (*boot && (sc->sc_flags & G_ELI_FLAG_BOOT)) { G_ELI_DEBUG(1, "BOOT flag already configured for %s.", prov); continue; } else if (*noboot && !(sc->sc_flags & G_ELI_FLAG_BOOT)) { G_ELI_DEBUG(1, "BOOT flag not configured for %s.", prov); continue; } if (*notrim && (sc->sc_flags & G_ELI_FLAG_NODELETE)) { G_ELI_DEBUG(1, "TRIM disable flag already configured for %s.", prov); continue; } else if (*trim && !(sc->sc_flags & G_ELI_FLAG_NODELETE)) { G_ELI_DEBUG(1, "TRIM disable flag not configured for %s.", prov); continue; } if (*geliboot && (sc->sc_flags & G_ELI_FLAG_GELIBOOT)) { G_ELI_DEBUG(1, "GELIBOOT flag already configured for %s.", prov); continue; } else if (*nogeliboot && !(sc->sc_flags & G_ELI_FLAG_GELIBOOT)) { G_ELI_DEBUG(1, "GELIBOOT flag not configured for %s.", prov); continue; } if (*displaypass && (sc->sc_flags & G_ELI_FLAG_GELIDISPLAYPASS)) { G_ELI_DEBUG(1, "GELIDISPLAYPASS flag already configured for %s.", prov); continue; } else if (*nodisplaypass && !(sc->sc_flags & G_ELI_FLAG_GELIDISPLAYPASS)) { G_ELI_DEBUG(1, "GELIDISPLAYPASS flag not configured for %s.", prov); continue; } if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) { /* * ONETIME providers don't write metadata to * disk, so don't try reading it. This means * we're bit-flipping uninitialized memory in md * below, but that's OK; we don't do anything * with it later. */ cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", prov, error); continue; } } if (*boot) { md.md_flags |= G_ELI_FLAG_BOOT; sc->sc_flags |= G_ELI_FLAG_BOOT; } else if (*noboot) { md.md_flags &= ~G_ELI_FLAG_BOOT; sc->sc_flags &= ~G_ELI_FLAG_BOOT; } if (*notrim) { md.md_flags |= G_ELI_FLAG_NODELETE; sc->sc_flags |= G_ELI_FLAG_NODELETE; } else if (*trim) { md.md_flags &= ~G_ELI_FLAG_NODELETE; sc->sc_flags &= ~G_ELI_FLAG_NODELETE; } if (*geliboot) { md.md_flags |= G_ELI_FLAG_GELIBOOT; sc->sc_flags |= G_ELI_FLAG_GELIBOOT; } else if (*nogeliboot) { md.md_flags &= ~G_ELI_FLAG_GELIBOOT; sc->sc_flags &= ~G_ELI_FLAG_GELIBOOT; } if (*displaypass) { md.md_flags |= G_ELI_FLAG_GELIDISPLAYPASS; sc->sc_flags |= G_ELI_FLAG_GELIDISPLAYPASS; } else if (*nodisplaypass) { md.md_flags &= ~G_ELI_FLAG_GELIDISPLAYPASS; sc->sc_flags &= ~G_ELI_FLAG_GELIDISPLAYPASS; } if (sc->sc_flags & G_ELI_FLAG_ONETIME) { /* There's no metadata on disk so we are done here. 
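 * The updated flags live only in sc_flags and thus last only until the
 * provider is detached.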
*/ continue; } sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO); eli_metadata_encode(&md, sector); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); if (error != 0) { gctl_error(req, "Cannot store metadata on %s (error=%d).", prov, error); } explicit_bzero(&md, sizeof(md)); explicit_bzero(sector, pp->sectorsize); free(sector, M_ELI); } } static void g_eli_ctl_setkey(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; struct g_eli_metadata md; struct g_provider *pp; struct g_consumer *cp; const char *name; u_char *key, *mkeydst, *sector; intmax_t *valp; int keysize, nkey, error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } key = gctl_get_param(req, "key", &keysize); if (key == NULL || keysize != G_ELI_USERKEYLEN) { gctl_error(req, "No '%s' argument.", "key"); return; } sc = g_eli_find_device(mp, name); if (sc == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } if (sc->sc_flags & G_ELI_FLAG_RO) { gctl_error(req, "Cannot change keys for read-only provider."); return; } cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", name, error); return; } valp = gctl_get_paraml(req, "keyno", sizeof(*valp)); if (valp == NULL) { gctl_error(req, "No '%s' argument.", "keyno"); return; } if (*valp != -1) nkey = *valp; else nkey = sc->sc_nkey; if (nkey < 0 || nkey >= G_ELI_MAXMKEYS) { gctl_error(req, "Invalid '%s' argument.", "keyno"); return; } valp = gctl_get_paraml(req, "iterations", sizeof(*valp)); if (valp == NULL) { gctl_error(req, "No '%s' argument.", "iterations"); return; } /* Check if iterations number should and can be changed. */ if (*valp != -1 && md.md_iterations == -1) { md.md_iterations = *valp; } else if (*valp != -1 && *valp != md.md_iterations) { if (bitcount32(md.md_keys) != 1) { gctl_error(req, "To be able to use '-i' option, only " "one key can be defined."); return; } if (md.md_keys != (1 << nkey)) { gctl_error(req, "Only already defined key can be " "changed when '-i' option is used."); return; } md.md_iterations = *valp; } mkeydst = md.md_mkeys + nkey * G_ELI_MKEYLEN; md.md_keys |= (1 << nkey); bcopy(sc->sc_mkey, mkeydst, sizeof(sc->sc_mkey)); /* Encrypt Master Key with the new key. */ error = g_eli_mkey_encrypt(md.md_ealgo, key, md.md_keylen, mkeydst); explicit_bzero(key, keysize); if (error != 0) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Cannot encrypt Master Key (error=%d).", error); return; } sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO); /* Store metadata with fresh key. 
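 * The geli metadata always occupies the provider's last sector, which is
 * why the write below goes to offset mediasize - sectorsize.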
*/ eli_metadata_encode(&md, sector); explicit_bzero(&md, sizeof(md)); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); explicit_bzero(sector, pp->sectorsize); free(sector, M_ELI); if (error != 0) { gctl_error(req, "Cannot store metadata on %s (error=%d).", pp->name, error); return; } G_ELI_DEBUG(1, "Key %u changed on %s.", nkey, pp->name); } static void g_eli_ctl_delkey(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; struct g_eli_metadata md; struct g_provider *pp; struct g_consumer *cp; const char *name; u_char *mkeydst, *sector; intmax_t *valp; size_t keysize; int error, nkey, *all, *force; u_int i; g_topology_assert(); nkey = 0; /* fixes causeless gcc warning */ name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_eli_find_device(mp, name); if (sc == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } if (sc->sc_flags & G_ELI_FLAG_RO) { gctl_error(req, "Cannot delete keys for read-only provider."); return; } cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if (error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", name, error); return; } all = gctl_get_paraml(req, "all", sizeof(*all)); if (all == NULL) { gctl_error(req, "No '%s' argument.", "all"); return; } if (*all) { mkeydst = md.md_mkeys; keysize = sizeof(md.md_mkeys); } else { force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } valp = gctl_get_paraml(req, "keyno", sizeof(*valp)); if (valp == NULL) { gctl_error(req, "No '%s' argument.", "keyno"); return; } if (*valp != -1) nkey = *valp; else nkey = sc->sc_nkey; if (nkey < 0 || nkey >= G_ELI_MAXMKEYS) { gctl_error(req, "Invalid '%s' argument.", "keyno"); return; } if (!(md.md_keys & (1 << nkey)) && !*force) { gctl_error(req, "Master Key %u is not set.", nkey); return; } md.md_keys &= ~(1 << nkey); if (md.md_keys == 0 && !*force) { gctl_error(req, "This is the last Master Key. Use '-f' " "flag if you really want to remove it."); return; } mkeydst = md.md_mkeys + nkey * G_ELI_MKEYLEN; keysize = G_ELI_MKEYLEN; } sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO); for (i = 0; i <= g_eli_overwrites; i++) { if (i == g_eli_overwrites) explicit_bzero(mkeydst, keysize); else arc4rand(mkeydst, keysize, 0); /* Store metadata with destroyed key. */ eli_metadata_encode(&md, sector); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); if (error != 0) { G_ELI_DEBUG(0, "Cannot store metadata on %s " "(error=%d).", pp->name, error); } /* * Flush write cache so we don't overwrite data N times in cache * and only once on disk. 
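 * The loop makes g_eli_overwrites passes of random data plus one final
 * zeroing pass, and the flush after each pass guarantees that every
 * pattern actually reaches the disk instead of being collapsed into a
 * single write in the cache.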
*/ (void)g_io_flush(cp); } explicit_bzero(&md, sizeof(md)); explicit_bzero(sector, pp->sectorsize); free(sector, M_ELI); if (*all) G_ELI_DEBUG(1, "All keys removed from %s.", pp->name); else G_ELI_DEBUG(1, "Key %d removed from %s.", nkey, pp->name); } static void g_eli_suspend_one(struct g_eli_softc *sc, struct gctl_req *req) { struct g_eli_worker *wr; g_topology_assert(); KASSERT(sc != NULL, ("NULL sc")); if (sc->sc_flags & G_ELI_FLAG_ONETIME) { gctl_error(req, "Device %s is using one-time key, suspend not supported.", sc->sc_name); return; } mtx_lock(&sc->sc_queue_mtx); if (sc->sc_flags & G_ELI_FLAG_SUSPEND) { mtx_unlock(&sc->sc_queue_mtx); gctl_error(req, "Device %s already suspended.", sc->sc_name); return; } sc->sc_flags |= G_ELI_FLAG_SUSPEND; wakeup(sc); for (;;) { LIST_FOREACH(wr, &sc->sc_workers, w_next) { if (wr->w_active) break; } if (wr == NULL) break; /* Not all threads suspended. */ msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO, "geli:suspend", 0); } /* * Clear sensitive data on suspend, they will be recovered on resume. */ explicit_bzero(sc->sc_mkey, sizeof(sc->sc_mkey)); g_eli_key_destroy(sc); explicit_bzero(sc->sc_akey, sizeof(sc->sc_akey)); explicit_bzero(&sc->sc_akeyctx, sizeof(sc->sc_akeyctx)); explicit_bzero(sc->sc_ivkey, sizeof(sc->sc_ivkey)); explicit_bzero(&sc->sc_ivctx, sizeof(sc->sc_ivctx)); mtx_unlock(&sc->sc_queue_mtx); G_ELI_DEBUG(0, "Device %s has been suspended.", sc->sc_name); } static void g_eli_ctl_suspend(struct gctl_req *req, struct g_class *mp) { struct g_eli_softc *sc; int *all, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } all = gctl_get_paraml(req, "all", sizeof(*all)); if (all == NULL) { gctl_error(req, "No '%s' argument.", "all"); return; } if (!*all && *nargs == 0) { gctl_error(req, "Too few arguments."); return; } if (*all) { struct g_geom *gp, *gp2; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { sc = gp->softc; if (sc->sc_flags & G_ELI_FLAG_ONETIME) { G_ELI_DEBUG(0, "Device %s is using one-time key, suspend not supported, skipping.", sc->sc_name); continue; } g_eli_suspend_one(sc, req); } } else { const char *prov; char param[16]; int i; for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { G_ELI_DEBUG(0, "No 'arg%d' argument.", i); continue; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { G_ELI_DEBUG(0, "No such provider: %s.", prov); continue; } g_eli_suspend_one(sc, req); } } } static void g_eli_ctl_resume(struct gctl_req *req, struct g_class *mp) { struct g_eli_metadata md; struct g_eli_softc *sc; struct g_provider *pp; struct g_consumer *cp; const char *name; u_char *key, mkey[G_ELI_DATAIVKEYLEN]; int *nargs, keysize, error; u_int nkey; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } key = gctl_get_param(req, "key", &keysize); if (key == NULL || keysize != G_ELI_USERKEYLEN) { gctl_error(req, "No '%s' argument.", "key"); return; } sc = g_eli_find_device(mp, name); if (sc == NULL) { gctl_error(req, "Provider %s is invalid.", name); return; } cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; error = g_eli_read_metadata(mp, pp, &md); if 
(error != 0) { gctl_error(req, "Cannot read metadata from %s (error=%d).", name, error); return; } if (md.md_keys == 0x00) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "No valid keys on %s.", pp->name); return; } error = g_eli_mkey_decrypt(&md, key, mkey, &nkey); explicit_bzero(key, keysize); if (error == -1) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Wrong key for %s.", pp->name); return; } else if (error > 0) { explicit_bzero(&md, sizeof(md)); gctl_error(req, "Cannot decrypt Master Key for %s (error=%d).", pp->name, error); return; } G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name); mtx_lock(&sc->sc_queue_mtx); if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND)) gctl_error(req, "Device %s is not suspended.", name); else { /* Restore sc_mkey, sc_ekeys, sc_akey and sc_ivkey. */ g_eli_mkey_propagate(sc, mkey); sc->sc_flags &= ~G_ELI_FLAG_SUSPEND; G_ELI_DEBUG(1, "Resumed %s.", pp->name); wakeup(sc); } mtx_unlock(&sc->sc_queue_mtx); explicit_bzero(mkey, sizeof(mkey)); explicit_bzero(&md, sizeof(md)); } static int g_eli_kill_one(struct g_eli_softc *sc) { struct g_provider *pp; struct g_consumer *cp; int error = 0; g_topology_assert(); if (sc == NULL) return (ENOENT); pp = LIST_FIRST(&sc->sc_geom->provider); g_error_provider(pp, ENXIO); cp = LIST_FIRST(&sc->sc_geom->consumer); pp = cp->provider; if (sc->sc_flags & G_ELI_FLAG_RO) { G_ELI_DEBUG(0, "WARNING: Metadata won't be erased on read-only " "provider: %s.", pp->name); } else { u_char *sector; u_int i; int err; sector = malloc(pp->sectorsize, M_ELI, M_WAITOK); for (i = 0; i <= g_eli_overwrites; i++) { if (i == g_eli_overwrites) bzero(sector, pp->sectorsize); else arc4rand(sector, pp->sectorsize, 0); err = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); if (err != 0) { G_ELI_DEBUG(0, "Cannot erase metadata on %s " "(error=%d).", pp->name, err); if (error == 0) error = err; } /* * Flush write cache so we don't overwrite data N times * in cache and only once on disk. 
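 * As with key deletion, every pass but the last writes random data and
 * the last pass writes zeros; note that the device is destroyed below
 * even if some of the erase writes failed.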
*/ (void)g_io_flush(cp); } free(sector, M_ELI); } if (error == 0) G_ELI_DEBUG(0, "%s has been killed.", pp->name); g_eli_destroy(sc, TRUE); return (error); } static void g_eli_ctl_kill(struct gctl_req *req, struct g_class *mp) { int *all, *nargs; int error; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } all = gctl_get_paraml(req, "all", sizeof(*all)); if (all == NULL) { gctl_error(req, "No '%s' argument.", "all"); return; } if (!*all && *nargs == 0) { gctl_error(req, "Too few arguments."); return; } if (*all) { struct g_geom *gp, *gp2; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { error = g_eli_kill_one(gp->softc); if (error != 0) gctl_error(req, "Not fully done."); } } else { struct g_eli_softc *sc; const char *prov; char param[16]; int i; for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); prov = gctl_get_asciiparam(req, param); if (prov == NULL) { G_ELI_DEBUG(0, "No 'arg%d' argument.", i); continue; } sc = g_eli_find_device(mp, prov); if (sc == NULL) { G_ELI_DEBUG(0, "No such provider: %s.", prov); continue; } error = g_eli_kill_one(sc); if (error != 0) gctl_error(req, "Not fully done."); } } } void g_eli_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } while (*version != G_ELI_VERSION) { if (G_ELI_VERSION == G_ELI_VERSION_06 && *version == G_ELI_VERSION_05) { /* Compatible. */ break; } if (G_ELI_VERSION == G_ELI_VERSION_07 && (*version == G_ELI_VERSION_05 || *version == G_ELI_VERSION_06)) { /* Compatible. */ break; } gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "attach") == 0) g_eli_ctl_attach(req, mp); else if (strcmp(verb, "detach") == 0 || strcmp(verb, "stop") == 0) g_eli_ctl_detach(req, mp); else if (strcmp(verb, "onetime") == 0) g_eli_ctl_onetime(req, mp); else if (strcmp(verb, "configure") == 0) g_eli_ctl_configure(req, mp); else if (strcmp(verb, "setkey") == 0) g_eli_ctl_setkey(req, mp); else if (strcmp(verb, "delkey") == 0) g_eli_ctl_delkey(req, mp); else if (strcmp(verb, "suspend") == 0) g_eli_ctl_suspend(req, mp); else if (strcmp(verb, "resume") == 0) g_eli_ctl_resume(req, mp); else if (strcmp(verb, "kill") == 0) g_eli_ctl_kill(req, mp); else gctl_error(req, "Unknown verb."); } Index: head/sys/geom/eli/g_eli_integrity.c =================================================================== --- head/sys/geom/eli/g_eli_integrity.c (revision 326269) +++ head/sys/geom/eli/g_eli_integrity.c (revision 326270) @@ -1,538 +1,540 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2011 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * The data layout description when integrity verification is configured.
 *
 * One of the most important assumptions here is that the authenticated data
 * and its HMAC have to be stored in the same place (namely in the same
 * sector) to make it work reliably.
 * The problem is that file systems work only with sectors that are a
 * multiple of 512 bytes and a power of two in size.
 * My idea to implement it is as follows.
 * Let's store the HMAC in the sector. This is a must. This leaves us 480
 * bytes for data. We can't use that directly (i.e. we can't create a
 * provider with a 480-byte sector size). We need another sector from which
 * we take only 32 bytes of data and store the HMAC of this data as well.
 * This takes two sectors from the original provider at the input and leaves
 * us one sector of authenticated data at the output. Not very efficient,
 * but you get the idea.
 * Now, let's assume we want to create a provider with a 4096-byte sector.
 * To output 4096 bytes of authenticated data we need 8x480 plus 1x256, so
 * we need nine 512-byte sectors at the input to get one 4096-byte sector at
 * the output. That's better. With a 4096-byte sector we can use 89% of the
 * size of the original provider. I find that an acceptable cost.
 * The reliability comes from the fact that every HMAC stored inside the
 * sector is calculated only for the data in the same sector, so it's
 * impossible to write new data and leave the old HMAC, or vice versa.
 *
 * And here is the picture:
 *
 * da0: +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+-----+
 *      |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |480b| |32b |256b |
 *      |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data| |HMAC|Data |
 *      +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+----+ +----+-----+
 *      |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |512 bytes| |288 bytes |
 *      +---------+ +---------+ +---------+ +---------+ +---------+ +---------+ +---------+ +---------+ |224 unused|
 *                                                                                                      +----------+
 * da0.eli: +----+----+----+----+----+----+----+----+----+
 *          |480b|480b|480b|480b|480b|480b|480b|480b|256b|
 *          +----+----+----+----+----+----+----+----+----+
 *          |                 4096 bytes                 |
 *          +--------------------------------------------+
 *
 * PS. You can use any sector size with geli(8). My example uses 4kB,
 * because it's the most efficient. For 8kB sectors you need 2 extra
 * sectors, so the cost is the same as for 4kB sectors.
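 *
 * A worked check of the arithmetic above (an illustrative sketch, not
 * part of the original source; the helper name below is hypothetical):
 */

/*
 * How many encrypted sectors are needed to back one decrypted sector?
 * E.g. decr_secsize = 4096, encr_secsize = 512 and alen = 32 give
 * 480 data bytes per sector and (4096 + 479) / 480 = 9 sectors.
 */
static u_int
g_eli_auth_sectors_sketch(u_int decr_secsize, u_int encr_secsize, u_int alen)
{
	u_int data_secsize;

	data_secsize = encr_secsize - alen;	/* 480 in the example. */
	/* Round up, so the last, partially filled sector is counted too. */
	return ((decr_secsize + data_secsize - 1) / data_secsize);
}

/* End of illustrative sketch; the original file resumes here.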
 */

/*
 * Code paths:
 * BIO_READ:
 *	g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> g_eli_auth_run -> g_eli_auth_read_done -> g_io_deliver
 * BIO_WRITE:
 *	g_eli_start -> g_eli_auth_run -> g_eli_auth_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
 */

MALLOC_DECLARE(M_ELI);

/*
 * Here we generate the key for the HMAC. Every sector has its own HMAC key,
 * so it is not possible to copy sectors.
 * We cannot depend on the fact that every sector has its own IV, because a
 * different IV doesn't change the HMAC when we use the
 * encrypt-then-authenticate method.
 */
static void
g_eli_auth_keygen(struct g_eli_softc *sc, off_t offset, u_char *key)
{
	SHA256_CTX ctx;

	/* Copy precalculated SHA256 context. */
	bcopy(&sc->sc_akeyctx, &ctx, sizeof(ctx));
	SHA256_Update(&ctx, (uint8_t *)&offset, sizeof(offset));
	SHA256_Final(key, &ctx);
}

/*
 * The function is called after we read and decrypt data.
 *
 * g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> g_eli_auth_run -> G_ELI_AUTH_READ_DONE -> g_io_deliver
 */
static int
g_eli_auth_read_done(struct cryptop *crp)
{
	struct g_eli_softc *sc;
	struct bio *bp;

	if (crp->crp_etype == EAGAIN) {
		if (g_eli_crypto_rerun(crp) == 0)
			return (0);
	}
	bp = (struct bio *)crp->crp_opaque;
	bp->bio_inbed++;
	if (crp->crp_etype == 0) {
		bp->bio_completed += crp->crp_olen;
		G_ELI_DEBUG(3, "Crypto READ request done (%d/%d) (add=%jd completed=%jd).",
		    bp->bio_inbed, bp->bio_children, (intmax_t)crp->crp_olen,
		    (intmax_t)bp->bio_completed);
	} else {
		G_ELI_DEBUG(1, "Crypto READ request failed (%d/%d) error=%d.",
		    bp->bio_inbed, bp->bio_children, crp->crp_etype);
		if (bp->bio_error == 0)
			bp->bio_error = crp->crp_etype;
	}
	sc = bp->bio_to->geom->softc;
	g_eli_key_drop(sc, crp->crp_desc->crd_next->crd_key);
	/*
	 * Do we have all sectors already?
	 */
	if (bp->bio_inbed < bp->bio_children)
		return (0);
	if (bp->bio_error == 0) {
		u_int i, lsec, nsec, data_secsize, decr_secsize, encr_secsize;
		u_char *srcdata, *dstdata, *auth;
		off_t coroff, corsize;

		/*
		 * Verify data integrity based on calculated and read HMACs.
		 */
		/* Sectorsize of decrypted provider, e.g. 4096. */
		decr_secsize = bp->bio_to->sectorsize;
		/* The real sectorsize of encrypted provider, e.g. 512. */
		encr_secsize = LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize;
		/* Number of data bytes in one encrypted sector, e.g. 480. */
		data_secsize = sc->sc_data_per_sector;
		/* Number of sectors from decrypted provider, e.g. 2. */
		nsec = bp->bio_length / decr_secsize;
		/* Number of sectors from encrypted provider, e.g. 18. */
		nsec = (nsec * sc->sc_bytes_per_sector) / encr_secsize;
		/* Last sector number in every big sector, e.g. 9. */
		lsec = sc->sc_bytes_per_sector / encr_secsize;

		srcdata = bp->bio_driver2;
		dstdata = bp->bio_data;
		auth = srcdata + encr_secsize * nsec;
		coroff = -1;
		corsize = 0;

		for (i = 1; i <= nsec; i++) {
			data_secsize = sc->sc_data_per_sector;
			if ((i % lsec) == 0)
				data_secsize = decr_secsize % data_secsize;
			if (bcmp(srcdata, auth, sc->sc_alen) != 0) {
				/*
				 * Corruption detected: remember the offset if
				 * this is the first corrupted sector and
				 * increase the size.
				 */
				if (bp->bio_error == 0)
					bp->bio_error = -1;
				if (coroff == -1) {
					coroff = bp->bio_offset +
					    (dstdata - (u_char *)bp->bio_data);
				}
				corsize += data_secsize;
			} else {
				/*
				 * No corruption, good.
				 * Report previous corruption if there was one.
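				 * Runs of corrupted sectors are coalesced:
				 * coroff and corsize track the current run,
				 * so one message covers all adjacent bad
				 * sectors; the provisional bio_error of -1
				 * is turned into EINVAL once the whole
				 * request completes.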
*/ if (coroff != -1) { G_ELI_DEBUG(0, "%s: Failed to authenticate %jd " "bytes of data at offset %jd.", sc->sc_name, (intmax_t)corsize, (intmax_t)coroff); coroff = -1; corsize = 0; } bcopy(srcdata + sc->sc_alen, dstdata, data_secsize); } srcdata += encr_secsize; dstdata += data_secsize; auth += sc->sc_alen; } /* Report previous corruption if there was one. */ if (coroff != -1) { G_ELI_DEBUG(0, "%s: Failed to authenticate %jd " "bytes of data at offset %jd.", sc->sc_name, (intmax_t)corsize, (intmax_t)coroff); } } free(bp->bio_driver2, M_ELI); bp->bio_driver2 = NULL; if (bp->bio_error != 0) { if (bp->bio_error == -1) bp->bio_error = EINVAL; else { G_ELI_LOGREQ(0, bp, "Crypto READ request failed (error=%d).", bp->bio_error); } bp->bio_completed = 0; } /* * Read is finished, send it up. */ g_io_deliver(bp, bp->bio_error); atomic_subtract_int(&sc->sc_inflight, 1); return (0); } /* * The function is called after data encryption. * * g_eli_start -> g_eli_auth_run -> G_ELI_AUTH_WRITE_DONE -> g_io_request -> g_eli_write_done -> g_io_deliver */ static int g_eli_auth_write_done(struct cryptop *crp) { struct g_eli_softc *sc; struct g_consumer *cp; struct bio *bp, *cbp, *cbp2; u_int nsec; if (crp->crp_etype == EAGAIN) { if (g_eli_crypto_rerun(crp) == 0) return (0); } bp = (struct bio *)crp->crp_opaque; bp->bio_inbed++; if (crp->crp_etype == 0) { G_ELI_DEBUG(3, "Crypto WRITE request done (%d/%d).", bp->bio_inbed, bp->bio_children); } else { G_ELI_DEBUG(1, "Crypto WRITE request failed (%d/%d) error=%d.", bp->bio_inbed, bp->bio_children, crp->crp_etype); if (bp->bio_error == 0) bp->bio_error = crp->crp_etype; } sc = bp->bio_to->geom->softc; g_eli_key_drop(sc, crp->crp_desc->crd_key); /* * All sectors are already encrypted? */ if (bp->bio_inbed < bp->bio_children) return (0); if (bp->bio_error != 0) { G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).", bp->bio_error); free(bp->bio_driver2, M_ELI); bp->bio_driver2 = NULL; cbp = bp->bio_driver1; bp->bio_driver1 = NULL; g_destroy_bio(cbp); g_io_deliver(bp, bp->bio_error); atomic_subtract_int(&sc->sc_inflight, 1); return (0); } cp = LIST_FIRST(&sc->sc_geom->consumer); cbp = bp->bio_driver1; bp->bio_driver1 = NULL; cbp->bio_to = cp->provider; cbp->bio_done = g_eli_write_done; /* Number of sectors from decrypted provider, eg. 1. */ nsec = bp->bio_length / bp->bio_to->sectorsize; /* Number of sectors from encrypted provider, eg. 9. */ nsec = (nsec * sc->sc_bytes_per_sector) / cp->provider->sectorsize; cbp->bio_length = cp->provider->sectorsize * nsec; cbp->bio_offset = (bp->bio_offset / bp->bio_to->sectorsize) * sc->sc_bytes_per_sector; cbp->bio_data = bp->bio_driver2; /* * We write more than what is requested, so we have to be ready to write * more than MAXPHYS. */ cbp2 = NULL; if (cbp->bio_length > MAXPHYS) { cbp2 = g_duplicate_bio(bp); cbp2->bio_length = cbp->bio_length - MAXPHYS; cbp2->bio_data = cbp->bio_data + MAXPHYS; cbp2->bio_offset = cbp->bio_offset + MAXPHYS; cbp2->bio_to = cp->provider; cbp2->bio_done = g_eli_write_done; cbp->bio_length = MAXPHYS; } /* * Send encrypted data to the provider. */ G_ELI_LOGREQ(2, cbp, "Sending request."); bp->bio_inbed = 0; bp->bio_children = (cbp2 != NULL ? 
2 : 1); g_io_request(cbp, cp); if (cbp2 != NULL) { G_ELI_LOGREQ(2, cbp2, "Sending request."); g_io_request(cbp2, cp); } return (0); } void g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp) { struct g_consumer *cp; struct bio *cbp, *cbp2; size_t size; off_t nsec; bp->bio_pflags = 0; cp = LIST_FIRST(&sc->sc_geom->consumer); cbp = bp->bio_driver1; bp->bio_driver1 = NULL; cbp->bio_to = cp->provider; cbp->bio_done = g_eli_read_done; /* Number of sectors from decrypted provider, eg. 1. */ nsec = bp->bio_length / bp->bio_to->sectorsize; /* Number of sectors from encrypted provider, eg. 9. */ nsec = (nsec * sc->sc_bytes_per_sector) / cp->provider->sectorsize; cbp->bio_length = cp->provider->sectorsize * nsec; size = cbp->bio_length; size += sc->sc_alen * nsec; size += sizeof(struct cryptop) * nsec; size += sizeof(struct cryptodesc) * nsec * 2; size += G_ELI_AUTH_SECKEYLEN * nsec; cbp->bio_offset = (bp->bio_offset / bp->bio_to->sectorsize) * sc->sc_bytes_per_sector; bp->bio_driver2 = malloc(size, M_ELI, M_WAITOK); cbp->bio_data = bp->bio_driver2; /* * We read more than what is requested, so we have to be ready to read * more than MAXPHYS. */ cbp2 = NULL; if (cbp->bio_length > MAXPHYS) { cbp2 = g_duplicate_bio(bp); cbp2->bio_length = cbp->bio_length - MAXPHYS; cbp2->bio_data = cbp->bio_data + MAXPHYS; cbp2->bio_offset = cbp->bio_offset + MAXPHYS; cbp2->bio_to = cp->provider; cbp2->bio_done = g_eli_read_done; cbp->bio_length = MAXPHYS; } /* * Read encrypted data from provider. */ G_ELI_LOGREQ(2, cbp, "Sending request."); g_io_request(cbp, cp); if (cbp2 != NULL) { G_ELI_LOGREQ(2, cbp2, "Sending request."); g_io_request(cbp2, cp); } } /* * This is the main function responsible for cryptography (ie. communication * with crypto(9) subsystem). * * BIO_READ: * g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> G_ELI_AUTH_RUN -> g_eli_auth_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> G_ELI_AUTH_RUN -> g_eli_auth_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ void g_eli_auth_run(struct g_eli_worker *wr, struct bio *bp) { struct g_eli_softc *sc; struct cryptop *crp; struct cryptodesc *crde, *crda; u_int i, lsec, nsec, data_secsize, decr_secsize, encr_secsize; off_t dstoff; u_char *p, *data, *auth, *authkey, *plaindata; int error; G_ELI_LOGREQ(3, bp, "%s", __func__); bp->bio_pflags = wr->w_number; sc = wr->w_softc; /* Sectorsize of decrypted provider eg. 4096. */ decr_secsize = bp->bio_to->sectorsize; /* The real sectorsize of encrypted provider, eg. 512. */ encr_secsize = LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize; /* Number of data bytes in one encrypted sector, eg. 480. */ data_secsize = sc->sc_data_per_sector; /* Number of sectors from decrypted provider, eg. 2. */ nsec = bp->bio_length / decr_secsize; /* Number of sectors from encrypted provider, eg. 18. */ nsec = (nsec * sc->sc_bytes_per_sector) / encr_secsize; /* Last sector number in every big sector, eg. 9. */ lsec = sc->sc_bytes_per_sector / encr_secsize; /* Destination offset, used for IV generation. */ dstoff = (bp->bio_offset / bp->bio_to->sectorsize) * sc->sc_bytes_per_sector; auth = NULL; /* Silence compiler warning. 
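 * The auth pointer is used only on the read path, where it saves each
 * sector's on-disk HMAC before the buffer is handed to crypto(9).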
*/ plaindata = bp->bio_data; if (bp->bio_cmd == BIO_READ) { data = bp->bio_driver2; auth = data + encr_secsize * nsec; p = auth + sc->sc_alen * nsec; } else { size_t size; size = encr_secsize * nsec; size += sizeof(*crp) * nsec; size += sizeof(*crde) * nsec; size += sizeof(*crda) * nsec; size += G_ELI_AUTH_SECKEYLEN * nsec; size += sizeof(uintptr_t); /* Space for alignment. */ data = malloc(size, M_ELI, M_WAITOK); bp->bio_driver2 = data; p = data + encr_secsize * nsec; } bp->bio_inbed = 0; bp->bio_children = nsec; #if defined(__mips_n64) || defined(__mips_o64) p = (char *)roundup((uintptr_t)p, sizeof(uintptr_t)); #endif for (i = 1; i <= nsec; i++, dstoff += encr_secsize) { crp = (struct cryptop *)p; p += sizeof(*crp); crde = (struct cryptodesc *)p; p += sizeof(*crde); crda = (struct cryptodesc *)p; p += sizeof(*crda); authkey = (u_char *)p; p += G_ELI_AUTH_SECKEYLEN; data_secsize = sc->sc_data_per_sector; if ((i % lsec) == 0) { data_secsize = decr_secsize % data_secsize; /* * Last encrypted sector of each decrypted sector is * only partially filled. */ if (bp->bio_cmd == BIO_WRITE) memset(data + sc->sc_alen + data_secsize, 0, encr_secsize - sc->sc_alen - data_secsize); } if (bp->bio_cmd == BIO_READ) { /* Remember read HMAC. */ bcopy(data, auth, sc->sc_alen); auth += sc->sc_alen; /* TODO: bzero(9) can be commented out later. */ bzero(data, sc->sc_alen); } else { bcopy(plaindata, data + sc->sc_alen, data_secsize); plaindata += data_secsize; } crp->crp_sid = wr->w_sid; crp->crp_ilen = sc->sc_alen + data_secsize; crp->crp_olen = data_secsize; crp->crp_opaque = (void *)bp; crp->crp_buf = (void *)data; data += encr_secsize; crp->crp_flags = CRYPTO_F_CBIFSYNC; if (g_eli_batch) crp->crp_flags |= CRYPTO_F_BATCH; if (bp->bio_cmd == BIO_WRITE) { crp->crp_callback = g_eli_auth_write_done; crp->crp_desc = crde; crde->crd_next = crda; crda->crd_next = NULL; } else { crp->crp_callback = g_eli_auth_read_done; crp->crp_desc = crda; crda->crd_next = crde; crde->crd_next = NULL; } crde->crd_skip = sc->sc_alen; crde->crd_len = data_secsize; crde->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT; if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) == 0) crde->crd_flags |= CRD_F_KEY_EXPLICIT; if (bp->bio_cmd == BIO_WRITE) crde->crd_flags |= CRD_F_ENCRYPT; crde->crd_alg = sc->sc_ealgo; crde->crd_key = g_eli_key_hold(sc, dstoff, encr_secsize); crde->crd_klen = sc->sc_ekeylen; if (sc->sc_ealgo == CRYPTO_AES_XTS) crde->crd_klen <<= 1; g_eli_crypto_ivgen(sc, dstoff, crde->crd_iv, sizeof(crde->crd_iv)); crda->crd_skip = sc->sc_alen; crda->crd_len = data_secsize; crda->crd_inject = 0; crda->crd_flags = CRD_F_KEY_EXPLICIT; crda->crd_alg = sc->sc_aalgo; g_eli_auth_keygen(sc, dstoff, authkey); crda->crd_key = authkey; crda->crd_klen = G_ELI_AUTH_SECKEYLEN * 8; crp->crp_etype = 0; error = crypto_dispatch(crp); KASSERT(error == 0, ("crypto_dispatch() failed (error=%d)", error)); } } Index: head/sys/geom/eli/g_eli_key.c =================================================================== --- head/sys/geom/eli/g_eli_key.c (revision 326269) +++ head/sys/geom/eli/g_eli_key.c (revision 326270) @@ -1,237 +1,239 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2011 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #include #else #include #include #include #include #include #include #endif #include #ifdef _KERNEL MALLOC_DECLARE(M_ELI); #endif /* * Verify if the given 'key' is correct. * Return 1 if it is correct and 0 otherwise. */ static int g_eli_mkey_verify(const unsigned char *mkey, const unsigned char *key) { const unsigned char *odhmac; /* On-disk HMAC. */ unsigned char chmac[SHA512_MDLEN]; /* Calculated HMAC. */ unsigned char hmkey[SHA512_MDLEN]; /* Key for HMAC. */ /* * The key for HMAC calculations is: hmkey = HMAC_SHA512(Derived-Key, 0) */ g_eli_crypto_hmac(key, G_ELI_USERKEYLEN, "\x00", 1, hmkey, 0); odhmac = mkey + G_ELI_DATAIVKEYLEN; /* Calculate HMAC from Data-Key and IV-Key. */ g_eli_crypto_hmac(hmkey, sizeof(hmkey), mkey, G_ELI_DATAIVKEYLEN, chmac, 0); explicit_bzero(hmkey, sizeof(hmkey)); /* * Compare calculated HMAC with HMAC from metadata. * If two HMACs are equal, 'key' is correct. */ return (!bcmp(odhmac, chmac, SHA512_MDLEN)); } /* * Calculate HMAC from Data-Key and IV-Key. */ void g_eli_mkey_hmac(unsigned char *mkey, const unsigned char *key) { unsigned char hmkey[SHA512_MDLEN]; /* Key for HMAC. */ unsigned char *odhmac; /* On-disk HMAC. */ /* * The key for HMAC calculations is: hmkey = HMAC_SHA512(Derived-Key, 0) */ g_eli_crypto_hmac(key, G_ELI_USERKEYLEN, "\x00", 1, hmkey, 0); odhmac = mkey + G_ELI_DATAIVKEYLEN; /* Calculate HMAC from Data-Key and IV-Key. */ g_eli_crypto_hmac(hmkey, sizeof(hmkey), mkey, G_ELI_DATAIVKEYLEN, odhmac, 0); explicit_bzero(hmkey, sizeof(hmkey)); } /* * Find and decrypt Master Key encrypted with 'key'. * Return decrypted Master Key number in 'nkeyp' if not NULL. * Return 0 on success, > 0 on failure, -1 on bad key. */ int g_eli_mkey_decrypt(const struct g_eli_metadata *md, const unsigned char *key, unsigned char *mkey, unsigned *nkeyp) { unsigned char tmpmkey[G_ELI_MKEYLEN]; unsigned char enckey[SHA512_MDLEN]; /* Key for encryption. 
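 * Two independent keys are derived from the same user key: the HMAC key
 * hmkey = HMAC_SHA512(Derived-Key, 0) used for verification and the
 * encryption key enckey = HMAC_SHA512(Derived-Key, 1) used below, so
 * knowledge of one does not reveal the other.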
*/ const unsigned char *mmkey; int bit, error, nkey; if (nkeyp != NULL) *nkeyp = -1; /* * The key for encryption is: enckey = HMAC_SHA512(Derived-Key, 1) */ g_eli_crypto_hmac(key, G_ELI_USERKEYLEN, "\x01", 1, enckey, 0); mmkey = md->md_mkeys; for (nkey = 0; nkey < G_ELI_MAXMKEYS; nkey++, mmkey += G_ELI_MKEYLEN) { bit = (1 << nkey); if (!(md->md_keys & bit)) continue; bcopy(mmkey, tmpmkey, G_ELI_MKEYLEN); error = g_eli_crypto_decrypt(md->md_ealgo, tmpmkey, G_ELI_MKEYLEN, enckey, md->md_keylen); if (error != 0) { explicit_bzero(tmpmkey, sizeof(tmpmkey)); explicit_bzero(enckey, sizeof(enckey)); return (error); } if (g_eli_mkey_verify(tmpmkey, key)) { bcopy(tmpmkey, mkey, G_ELI_DATAIVKEYLEN); explicit_bzero(tmpmkey, sizeof(tmpmkey)); explicit_bzero(enckey, sizeof(enckey)); if (nkeyp != NULL) *nkeyp = nkey; return (0); } } explicit_bzero(enckey, sizeof(enckey)); explicit_bzero(tmpmkey, sizeof(tmpmkey)); return (-1); } /* * Encrypt the Master-Key and calculate HMAC to be able to verify it in the * future. */ int g_eli_mkey_encrypt(unsigned algo, const unsigned char *key, unsigned keylen, unsigned char *mkey) { unsigned char enckey[SHA512_MDLEN]; /* Key for encryption. */ int error; /* * To calculate HMAC, the whole key (G_ELI_USERKEYLEN bytes long) will * be used. */ g_eli_mkey_hmac(mkey, key); /* * The key for encryption is: enckey = HMAC_SHA512(Derived-Key, 1) */ g_eli_crypto_hmac(key, G_ELI_USERKEYLEN, "\x01", 1, enckey, 0); /* * Encrypt the Master-Key and HMAC() result with the given key (this * time only 'keylen' bits from the key are used). */ error = g_eli_crypto_encrypt(algo, mkey, G_ELI_MKEYLEN, enckey, keylen); explicit_bzero(enckey, sizeof(enckey)); return (error); } #ifdef _KERNEL /* * When doing encryption only, copy IV key and encryption key. * When doing encryption and authentication, copy IV key, generate encryption * key and generate authentication key. */ void g_eli_mkey_propagate(struct g_eli_softc *sc, const unsigned char *mkey) { /* Remember the Master Key. */ bcopy(mkey, sc->sc_mkey, sizeof(sc->sc_mkey)); bcopy(mkey, sc->sc_ivkey, sizeof(sc->sc_ivkey)); mkey += sizeof(sc->sc_ivkey); /* * The authentication key is: akey = HMAC_SHA512(Data-Key, 0x11) */ if ((sc->sc_flags & G_ELI_FLAG_AUTH) != 0) { g_eli_crypto_hmac(mkey, G_ELI_MAXKEYLEN, "\x11", 1, sc->sc_akey, 0); } else { arc4rand(sc->sc_akey, sizeof(sc->sc_akey), 0); } /* Initialize encryption keys. */ g_eli_key_init(sc); if ((sc->sc_flags & G_ELI_FLAG_AUTH) != 0) { /* * Precalculate SHA256 for HMAC key generation. * This is expensive operation and we can do it only once now or * for every access to sector, so now will be much better. */ SHA256_Init(&sc->sc_akeyctx); SHA256_Update(&sc->sc_akeyctx, sc->sc_akey, sizeof(sc->sc_akey)); } /* * Precalculate SHA256 for IV generation. * This is expensive operation and we can do it only once now or for * every access to sector, so now will be much better. */ switch (sc->sc_ealgo) { case CRYPTO_AES_XTS: break; default: SHA256_Init(&sc->sc_ivctx); SHA256_Update(&sc->sc_ivctx, sc->sc_ivkey, sizeof(sc->sc_ivkey)); break; } } #endif Index: head/sys/geom/eli/g_eli_key_cache.c =================================================================== --- head/sys/geom/eli/g_eli_key_cache.c (revision 326269) +++ head/sys/geom/eli/g_eli_key_cache.c (revision 326270) @@ -1,342 +1,344 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 Pawel Jakub Dawidek * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #ifdef _KERNEL MALLOC_DECLARE(M_ELI); SYSCTL_DECL(_kern_geom_eli); /* * The default limit (8192 keys) will allow to cache all keys for 4TB * provider with 512 bytes sectors and will take around 1MB of memory. */ static u_int g_eli_key_cache_limit = 8192; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, key_cache_limit, CTLFLAG_RDTUN, &g_eli_key_cache_limit, 0, "Maximum number of encryption keys to cache"); static uint64_t g_eli_key_cache_hits; SYSCTL_UQUAD(_kern_geom_eli, OID_AUTO, key_cache_hits, CTLFLAG_RW, &g_eli_key_cache_hits, 0, "Key cache hits"); static uint64_t g_eli_key_cache_misses; SYSCTL_UQUAD(_kern_geom_eli, OID_AUTO, key_cache_misses, CTLFLAG_RW, &g_eli_key_cache_misses, 0, "Key cache misses"); #endif /* _KERNEL */ static int g_eli_key_cmp(const struct g_eli_key *a, const struct g_eli_key *b) { if (a->gek_keyno > b->gek_keyno) return (1); else if (a->gek_keyno < b->gek_keyno) return (-1); return (0); } void g_eli_key_fill(struct g_eli_softc *sc, struct g_eli_key *key, uint64_t keyno) { const uint8_t *ekey; struct { char magic[4]; uint8_t keyno[8]; } __packed hmacdata; if ((sc->sc_flags & G_ELI_FLAG_ENC_IVKEY) != 0) ekey = sc->sc_mkey; else ekey = sc->sc_ekey; bcopy("ekey", hmacdata.magic, 4); le64enc(hmacdata.keyno, keyno); g_eli_crypto_hmac(ekey, G_ELI_MAXKEYLEN, (uint8_t *)&hmacdata, sizeof(hmacdata), key->gek_key, 0); key->gek_keyno = keyno; key->gek_count = 0; key->gek_magic = G_ELI_KEY_MAGIC; } #ifdef _KERNEL RB_PROTOTYPE(g_eli_key_tree, g_eli_key, gek_link, g_eli_key_cmp); RB_GENERATE(g_eli_key_tree, g_eli_key, gek_link, g_eli_key_cmp); static struct g_eli_key * g_eli_key_allocate(struct g_eli_softc *sc, uint64_t keyno) { struct g_eli_key *key, *ekey, keysearch; mtx_assert(&sc->sc_ekeys_lock, MA_OWNED); mtx_unlock(&sc->sc_ekeys_lock); key = malloc(sizeof(*key), M_ELI, M_WAITOK); g_eli_key_fill(sc, key, keyno); mtx_lock(&sc->sc_ekeys_lock); /* * Recheck if the key wasn't added while we weren't holding the lock. 
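 * The lock was dropped around the M_WAITOK allocation above because it
 * may sleep, so another thread may have inserted the same key in the
 * meantime; if it did, the freshly filled copy is zeroed and freed.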
*/ keysearch.gek_keyno = keyno; ekey = RB_FIND(g_eli_key_tree, &sc->sc_ekeys_tree, &keysearch); if (ekey != NULL) { explicit_bzero(key, sizeof(*key)); free(key, M_ELI); key = ekey; TAILQ_REMOVE(&sc->sc_ekeys_queue, key, gek_next); } else { RB_INSERT(g_eli_key_tree, &sc->sc_ekeys_tree, key); sc->sc_ekeys_allocated++; } TAILQ_INSERT_TAIL(&sc->sc_ekeys_queue, key, gek_next); return (key); } static struct g_eli_key * g_eli_key_find_last(struct g_eli_softc *sc) { struct g_eli_key *key; mtx_assert(&sc->sc_ekeys_lock, MA_OWNED); TAILQ_FOREACH(key, &sc->sc_ekeys_queue, gek_next) { if (key->gek_count == 0) break; } return (key); } static void g_eli_key_replace(struct g_eli_softc *sc, struct g_eli_key *key, uint64_t keyno) { mtx_assert(&sc->sc_ekeys_lock, MA_OWNED); KASSERT(key->gek_magic == G_ELI_KEY_MAGIC, ("Invalid magic.")); RB_REMOVE(g_eli_key_tree, &sc->sc_ekeys_tree, key); TAILQ_REMOVE(&sc->sc_ekeys_queue, key, gek_next); KASSERT(key->gek_count == 0, ("gek_count=%d", key->gek_count)); g_eli_key_fill(sc, key, keyno); RB_INSERT(g_eli_key_tree, &sc->sc_ekeys_tree, key); TAILQ_INSERT_TAIL(&sc->sc_ekeys_queue, key, gek_next); } static void g_eli_key_remove(struct g_eli_softc *sc, struct g_eli_key *key) { mtx_assert(&sc->sc_ekeys_lock, MA_OWNED); KASSERT(key->gek_magic == G_ELI_KEY_MAGIC, ("Invalid magic.")); KASSERT(key->gek_count == 0, ("gek_count=%d", key->gek_count)); RB_REMOVE(g_eli_key_tree, &sc->sc_ekeys_tree, key); TAILQ_REMOVE(&sc->sc_ekeys_queue, key, gek_next); sc->sc_ekeys_allocated--; explicit_bzero(key, sizeof(*key)); free(key, M_ELI); } void g_eli_key_init(struct g_eli_softc *sc) { uint8_t *mkey; mtx_lock(&sc->sc_ekeys_lock); mkey = sc->sc_mkey + sizeof(sc->sc_ivkey); if ((sc->sc_flags & G_ELI_FLAG_AUTH) == 0) bcopy(mkey, sc->sc_ekey, G_ELI_DATAKEYLEN); else { /* * The encryption key is: ekey = HMAC_SHA512(Data-Key, 0x10) */ g_eli_crypto_hmac(mkey, G_ELI_MAXKEYLEN, "\x10", 1, sc->sc_ekey, 0); } if ((sc->sc_flags & G_ELI_FLAG_SINGLE_KEY) != 0) { sc->sc_ekeys_total = 1; sc->sc_ekeys_allocated = 0; } else { off_t mediasize; size_t blocksize; if ((sc->sc_flags & G_ELI_FLAG_AUTH) != 0) { struct g_provider *pp; pp = LIST_FIRST(&sc->sc_geom->consumer)->provider; mediasize = pp->mediasize; blocksize = pp->sectorsize; } else { mediasize = sc->sc_mediasize; blocksize = sc->sc_sectorsize; } sc->sc_ekeys_total = ((mediasize - 1) >> G_ELI_KEY_SHIFT) / blocksize + 1; sc->sc_ekeys_allocated = 0; TAILQ_INIT(&sc->sc_ekeys_queue); RB_INIT(&sc->sc_ekeys_tree); if (sc->sc_ekeys_total <= g_eli_key_cache_limit) { uint64_t keyno; for (keyno = 0; keyno < sc->sc_ekeys_total; keyno++) (void)g_eli_key_allocate(sc, keyno); KASSERT(sc->sc_ekeys_total == sc->sc_ekeys_allocated, ("sc_ekeys_total=%ju != sc_ekeys_allocated=%ju", (uintmax_t)sc->sc_ekeys_total, (uintmax_t)sc->sc_ekeys_allocated)); } } mtx_unlock(&sc->sc_ekeys_lock); } void g_eli_key_destroy(struct g_eli_softc *sc) { mtx_lock(&sc->sc_ekeys_lock); if ((sc->sc_flags & G_ELI_FLAG_SINGLE_KEY) != 0) { explicit_bzero(sc->sc_ekey, sizeof(sc->sc_ekey)); } else { struct g_eli_key *key; while ((key = TAILQ_FIRST(&sc->sc_ekeys_queue)) != NULL) g_eli_key_remove(sc, key); TAILQ_INIT(&sc->sc_ekeys_queue); RB_INIT(&sc->sc_ekeys_tree); } mtx_unlock(&sc->sc_ekeys_lock); } /* * Select encryption key. If G_ELI_FLAG_SINGLE_KEY is present we only have one * key available for all the data. If the flag is not present select the key * based on data offset. 
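 *
 * A worked example (an illustrative note, not part of the original
 * source): keys rotate every 2^G_ELI_KEY_SHIFT blocks. Assuming
 * G_ELI_KEY_SHIFT is 20, which matches the 8192-keys-per-4TB figure
 * quoted for the cache limit above, a request at byte offset 2^30 on a
 * provider with 512-byte blocks uses keyno = (2^30 >> 20) / 512 = 2.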
*/ uint8_t * g_eli_key_hold(struct g_eli_softc *sc, off_t offset, size_t blocksize) { struct g_eli_key *key, keysearch; uint64_t keyno; if ((sc->sc_flags & G_ELI_FLAG_SINGLE_KEY) != 0) return (sc->sc_ekey); /* We switch key every 2^G_ELI_KEY_SHIFT blocks. */ keyno = (offset >> G_ELI_KEY_SHIFT) / blocksize; KASSERT(keyno < sc->sc_ekeys_total, ("%s: keyno=%ju >= sc_ekeys_total=%ju", __func__, (uintmax_t)keyno, (uintmax_t)sc->sc_ekeys_total)); keysearch.gek_keyno = keyno; if (sc->sc_ekeys_total == sc->sc_ekeys_allocated) { /* We have all the keys, so avoid some overhead. */ key = RB_FIND(g_eli_key_tree, &sc->sc_ekeys_tree, &keysearch); KASSERT(key != NULL, ("No key %ju found.", (uintmax_t)keyno)); KASSERT(key->gek_magic == G_ELI_KEY_MAGIC, ("Invalid key magic.")); return (key->gek_key); } mtx_lock(&sc->sc_ekeys_lock); key = RB_FIND(g_eli_key_tree, &sc->sc_ekeys_tree, &keysearch); if (key != NULL) { g_eli_key_cache_hits++; TAILQ_REMOVE(&sc->sc_ekeys_queue, key, gek_next); TAILQ_INSERT_TAIL(&sc->sc_ekeys_queue, key, gek_next); } else { /* * No key in cache, find the least recently unreferenced key * or allocate one if we haven't reached our limit yet. */ if (sc->sc_ekeys_allocated < g_eli_key_cache_limit) { key = g_eli_key_allocate(sc, keyno); } else { g_eli_key_cache_misses++; key = g_eli_key_find_last(sc); if (key != NULL) { g_eli_key_replace(sc, key, keyno); } else { /* All keys are referenced? Allocate one. */ key = g_eli_key_allocate(sc, keyno); } } } key->gek_count++; mtx_unlock(&sc->sc_ekeys_lock); KASSERT(key->gek_magic == G_ELI_KEY_MAGIC, ("Invalid key magic.")); return (key->gek_key); } void g_eli_key_drop(struct g_eli_softc *sc, uint8_t *rawkey) { struct g_eli_key *key = (struct g_eli_key *)rawkey; if ((sc->sc_flags & G_ELI_FLAG_SINGLE_KEY) != 0) return; KASSERT(key->gek_magic == G_ELI_KEY_MAGIC, ("Invalid key magic.")); if (sc->sc_ekeys_total == sc->sc_ekeys_allocated) return; mtx_lock(&sc->sc_ekeys_lock); KASSERT(key->gek_count > 0, ("key->gek_count=%d", key->gek_count)); key->gek_count--; while (sc->sc_ekeys_allocated > g_eli_key_cache_limit) { key = g_eli_key_find_last(sc); if (key == NULL) break; g_eli_key_remove(sc, key); } mtx_unlock(&sc->sc_ekeys_lock); } #endif /* _KERNEL */ Index: head/sys/geom/eli/g_eli_privacy.c =================================================================== --- head/sys/geom/eli/g_eli_privacy.c (revision 326269) +++ head/sys/geom/eli/g_eli_privacy.c (revision 326270) @@ -1,314 +1,316 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2011 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Code paths: * BIO_READ: * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ MALLOC_DECLARE(M_ELI); /* * The function is called after we read and decrypt data. * * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> G_ELI_CRYPTO_READ_DONE -> g_io_deliver */ static int g_eli_crypto_read_done(struct cryptop *crp) { struct g_eli_softc *sc; struct bio *bp; if (crp->crp_etype == EAGAIN) { if (g_eli_crypto_rerun(crp) == 0) return (0); } bp = (struct bio *)crp->crp_opaque; bp->bio_inbed++; if (crp->crp_etype == 0) { G_ELI_DEBUG(3, "Crypto READ request done (%d/%d).", bp->bio_inbed, bp->bio_children); bp->bio_completed += crp->crp_olen; } else { G_ELI_DEBUG(1, "Crypto READ request failed (%d/%d) error=%d.", bp->bio_inbed, bp->bio_children, crp->crp_etype); if (bp->bio_error == 0) bp->bio_error = crp->crp_etype; } sc = bp->bio_to->geom->softc; g_eli_key_drop(sc, crp->crp_desc->crd_key); /* * Do we have all sectors already? */ if (bp->bio_inbed < bp->bio_children) return (0); free(bp->bio_driver2, M_ELI); bp->bio_driver2 = NULL; if (bp->bio_error != 0) { G_ELI_LOGREQ(0, bp, "Crypto READ request failed (error=%d).", bp->bio_error); bp->bio_completed = 0; } /* * Read is finished, send it up. */ g_io_deliver(bp, bp->bio_error); atomic_subtract_int(&sc->sc_inflight, 1); return (0); } /* * The function is called after data encryption. * * g_eli_start -> g_eli_crypto_run -> G_ELI_CRYPTO_WRITE_DONE -> g_io_request -> g_eli_write_done -> g_io_deliver */ static int g_eli_crypto_write_done(struct cryptop *crp) { struct g_eli_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct bio *bp, *cbp; if (crp->crp_etype == EAGAIN) { if (g_eli_crypto_rerun(crp) == 0) return (0); } bp = (struct bio *)crp->crp_opaque; bp->bio_inbed++; if (crp->crp_etype == 0) { G_ELI_DEBUG(3, "Crypto WRITE request done (%d/%d).", bp->bio_inbed, bp->bio_children); } else { G_ELI_DEBUG(1, "Crypto WRITE request failed (%d/%d) error=%d.", bp->bio_inbed, bp->bio_children, crp->crp_etype); if (bp->bio_error == 0) bp->bio_error = crp->crp_etype; } gp = bp->bio_to->geom; sc = gp->softc; g_eli_key_drop(sc, crp->crp_desc->crd_key); /* * All sectors are already encrypted? 
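 * Only the last crypto callback to arrive proceeds; it issues the single
 * write request below that carries all of the encrypted sectors.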
*/ if (bp->bio_inbed < bp->bio_children) return (0); bp->bio_inbed = 0; bp->bio_children = 1; cbp = bp->bio_driver1; bp->bio_driver1 = NULL; if (bp->bio_error != 0) { G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).", bp->bio_error); free(bp->bio_driver2, M_ELI); bp->bio_driver2 = NULL; g_destroy_bio(cbp); g_io_deliver(bp, bp->bio_error); atomic_subtract_int(&sc->sc_inflight, 1); return (0); } cbp->bio_data = bp->bio_driver2; cbp->bio_done = g_eli_write_done; cp = LIST_FIRST(&gp->consumer); cbp->bio_to = cp->provider; G_ELI_LOGREQ(2, cbp, "Sending request."); /* * Send encrypted data to the provider. */ g_io_request(cbp, cp); return (0); } /* * The function is called to read encrypted data. * * g_eli_start -> G_ELI_CRYPTO_READ -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver */ void g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp, boolean_t fromworker) { struct g_consumer *cp; struct bio *cbp; if (!fromworker) { /* * We are not called from the worker thread, so check if * device is suspended. */ mtx_lock(&sc->sc_queue_mtx); if (sc->sc_flags & G_ELI_FLAG_SUSPEND) { /* * If device is suspended, we place the request onto * the queue, so it can be handled after resume. */ G_ELI_DEBUG(0, "device suspended, move onto queue"); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); return; } atomic_add_int(&sc->sc_inflight, 1); mtx_unlock(&sc->sc_queue_mtx); } bp->bio_pflags = 0; bp->bio_driver2 = NULL; cbp = bp->bio_driver1; cbp->bio_done = g_eli_read_done; cp = LIST_FIRST(&sc->sc_geom->consumer); cbp->bio_to = cp->provider; G_ELI_LOGREQ(2, cbp, "Sending request."); /* * Read encrypted data from provider. */ g_io_request(cbp, cp); } /* * This is the main function responsible for cryptography (ie. communication * with crypto(9) subsystem). * * BIO_READ: * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> G_ELI_CRYPTO_RUN -> g_eli_crypto_read_done -> g_io_deliver * BIO_WRITE: * g_eli_start -> G_ELI_CRYPTO_RUN -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver */ void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp) { struct g_eli_softc *sc; struct cryptop *crp; struct cryptodesc *crd; u_int i, nsec, secsize; off_t dstoff; size_t size; u_char *p, *data; int error; G_ELI_LOGREQ(3, bp, "%s", __func__); bp->bio_pflags = wr->w_number; sc = wr->w_softc; secsize = LIST_FIRST(&sc->sc_geom->provider)->sectorsize; nsec = bp->bio_length / secsize; /* * Calculate how much memory do we need. * We need separate crypto operation for every single sector. * It is much faster to calculate total amount of needed memory here and * do the allocation once instead of allocating memory in pieces (many, * many pieces). */ size = sizeof(*crp) * nsec; size += sizeof(*crd) * nsec; /* * If we write the data we cannot destroy current bio_data content, * so we need to allocate more memory for encrypted data. 
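 * The total allocation is therefore nsec * (sizeof(struct cryptop) +
 * sizeof(struct cryptodesc)), plus bp->bio_length on writes for the
 * encrypted copy of the data.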
*/ if (bp->bio_cmd == BIO_WRITE) size += bp->bio_length; p = malloc(size, M_ELI, M_WAITOK); bp->bio_inbed = 0; bp->bio_children = nsec; bp->bio_driver2 = p; if (bp->bio_cmd == BIO_READ) data = bp->bio_data; else { data = p; p += bp->bio_length; bcopy(bp->bio_data, data, bp->bio_length); } for (i = 0, dstoff = bp->bio_offset; i < nsec; i++, dstoff += secsize) { crp = (struct cryptop *)p; p += sizeof(*crp); crd = (struct cryptodesc *)p; p += sizeof(*crd); crp->crp_sid = wr->w_sid; crp->crp_ilen = secsize; crp->crp_olen = secsize; crp->crp_opaque = (void *)bp; crp->crp_buf = (void *)data; data += secsize; if (bp->bio_cmd == BIO_WRITE) crp->crp_callback = g_eli_crypto_write_done; else /* if (bp->bio_cmd == BIO_READ) */ crp->crp_callback = g_eli_crypto_read_done; crp->crp_flags = CRYPTO_F_CBIFSYNC; if (g_eli_batch) crp->crp_flags |= CRYPTO_F_BATCH; crp->crp_desc = crd; crd->crd_skip = 0; crd->crd_len = secsize; crd->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT; if ((sc->sc_flags & G_ELI_FLAG_SINGLE_KEY) == 0) crd->crd_flags |= CRD_F_KEY_EXPLICIT; if (bp->bio_cmd == BIO_WRITE) crd->crd_flags |= CRD_F_ENCRYPT; crd->crd_alg = sc->sc_ealgo; crd->crd_key = g_eli_key_hold(sc, dstoff, secsize); crd->crd_klen = sc->sc_ekeylen; if (sc->sc_ealgo == CRYPTO_AES_XTS) crd->crd_klen <<= 1; g_eli_crypto_ivgen(sc, dstoff, crd->crd_iv, sizeof(crd->crd_iv)); crd->crd_next = NULL; crp->crp_etype = 0; error = crypto_dispatch(crp); KASSERT(error == 0, ("crypto_dispatch() failed (error=%d)", error)); } } Index: head/sys/geom/eli/pkcs5v2.c =================================================================== --- head/sys/geom/eli/pkcs5v2.c (revision 326269) +++ head/sys/geom/eli/pkcs5v2.c (revision 326270) @@ -1,129 +1,131 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
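/*
 * [Editor's illustration -- not part of revision 326270.]  g_eli_crypto_run()
 * above sizes a single malloc() to hold one (struct cryptop, struct
 * cryptodesc) pair per sector, plus a shadow copy of the data for writes,
 * and sets bp->bio_children = nsec so the per-sector callbacks can spot the
 * last completion via bio_inbed.  The stand-alone user-space sketch below
 * models only the allocation/carving arithmetic for the BIO_WRITE case;
 * toy_op/toy_desc are hypothetical stand-ins and the sizes are made up.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_desc { unsigned skip, len; };		/* stands in for cryptodesc */
struct toy_op { void *buf; struct toy_desc *desc; };	/* stands in for cryptop */

int
main(void)
{
	size_t length = 64 * 1024;	/* bio_length of the request */
	size_t secsize = 512;		/* provider sector size */
	size_t nsec = length / secsize;	/* one crypto op per sector */
	size_t size, i;
	char *base, *p, *data;

	/* One allocation: nsec ops + nsec descriptors + the write shadow. */
	size = nsec * (sizeof(struct toy_op) + sizeof(struct toy_desc));
	size += length;
	base = p = malloc(size);
	if (base == NULL)
		return (1);

	/* As in the kernel code, the shadow data area is carved off first. */
	data = p;
	p += length;

	for (i = 0; i < nsec; i++) {
		struct toy_op *op = (struct toy_op *)p;
		struct toy_desc *d;

		p += sizeof(*op);
		d = (struct toy_desc *)p;
		p += sizeof(*d);
		op->desc = d;
		op->buf = data + i * secsize;	/* this op's sector */
		d->skip = 0;
		d->len = (unsigned)secsize;
	}
	printf("%zu sectors -> one %zu-byte allocation\n", nsec, size);
	free(base);
	return (0);
}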
*/ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #else #include #include #include #endif #include #include static __inline void xor(uint8_t *dst, const uint8_t *src, size_t size) { for (; size > 0; size--) *dst++ ^= *src++; } void pkcs5v2_genkey(uint8_t *key, unsigned keylen, const uint8_t *salt, size_t saltsize, const char *passphrase, u_int iterations) { uint8_t md[SHA512_MDLEN], saltcount[saltsize + sizeof(uint32_t)]; uint8_t *counter, *keyp; u_int i, bsize, passlen; uint32_t count; struct hmac_ctx startpoint, ctx; passlen = strlen(passphrase); bzero(key, keylen); bcopy(salt, saltcount, saltsize); counter = saltcount + saltsize; keyp = key; for (count = 1; keylen > 0; count++, keylen -= bsize, keyp += bsize) { bsize = MIN(keylen, sizeof(md)); be32enc(counter, count); g_eli_crypto_hmac_init(&startpoint, passphrase, passlen); ctx = startpoint; g_eli_crypto_hmac_update(&ctx, saltcount, sizeof(saltcount)); g_eli_crypto_hmac_final(&ctx, md, sizeof(md)); xor(keyp, md, bsize); for (i = 1; i < iterations; i++) { ctx = startpoint; g_eli_crypto_hmac_update(&ctx, md, sizeof(md)); g_eli_crypto_hmac_final(&ctx, md, sizeof(md)); xor(keyp, md, bsize); } } explicit_bzero(&startpoint, sizeof(startpoint)); explicit_bzero(&ctx, sizeof(ctx)); } #ifndef _KERNEL #ifndef _STAND /* * Return the number of microseconds needed for 'iterations' iterations. */ static int pkcs5v2_probe(int iterations) { uint8_t key[G_ELI_USERKEYLEN], salt[G_ELI_SALTLEN]; uint8_t passphrase[] = "passphrase"; struct rusage start, end; int usecs; getrusage(RUSAGE_SELF, &start); pkcs5v2_genkey(key, sizeof(key), salt, sizeof(salt), passphrase, iterations); getrusage(RUSAGE_SELF, &end); usecs = end.ru_utime.tv_sec - start.ru_utime.tv_sec; usecs *= 1000000; usecs += end.ru_utime.tv_usec - start.ru_utime.tv_usec; return (usecs); } /* * Return the number of iterations which take 'usecs' microseconds. */ int pkcs5v2_calculate(int usecs) { int iterations, v; for (iterations = 1; ; iterations <<= 1) { v = pkcs5v2_probe(iterations); if (v > 2000000) break; } return (((intmax_t)iterations * (intmax_t)usecs) / v); } #endif /* !_STAND */ #endif /* !_KERNEL */ Index: head/sys/geom/eli/pkcs5v2.h =================================================================== --- head/sys/geom/eli/pkcs5v2.h (revision 326269) +++ head/sys/geom/eli/pkcs5v2.h (revision 326270) @@ -1,36 +1,38 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
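/*
 * [Editor's note -- illustration, not part of this commit.]  pkcs5v2_genkey()
 * above is PBKDF2 with HMAC/SHA-512 as the PRF (PKCS #5 v2.0, RFC 2898):
 * each output block is T_i = U_1 xor ... xor U_c, where U_1 = PRF(P, S ||
 * INT(i)) and U_j = PRF(P, U_{j-1}); copying 'startpoint' into 'ctx' merely
 * reuses the already-keyed HMAC state across iterations.  pkcs5v2_calculate()
 * doubles the iteration count until one probe costs more than two seconds of
 * user CPU time, then scales linearly to the requested duration.  Under that
 * reading, a user-space cross-check can be sketched against OpenSSL as below
 * (assumptions of this sketch: OpenSSL is available and linked with
 * -lcrypto; the salt and iteration count are arbitrary test values).
 */
#include <stdio.h>
#include <string.h>
#include <openssl/evp.h>

int
main(void)
{
	unsigned char key[64];	/* one SHA-512 output block */
	const unsigned char salt[] = { 's', 'a', 'l', 't' };
	const char *pass = "passphrase";
	size_t i;

	/* PBKDF2-HMAC-SHA512 -- the construction pkcs5v2_genkey() implements. */
	if (PKCS5_PBKDF2_HMAC(pass, (int)strlen(pass), salt, (int)sizeof(salt),
	    1000, EVP_sha512(), (int)sizeof(key), key) != 1)
		return (1);
	for (i = 0; i < sizeof(key); i++)
		printf("%02x", key[i]);
	printf("\n");
	return (0);
}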
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _PKCS5V2_H_ #define _PKCS5V2_H_ void pkcs5v2_genkey(uint8_t *key, unsigned keylen, const uint8_t *salt, size_t saltsize, const char *passphrase, u_int iterations); #ifndef _KERNEL int pkcs5v2_calculate(int usecs); #endif #endif /* !_PKCS5V2_H_ */ Index: head/sys/geom/gate/g_gate.c =================================================================== --- head/sys/geom/gate/g_gate.c (revision 326269) +++ head/sys/geom/gate/g_gate.c (revision 326270) @@ -1,964 +1,966 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * Copyright (c) 2009-2010 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Pawel Jakub Dawidek * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_gate, "GEOM Gate module"); static MALLOC_DEFINE(M_GATE, "gg_data", "GEOM Gate Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, gate, CTLFLAG_RW, 0, "GEOM_GATE configuration"); static int g_gate_debug = 0; SYSCTL_INT(_kern_geom_gate, OID_AUTO, debug, CTLFLAG_RWTUN, &g_gate_debug, 0, "Debug level"); static u_int g_gate_maxunits = 256; SYSCTL_UINT(_kern_geom_gate, OID_AUTO, maxunits, CTLFLAG_RDTUN, &g_gate_maxunits, 0, "Maximum number of ggate devices"); struct g_class g_gate_class = { .name = G_GATE_CLASS_NAME, .version = G_VERSION, }; static struct cdev *status_dev; static d_ioctl_t g_gate_ioctl; static struct cdevsw g_gate_cdevsw = { .d_version = D_VERSION, .d_ioctl = g_gate_ioctl, .d_name = G_GATE_CTL_NAME }; static struct g_gate_softc **g_gate_units; static u_int g_gate_nunits; static struct mtx g_gate_units_lock; static int g_gate_destroy(struct g_gate_softc *sc, boolean_t force) { struct bio_queue_head queue; struct g_provider *pp; struct g_consumer *cp; struct g_geom *gp; struct bio *bp; g_topology_assert(); mtx_assert(&g_gate_units_lock, MA_OWNED); pp = sc->sc_provider; if (!force && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { mtx_unlock(&g_gate_units_lock); return (EBUSY); } mtx_unlock(&g_gate_units_lock); mtx_lock(&sc->sc_queue_mtx); if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0) sc->sc_flags |= G_GATE_FLAG_DESTROY; wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); gp = pp->geom; g_wither_provider(pp, ENXIO); callout_drain(&sc->sc_callout); bioq_init(&queue); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&sc->sc_inqueue)) != NULL) { sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } while ((bp = bioq_takefirst(&sc->sc_outqueue)) != NULL) { sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } mtx_unlock(&sc->sc_queue_mtx); g_topology_unlock(); while ((bp = bioq_takefirst(&queue)) != NULL) { G_GATE_LOGREQ(1, bp, "Request canceled."); g_io_deliver(bp, ENXIO); } mtx_lock(&g_gate_units_lock); /* One reference is ours. */ sc->sc_ref--; while (sc->sc_ref > 0) msleep(&sc->sc_ref, &g_gate_units_lock, 0, "gg:destroy", 0); g_gate_units[sc->sc_unit] = NULL; KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?")); g_gate_nunits--; mtx_unlock(&g_gate_units_lock); mtx_destroy(&sc->sc_queue_mtx); g_topology_lock(); if ((cp = sc->sc_readcons) != NULL) { sc->sc_readcons = NULL; (void)g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); } G_GATE_DEBUG(1, "Device %s destroyed.", gp->name); gp->softc = NULL; g_wither_geom(gp, ENXIO); sc->sc_provider = NULL; free(sc, M_GATE); return (0); } static int g_gate_access(struct g_provider *pp, int dr, int dw, int de) { struct g_gate_softc *sc; if (dr <= 0 && dw <= 0 && de <= 0) return (0); sc = pp->geom->softc; if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) return (ENXIO); /* XXX: Hack to allow read-only mounts. 
*/ #if 0 if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0 && dw > 0) return (EPERM); #endif if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0 && dr > 0) return (EPERM); return (0); } static void g_gate_queue_io(struct bio *bp) { struct g_gate_softc *sc; sc = bp->bio_to->geom->softc; if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) { g_io_deliver(bp, ENXIO); return; } mtx_lock(&sc->sc_queue_mtx); if (sc->sc_queue_size > 0 && sc->sc_queue_count > sc->sc_queue_size) { mtx_unlock(&sc->sc_queue_mtx); G_GATE_LOGREQ(1, bp, "Queue full, request canceled."); g_io_deliver(bp, ENOMEM); return; } bp->bio_driver1 = (void *)sc->sc_seq; sc->sc_seq++; sc->sc_queue_count++; bioq_insert_tail(&sc->sc_inqueue, bp); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); } static void g_gate_done(struct bio *cbp) { struct bio *pbp; pbp = cbp->bio_parent; if (cbp->bio_error == 0) { pbp->bio_completed = cbp->bio_completed; g_destroy_bio(cbp); pbp->bio_inbed++; g_io_deliver(pbp, 0); } else { /* If direct read failed, pass it through userland daemon. */ g_destroy_bio(cbp); pbp->bio_children--; g_gate_queue_io(pbp); } } static void g_gate_start(struct bio *pbp) { struct g_gate_softc *sc; sc = pbp->bio_to->geom->softc; if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) { g_io_deliver(pbp, ENXIO); return; } G_GATE_LOGREQ(2, pbp, "Request received."); switch (pbp->bio_cmd) { case BIO_READ: if (sc->sc_readcons != NULL) { struct bio *cbp; cbp = g_clone_bio(pbp); if (cbp == NULL) { g_io_deliver(pbp, ENOMEM); return; } cbp->bio_done = g_gate_done; cbp->bio_offset = pbp->bio_offset + sc->sc_readoffset; cbp->bio_to = sc->sc_readcons->provider; g_io_request(cbp, sc->sc_readcons); return; } break; case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: /* XXX: Hack to allow read-only mounts. 
*/ if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) { g_io_deliver(pbp, EPERM); return; } break; case BIO_GETATTR: default: G_GATE_LOGREQ(2, pbp, "Ignoring request."); g_io_deliver(pbp, EOPNOTSUPP); return; } g_gate_queue_io(pbp); } static struct g_gate_softc * g_gate_hold(int unit, const char *name) { struct g_gate_softc *sc = NULL; mtx_lock(&g_gate_units_lock); if (unit >= 0 && unit < g_gate_maxunits) sc = g_gate_units[unit]; else if (unit == G_GATE_NAME_GIVEN) { KASSERT(name != NULL, ("name is NULL")); for (unit = 0; unit < g_gate_maxunits; unit++) { if (g_gate_units[unit] == NULL) continue; if (strcmp(name, g_gate_units[unit]->sc_provider->name) != 0) { continue; } sc = g_gate_units[unit]; break; } } if (sc != NULL) sc->sc_ref++; mtx_unlock(&g_gate_units_lock); return (sc); } static void g_gate_release(struct g_gate_softc *sc) { g_topology_assert_not(); mtx_lock(&g_gate_units_lock); sc->sc_ref--; KASSERT(sc->sc_ref >= 0, ("Negative sc_ref for %s.", sc->sc_name)); if (sc->sc_ref == 0 && (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) wakeup(&sc->sc_ref); mtx_unlock(&g_gate_units_lock); } static int g_gate_getunit(int unit, int *errorp) { mtx_assert(&g_gate_units_lock, MA_OWNED); if (unit >= 0) { if (unit >= g_gate_maxunits) *errorp = EINVAL; else if (g_gate_units[unit] == NULL) return (unit); else *errorp = EEXIST; } else { for (unit = 0; unit < g_gate_maxunits; unit++) { if (g_gate_units[unit] == NULL) return (unit); } *errorp = ENFILE; } return (-1); } static void g_gate_guard(void *arg) { struct bio_queue_head queue; struct g_gate_softc *sc; struct bintime curtime; struct bio *bp, *bp2; sc = arg; binuptime(&curtime); g_gate_hold(sc->sc_unit, NULL); bioq_init(&queue); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_inqueue.queue, bio_queue, bp2) { if (curtime.sec - bp->bio_t0.sec < 5) continue; bioq_remove(&sc->sc_inqueue, bp); sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } TAILQ_FOREACH_SAFE(bp, &sc->sc_outqueue.queue, bio_queue, bp2) { if (curtime.sec - bp->bio_t0.sec < 5) continue; bioq_remove(&sc->sc_outqueue, bp); sc->sc_queue_count--; bioq_insert_tail(&queue, bp); } mtx_unlock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&queue)) != NULL) { G_GATE_LOGREQ(1, bp, "Request timeout."); g_io_deliver(bp, EIO); } if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0) { callout_reset(&sc->sc_callout, sc->sc_timeout * hz, g_gate_guard, sc); } g_gate_release(sc); } static void g_gate_orphan(struct g_consumer *cp) { struct g_gate_softc *sc; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; KASSERT(cp == sc->sc_readcons, ("cp=%p sc_readcons=%p", cp, sc->sc_readcons)); sc->sc_readcons = NULL; G_GATE_DEBUG(1, "Destroying read consumer on provider %s orphan.", cp->provider->name); (void)g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); } static void g_gate_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_gate_softc *sc; sc = gp->softc; if (sc == NULL || pp != NULL || cp != NULL) return; sc = g_gate_hold(sc->sc_unit, NULL); if (sc == NULL) return; if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) { sbuf_printf(sb, "%s%s\n", indent, "read-only"); } else if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0) { sbuf_printf(sb, "%s%s\n", indent, "write-only"); } else { sbuf_printf(sb, "%s%s\n", indent, "read-write"); } if (sc->sc_readcons != NULL) { sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)sc->sc_readoffset); sbuf_printf(sb, "%s%s\n", indent, 
sc->sc_readcons->provider->name); } sbuf_printf(sb, "%s%u\n", indent, sc->sc_timeout); sbuf_printf(sb, "%s%s\n", indent, sc->sc_info); sbuf_printf(sb, "%s%u\n", indent, sc->sc_queue_count); sbuf_printf(sb, "%s%u\n", indent, sc->sc_queue_size); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ref); sbuf_printf(sb, "%s%d\n", indent, sc->sc_unit); g_topology_unlock(); g_gate_release(sc); g_topology_lock(); } static int g_gate_create(struct g_gate_ctl_create *ggio) { struct g_gate_softc *sc; struct g_geom *gp; struct g_provider *pp, *ropp; struct g_consumer *cp; char name[NAME_MAX]; int error = 0, unit; if (ggio->gctl_mediasize <= 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } if (ggio->gctl_sectorsize <= 0) { G_GATE_DEBUG(1, "Invalid sector size."); return (EINVAL); } if (!powerof2(ggio->gctl_sectorsize)) { G_GATE_DEBUG(1, "Invalid sector size."); return (EINVAL); } if ((ggio->gctl_mediasize % ggio->gctl_sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } if ((ggio->gctl_flags & G_GATE_FLAG_READONLY) != 0 && (ggio->gctl_flags & G_GATE_FLAG_WRITEONLY) != 0) { G_GATE_DEBUG(1, "Invalid flags."); return (EINVAL); } if (ggio->gctl_unit != G_GATE_UNIT_AUTO && ggio->gctl_unit != G_GATE_NAME_GIVEN && ggio->gctl_unit < 0) { G_GATE_DEBUG(1, "Invalid unit number."); return (EINVAL); } if (ggio->gctl_unit == G_GATE_NAME_GIVEN && ggio->gctl_name[0] == '\0') { G_GATE_DEBUG(1, "No device name."); return (EINVAL); } sc = malloc(sizeof(*sc), M_GATE, M_WAITOK | M_ZERO); sc->sc_flags = (ggio->gctl_flags & G_GATE_USERFLAGS); strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info)); sc->sc_seq = 1; bioq_init(&sc->sc_inqueue); bioq_init(&sc->sc_outqueue); mtx_init(&sc->sc_queue_mtx, "gg:queue", NULL, MTX_DEF); sc->sc_queue_count = 0; sc->sc_queue_size = ggio->gctl_maxcount; if (sc->sc_queue_size > G_GATE_MAX_QUEUE_SIZE) sc->sc_queue_size = G_GATE_MAX_QUEUE_SIZE; sc->sc_timeout = ggio->gctl_timeout; callout_init(&sc->sc_callout, 1); mtx_lock(&g_gate_units_lock); sc->sc_unit = g_gate_getunit(ggio->gctl_unit, &error); if (sc->sc_unit < 0) goto fail1; if (ggio->gctl_unit == G_GATE_NAME_GIVEN) snprintf(name, sizeof(name), "%s", ggio->gctl_name); else { snprintf(name, sizeof(name), "%s%d", G_GATE_PROVIDER_NAME, sc->sc_unit); } /* Check for name collision. 
*/ for (unit = 0; unit < g_gate_maxunits; unit++) { if (g_gate_units[unit] == NULL) continue; if (strcmp(name, g_gate_units[unit]->sc_name) != 0) continue; error = EEXIST; goto fail1; } sc->sc_name = name; g_gate_units[sc->sc_unit] = sc; g_gate_nunits++; mtx_unlock(&g_gate_units_lock); g_topology_lock(); if (ggio->gctl_readprov[0] == '\0') { ropp = NULL; } else { ropp = g_provider_by_name(ggio->gctl_readprov); if (ropp == NULL) { G_GATE_DEBUG(1, "Provider %s doesn't exist.", ggio->gctl_readprov); error = EINVAL; goto fail2; } if ((ggio->gctl_readoffset % ggio->gctl_sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid read offset."); error = EINVAL; goto fail2; } if (ggio->gctl_mediasize + ggio->gctl_readoffset > ropp->mediasize) { G_GATE_DEBUG(1, "Invalid read offset or media size."); error = EINVAL; goto fail2; } } gp = g_new_geomf(&g_gate_class, "%s", name); gp->start = g_gate_start; gp->access = g_gate_access; gp->orphan = g_gate_orphan; gp->dumpconf = g_gate_dumpconf; gp->softc = sc; if (ropp != NULL) { cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, ropp); if (error != 0) { G_GATE_DEBUG(1, "Unable to attach to %s.", ropp->name); goto fail3; } error = g_access(cp, 1, 0, 0); if (error != 0) { G_GATE_DEBUG(1, "Unable to access %s.", ropp->name); g_detach(cp); goto fail3; } sc->sc_readcons = cp; sc->sc_readoffset = ggio->gctl_readoffset; } ggio->gctl_unit = sc->sc_unit; pp = g_new_providerf(gp, "%s", name); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; pp->mediasize = ggio->gctl_mediasize; pp->sectorsize = ggio->gctl_sectorsize; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); mtx_lock(&g_gate_units_lock); sc->sc_name = sc->sc_provider->name; mtx_unlock(&g_gate_units_lock); G_GATE_DEBUG(1, "Device %s created.", gp->name); if (sc->sc_timeout > 0) { callout_reset(&sc->sc_callout, sc->sc_timeout * hz, g_gate_guard, sc); } return (0); fail3: g_destroy_consumer(cp); g_destroy_geom(gp); fail2: g_topology_unlock(); mtx_lock(&g_gate_units_lock); g_gate_units[sc->sc_unit] = NULL; KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?")); g_gate_nunits--; fail1: mtx_unlock(&g_gate_units_lock); mtx_destroy(&sc->sc_queue_mtx); free(sc, M_GATE); return (error); } static int g_gate_modify(struct g_gate_softc *sc, struct g_gate_ctl_modify *ggio) { struct g_provider *pp; struct g_consumer *cp; int error; if ((ggio->gctl_modify & GG_MODIFY_MEDIASIZE) != 0) { if (ggio->gctl_mediasize <= 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } pp = sc->sc_provider; if ((ggio->gctl_mediasize % pp->sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid media size."); return (EINVAL); } /* TODO */ return (EOPNOTSUPP); } if ((ggio->gctl_modify & GG_MODIFY_INFO) != 0) (void)strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info)); cp = NULL; if ((ggio->gctl_modify & GG_MODIFY_READPROV) != 0) { g_topology_lock(); if (sc->sc_readcons != NULL) { cp = sc->sc_readcons; sc->sc_readcons = NULL; (void)g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); } if (ggio->gctl_readprov[0] != '\0') { pp = g_provider_by_name(ggio->gctl_readprov); if (pp == NULL) { g_topology_unlock(); G_GATE_DEBUG(1, "Provider %s doesn't exist.", ggio->gctl_readprov); return (EINVAL); } cp = g_new_consumer(sc->sc_provider->geom); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { G_GATE_DEBUG(1, "Unable to attach to %s.", pp->name); } else { error = g_access(cp, 1, 0, 0); if (error != 0) { G_GATE_DEBUG(1, "Unable to access 
%s.", pp->name); g_detach(cp); } } if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } } } else { cp = sc->sc_readcons; } if ((ggio->gctl_modify & GG_MODIFY_READOFFSET) != 0) { if (cp == NULL) { G_GATE_DEBUG(1, "No read provider."); return (EINVAL); } pp = sc->sc_provider; if ((ggio->gctl_readoffset % pp->sectorsize) != 0) { G_GATE_DEBUG(1, "Invalid read offset."); return (EINVAL); } if (pp->mediasize + ggio->gctl_readoffset > cp->provider->mediasize) { G_GATE_DEBUG(1, "Invalid read offset or media size."); return (EINVAL); } sc->sc_readoffset = ggio->gctl_readoffset; } if ((ggio->gctl_modify & GG_MODIFY_READPROV) != 0) { sc->sc_readcons = cp; g_topology_unlock(); } return (0); } #define G_GATE_CHECK_VERSION(ggio) do { \ if ((ggio)->gctl_version != G_GATE_VERSION) { \ printf("Version mismatch %d != %d.\n", \ ggio->gctl_version, G_GATE_VERSION); \ return (EINVAL); \ } \ } while (0) static int g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct g_gate_softc *sc; struct bio *bp; int error = 0; G_GATE_DEBUG(4, "ioctl(%s, %lx, %p, %x, %p)", devtoname(dev), cmd, addr, flags, td); switch (cmd) { case G_GATE_CMD_CREATE: { struct g_gate_ctl_create *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); error = g_gate_create(ggio); /* * Reset TDP_GEOM flag. * There are pending events for sure, because we just created * new provider and other classes want to taste it, but we * cannot answer on I/O requests until we're here. */ td->td_pflags &= ~TDP_GEOM; return (error); } case G_GATE_CMD_MODIFY: { struct g_gate_ctl_modify *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, NULL); if (sc == NULL) return (ENXIO); error = g_gate_modify(sc, ggio); g_gate_release(sc); return (error); } case G_GATE_CMD_DESTROY: { struct g_gate_ctl_destroy *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name); if (sc == NULL) return (ENXIO); g_topology_lock(); mtx_lock(&g_gate_units_lock); error = g_gate_destroy(sc, ggio->gctl_force); g_topology_unlock(); if (error != 0) g_gate_release(sc); return (error); } case G_GATE_CMD_CANCEL: { struct g_gate_ctl_cancel *ggio = (void *)addr; struct bio *tbp, *lbp; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name); if (sc == NULL) return (ENXIO); lbp = NULL; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_outqueue.queue, bio_queue, tbp) { if (ggio->gctl_seq == 0 || ggio->gctl_seq == (uintptr_t)bp->bio_driver1) { G_GATE_LOGREQ(1, bp, "Request canceled."); bioq_remove(&sc->sc_outqueue, bp); /* * Be sure to put requests back onto incoming * queue in the proper order. */ if (lbp == NULL) bioq_insert_head(&sc->sc_inqueue, bp); else { TAILQ_INSERT_AFTER(&sc->sc_inqueue.queue, lbp, bp, bio_queue); } lbp = bp; /* * If only one request was canceled, leave now. 
*/ if (ggio->gctl_seq != 0) break; } } if (ggio->gctl_unit == G_GATE_NAME_GIVEN) ggio->gctl_unit = sc->sc_unit; mtx_unlock(&sc->sc_queue_mtx); g_gate_release(sc); return (error); } case G_GATE_CMD_START: { struct g_gate_ctl_io *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, NULL); if (sc == NULL) return (ENXIO); error = 0; for (;;) { mtx_lock(&sc->sc_queue_mtx); bp = bioq_first(&sc->sc_inqueue); if (bp != NULL) break; if ((sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) { ggio->gctl_error = ECANCELED; mtx_unlock(&sc->sc_queue_mtx); goto start_end; } if (msleep(sc, &sc->sc_queue_mtx, PPAUSE | PDROP | PCATCH, "ggwait", 0) != 0) { ggio->gctl_error = ECANCELED; goto start_end; } } ggio->gctl_cmd = bp->bio_cmd; if (bp->bio_cmd == BIO_WRITE && bp->bio_length > ggio->gctl_length) { mtx_unlock(&sc->sc_queue_mtx); ggio->gctl_length = bp->bio_length; ggio->gctl_error = ENOMEM; goto start_end; } bioq_remove(&sc->sc_inqueue, bp); bioq_insert_tail(&sc->sc_outqueue, bp); mtx_unlock(&sc->sc_queue_mtx); ggio->gctl_seq = (uintptr_t)bp->bio_driver1; ggio->gctl_offset = bp->bio_offset; ggio->gctl_length = bp->bio_length; switch (bp->bio_cmd) { case BIO_READ: case BIO_DELETE: case BIO_FLUSH: break; case BIO_WRITE: error = copyout(bp->bio_data, ggio->gctl_data, bp->bio_length); if (error != 0) { mtx_lock(&sc->sc_queue_mtx); bioq_remove(&sc->sc_outqueue, bp); bioq_insert_head(&sc->sc_inqueue, bp); mtx_unlock(&sc->sc_queue_mtx); goto start_end; } break; } start_end: g_gate_release(sc); return (error); } case G_GATE_CMD_DONE: { struct g_gate_ctl_io *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); sc = g_gate_hold(ggio->gctl_unit, NULL); if (sc == NULL) return (ENOENT); error = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_outqueue.queue, bio_queue) { if (ggio->gctl_seq == (uintptr_t)bp->bio_driver1) break; } if (bp != NULL) { bioq_remove(&sc->sc_outqueue, bp); sc->sc_queue_count--; } mtx_unlock(&sc->sc_queue_mtx); if (bp == NULL) { /* * Request was probably canceled. 
*/ goto done_end; } if (ggio->gctl_error == EAGAIN) { bp->bio_error = 0; G_GATE_LOGREQ(1, bp, "Request desisted."); mtx_lock(&sc->sc_queue_mtx); sc->sc_queue_count++; bioq_insert_head(&sc->sc_inqueue, bp); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); } else { bp->bio_error = ggio->gctl_error; if (bp->bio_error == 0) { bp->bio_completed = bp->bio_length; switch (bp->bio_cmd) { case BIO_READ: error = copyin(ggio->gctl_data, bp->bio_data, bp->bio_length); if (error != 0) bp->bio_error = error; break; case BIO_DELETE: case BIO_WRITE: case BIO_FLUSH: break; } } G_GATE_LOGREQ(2, bp, "Request done."); g_io_deliver(bp, bp->bio_error); } done_end: g_gate_release(sc); return (error); } } return (ENOIOCTL); } static void g_gate_device(void) { status_dev = make_dev(&g_gate_cdevsw, 0x0, UID_ROOT, GID_WHEEL, 0600, G_GATE_CTL_NAME); } static int g_gate_modevent(module_t mod, int type, void *data) { int error = 0; switch (type) { case MOD_LOAD: mtx_init(&g_gate_units_lock, "gg_units_lock", NULL, MTX_DEF); g_gate_units = malloc(g_gate_maxunits * sizeof(g_gate_units[0]), M_GATE, M_WAITOK | M_ZERO); g_gate_nunits = 0; g_gate_device(); break; case MOD_UNLOAD: mtx_lock(&g_gate_units_lock); if (g_gate_nunits > 0) { mtx_unlock(&g_gate_units_lock); error = EBUSY; break; } mtx_unlock(&g_gate_units_lock); mtx_destroy(&g_gate_units_lock); if (status_dev != NULL) destroy_dev(status_dev); free(g_gate_units, M_GATE); break; default: return (EOPNOTSUPP); break; } return (error); } static moduledata_t g_gate_module = { G_GATE_MOD_NAME, g_gate_modevent, NULL }; DECLARE_MODULE(geom_gate, g_gate_module, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); DECLARE_GEOM_CLASS(g_gate_class, g_gate); Index: head/sys/geom/gate/g_gate.h =================================================================== --- head/sys/geom/gate/g_gate.h (revision 326269) +++ head/sys/geom/gate/g_gate.h (revision 326270) @@ -1,180 +1,182 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2009 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _G_GATE_H_ #define _G_GATE_H_ #include #include #include #include #include #define G_GATE_CLASS_NAME "GATE" #define G_GATE_PROVIDER_NAME "ggate" #define G_GATE_MOD_NAME "ggate" #define G_GATE_CTL_NAME "ggctl" #define G_GATE_VERSION 3 /* * Maximum number of requests that can be stored in * the queue when there are no workers. */ #define G_GATE_MAX_QUEUE_SIZE 4096 #define G_GATE_FLAG_READONLY 0x0001 #define G_GATE_FLAG_WRITEONLY 0x0002 #define G_GATE_FLAG_DESTROY 0x1000 #define G_GATE_USERFLAGS (G_GATE_FLAG_READONLY | G_GATE_FLAG_WRITEONLY) /* * Pick unit number automatically in /dev/ggate. */ #define G_GATE_UNIT_AUTO (-1) /* * Full provider name is given, so don't use ggate. */ #define G_GATE_NAME_GIVEN (-2) #define G_GATE_CMD_CREATE _IOWR('m', 0, struct g_gate_ctl_create) #define G_GATE_CMD_MODIFY _IOWR('m', 1, struct g_gate_ctl_modify) #define G_GATE_CMD_DESTROY _IOWR('m', 2, struct g_gate_ctl_destroy) #define G_GATE_CMD_CANCEL _IOWR('m', 3, struct g_gate_ctl_cancel) #define G_GATE_CMD_START _IOWR('m', 4, struct g_gate_ctl_io) #define G_GATE_CMD_DONE _IOWR('m', 5, struct g_gate_ctl_io) #define G_GATE_INFOSIZE 2048 #ifdef _KERNEL /* * 'P:' means 'Protected by'. */ struct g_gate_softc { char *sc_name; /* P: (read-only) */ int sc_unit; /* P: (read-only) */ int sc_ref; /* P: g_gate_list_mtx */ struct g_provider *sc_provider; /* P: (read-only) */ uint32_t sc_flags; /* P: sc_queue_mtx */ struct bio_queue_head sc_inqueue; /* P: sc_queue_mtx */ struct bio_queue_head sc_outqueue; /* P: sc_queue_mtx */ struct mtx sc_queue_mtx; uint32_t sc_queue_count; /* P: sc_queue_mtx */ uint32_t sc_queue_size; /* P: (read-only) */ u_int sc_timeout; /* P: (read-only) */ struct g_consumer *sc_readcons; /* P: XXX */ off_t sc_readoffset; /* P: XXX */ struct callout sc_callout; /* P: (modified only from callout thread) */ uintptr_t sc_seq; /* P: (modified only from g_down thread) */ LIST_ENTRY(g_gate_softc) sc_next; /* P: g_gate_list_mtx */ char sc_info[G_GATE_INFOSIZE]; /* P: (read-only) */ }; #define G_GATE_DEBUG(lvl, ...) do { \ if (g_gate_debug >= (lvl)) { \ printf("GEOM_GATE"); \ if (g_gate_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_GATE_LOGREQ(lvl, bp, ...)
do { \ if (g_gate_debug >= (lvl)) { \ printf("GEOM_GATE"); \ if (g_gate_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) #endif /* !_KERNEL */ struct g_gate_ctl_create { u_int gctl_version; off_t gctl_mediasize; u_int gctl_sectorsize; u_int gctl_flags; u_int gctl_maxcount; u_int gctl_timeout; char gctl_name[NAME_MAX]; char gctl_info[G_GATE_INFOSIZE]; char gctl_readprov[NAME_MAX]; off_t gctl_readoffset; int gctl_unit; /* in/out */ }; #define GG_MODIFY_MEDIASIZE 0x01 #define GG_MODIFY_INFO 0x02 #define GG_MODIFY_READPROV 0x04 #define GG_MODIFY_READOFFSET 0x08 struct g_gate_ctl_modify { u_int gctl_version; int gctl_unit; uint32_t gctl_modify; off_t gctl_mediasize; char gctl_info[G_GATE_INFOSIZE]; char gctl_readprov[NAME_MAX]; off_t gctl_readoffset; }; struct g_gate_ctl_destroy { u_int gctl_version; int gctl_unit; int gctl_force; char gctl_name[NAME_MAX]; }; struct g_gate_ctl_cancel { u_int gctl_version; int gctl_unit; uintptr_t gctl_seq; char gctl_name[NAME_MAX]; }; struct g_gate_ctl_io { u_int gctl_version; int gctl_unit; uintptr_t gctl_seq; u_int gctl_cmd; off_t gctl_offset; off_t gctl_length; void *gctl_data; int gctl_error; }; #endif /* !_G_GATE_H_ */ Index: head/sys/geom/geom.h =================================================================== --- head/sys/geom/geom.h (revision 326269) +++ head/sys/geom/geom.h (revision 326270) @@ -1,429 +1,431 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
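/*
 * [Editor's illustration -- not part of revision 326270.]  The structures
 * above are the entire kernel<->userland contract: a daemon creates a unit
 * with G_GATE_CMD_CREATE, then loops fetching requests with G_GATE_CMD_START
 * and completing them with G_GATE_CMD_DONE, which is the shape of the
 * ggatel/ggated tools.  Below is a minimal read-only server over a flat
 * file, sketched under assumptions: the backing path /tmp/disk.img and the
 * sizes are hypothetical, and error handling (including growing the buffer
 * on a gctl_error == ENOMEM reply from START) is trimmed.
 */
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/ioctl.h>
#include <geom/gate/g_gate.h>
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct g_gate_ctl_create ggc;
	struct g_gate_ctl_io ggio;
	static char buf[65536];
	int ctl, img;

	ctl = open("/dev/" G_GATE_CTL_NAME, O_RDWR);
	img = open("/tmp/disk.img", O_RDONLY);	/* hypothetical backing file */
	if (ctl == -1 || img == -1)
		return (1);

	memset(&ggc, 0, sizeof(ggc));
	ggc.gctl_version = G_GATE_VERSION;
	ggc.gctl_mediasize = 1024 * 1024;	/* must be a sectorsize multiple */
	ggc.gctl_sectorsize = 512;
	ggc.gctl_flags = G_GATE_FLAG_READONLY;
	ggc.gctl_maxcount = 64;
	ggc.gctl_timeout = 0;
	ggc.gctl_unit = G_GATE_UNIT_AUTO;
	if (ioctl(ctl, G_GATE_CMD_CREATE, &ggc) == -1)
		return (1);
	/* The provider now exists as /dev/ggate<gctl_unit>. */

	for (;;) {
		memset(&ggio, 0, sizeof(ggio));
		ggio.gctl_version = G_GATE_VERSION;
		ggio.gctl_unit = ggc.gctl_unit;
		ggio.gctl_data = buf;
		ggio.gctl_length = sizeof(buf);
		if (ioctl(ctl, G_GATE_CMD_START, &ggio) == -1)
			break;
		if (ggio.gctl_error == ECANCELED)
			break;		/* the unit is being destroyed */
		if (ggio.gctl_cmd == BIO_READ) {
			if (pread(img, buf, (size_t)ggio.gctl_length,
			    ggio.gctl_offset) == -1)
				ggio.gctl_error = errno;
		} else
			ggio.gctl_error = EOPNOTSUPP;
		if (ioctl(ctl, G_GATE_CMD_DONE, &ggio) == -1)
			break;
	}
	return (0);
}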
* * $FreeBSD$ */ #ifndef _GEOM_GEOM_H_ #define _GEOM_GEOM_H_ #include #include #include #include #include #include #include struct g_class; struct g_geom; struct g_consumer; struct g_provider; struct g_stat; struct thread; struct bio; struct sbuf; struct gctl_req; struct g_configargs; struct disk_zone_args; typedef int g_config_t (struct g_configargs *ca); typedef void g_ctl_req_t (struct gctl_req *, struct g_class *cp, char const *verb); typedef int g_ctl_create_geom_t (struct gctl_req *, struct g_class *cp, struct g_provider *pp); typedef int g_ctl_destroy_geom_t (struct gctl_req *, struct g_class *cp, struct g_geom *gp); typedef int g_ctl_config_geom_t (struct gctl_req *, struct g_geom *gp, const char *verb); typedef void g_init_t (struct g_class *mp); typedef void g_fini_t (struct g_class *mp); typedef struct g_geom * g_taste_t (struct g_class *, struct g_provider *, int flags); typedef int g_ioctl_t(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td); #define G_TF_NORMAL 0 #define G_TF_INSIST 1 #define G_TF_TRANSPARENT 2 typedef int g_access_t (struct g_provider *, int, int, int); /* XXX: not sure about the thread arg */ typedef void g_orphan_t (struct g_consumer *); typedef void g_start_t (struct bio *); typedef void g_spoiled_t (struct g_consumer *); typedef void g_attrchanged_t (struct g_consumer *, const char *attr); typedef void g_provgone_t (struct g_provider *); typedef void g_dumpconf_t (struct sbuf *, const char *indent, struct g_geom *, struct g_consumer *, struct g_provider *); typedef void g_resize_t(struct g_consumer *cp); /* * The g_class structure describes a transformation class. In other words * all BSD disklabel handlers share one g_class, all MBR handlers share * one common g_class and so on. * Certain operations are instantiated on the class, most notably the * taste and config_geom functions. */ struct g_class { const char *name; u_int version; u_int spare0; g_taste_t *taste; g_config_t *config; g_ctl_req_t *ctlreq; g_init_t *init; g_fini_t *fini; g_ctl_destroy_geom_t *destroy_geom; /* * Default values for geom methods */ g_start_t *start; g_spoiled_t *spoiled; g_attrchanged_t *attrchanged; g_dumpconf_t *dumpconf; g_access_t *access; g_orphan_t *orphan; g_ioctl_t *ioctl; g_provgone_t *providergone; g_resize_t *resize; void *spare1; void *spare2; /* * The remaining elements are private */ LIST_ENTRY(g_class) class; LIST_HEAD(,g_geom) geom; }; /* * The g_geom_alias is a list node for aliases for the geom name * for device node creation. */ struct g_geom_alias { LIST_ENTRY(g_geom_alias) ga_next; const char *ga_alias; }; #define G_VERSION_00 0x19950323 #define G_VERSION_01 0x20041207 /* add fflag to g_ioctl_t */ #define G_VERSION G_VERSION_01 /* * The g_geom is an instance of a g_class. */ struct g_geom { char *name; struct g_class *class; LIST_ENTRY(g_geom) geom; LIST_HEAD(,g_consumer) consumer; LIST_HEAD(,g_provider) provider; TAILQ_ENTRY(g_geom) geoms; /* XXX: better name */ int rank; g_start_t *start; g_spoiled_t *spoiled; g_attrchanged_t *attrchanged; g_dumpconf_t *dumpconf; g_access_t *access; g_orphan_t *orphan; g_ioctl_t *ioctl; g_provgone_t *providergone; g_resize_t *resize; void *spare0; void *spare1; void *softc; unsigned flags; #define G_GEOM_WITHER 1 #define G_GEOM_VOLATILE_BIO 2 LIST_HEAD(,g_geom_alias) aliases; }; /* * The g_bioq is a queue of struct bio's. * XXX: possibly collection point for statistics. * XXX: should (possibly) be collapsed with sys/bio.h::bio_queue_head. 
*/ struct g_bioq { TAILQ_HEAD(, bio) bio_queue; struct mtx bio_queue_lock; int bio_queue_length; }; /* * A g_consumer is an attachment point for a g_provider. One g_consumer * can only be attached to one g_provider, but multiple g_consumers * can be attached to one g_provider. */ struct g_consumer { struct g_geom *geom; LIST_ENTRY(g_consumer) consumer; struct g_provider *provider; LIST_ENTRY(g_consumer) consumers; /* XXX: better name */ int acr, acw, ace; int flags; #define G_CF_SPOILED 0x1 #define G_CF_ORPHAN 0x4 #define G_CF_DIRECT_SEND 0x10 #define G_CF_DIRECT_RECEIVE 0x20 struct devstat *stat; u_int nstart, nend; /* Two fields for the implementing class to use */ void *private; u_int index; }; /* * A g_provider is a "logical disk". */ struct g_provider { char *name; LIST_ENTRY(g_provider) provider; struct g_geom *geom; LIST_HEAD(,g_consumer) consumers; int acr, acw, ace; int error; TAILQ_ENTRY(g_provider) orphan; off_t mediasize; u_int sectorsize; u_int stripesize; u_int stripeoffset; struct devstat *stat; u_int nstart, nend; u_int flags; #define G_PF_WITHER 0x2 #define G_PF_ORPHAN 0x4 #define G_PF_ACCEPT_UNMAPPED 0x8 #define G_PF_DIRECT_SEND 0x10 #define G_PF_DIRECT_RECEIVE 0x20 /* Two fields for the implementing class to use */ void *private; u_int index; }; /* * Descriptor of a classifier. We can register a function and * an argument, which is called by g_io_request() on bio's * that are not previously classified. */ struct g_classifier_hook { TAILQ_ENTRY(g_classifier_hook) link; int (*func)(void *arg, struct bio *bp); void *arg; }; /* BIO_GETATTR("GEOM::setstate") argument values. */ #define G_STATE_FAILED 0 #define G_STATE_REBUILD 1 #define G_STATE_RESYNC 2 #define G_STATE_ACTIVE 3 /* geom_dev.c */ struct cdev; void g_dev_print(void); void g_dev_physpath_changed(void); struct g_provider *g_dev_getprovider(struct cdev *dev); /* geom_dump.c */ void g_trace(int level, const char *, ...); # define G_T_TOPOLOGY 1 # define G_T_BIO 2 # define G_T_ACCESS 4 /* geom_event.c */ typedef void g_event_t(void *, int flag); #define EV_CANCEL 1 int g_post_event(g_event_t *func, void *arg, int flag, ...); int g_waitfor_event(g_event_t *func, void *arg, int flag, ...); void g_cancel_event(void *ref); int g_attr_changed(struct g_provider *pp, const char *attr, int flag); int g_media_changed(struct g_provider *pp, int flag); int g_media_gone(struct g_provider *pp, int flag); void g_orphan_provider(struct g_provider *pp, int error); void g_waitidlelock(void); /* geom_subr.c */ int g_access(struct g_consumer *cp, int nread, int nwrite, int nexcl); int g_attach(struct g_consumer *cp, struct g_provider *pp); int g_compare_names(const char *namea, const char *nameb); void g_destroy_consumer(struct g_consumer *cp); void g_destroy_geom(struct g_geom *pp); void g_destroy_provider(struct g_provider *pp); void g_detach(struct g_consumer *cp); void g_error_provider(struct g_provider *pp, int error); struct g_provider *g_provider_by_name(char const *arg); void g_geom_add_alias(struct g_geom *gp, const char *alias); int g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len); #define g_getattr(a, c, v) g_getattr__((a), (c), (v), sizeof *(v)) int g_handleattr(struct bio *bp, const char *attribute, const void *val, int len); int g_handleattr_int(struct bio *bp, const char *attribute, int val); int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val); int g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val); int g_handleattr_str(struct bio *bp, const char 
*attribute, const char *str); struct g_consumer * g_new_consumer(struct g_geom *gp); struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...) __printflike(2, 3); struct g_provider * g_new_providerf(struct g_geom *gp, const char *fmt, ...) __printflike(2, 3); void g_resize_provider(struct g_provider *pp, off_t size); int g_retaste(struct g_class *mp); void g_spoil(struct g_provider *pp, struct g_consumer *cp); int g_std_access(struct g_provider *pp, int dr, int dw, int de); void g_std_done(struct bio *bp); void g_std_spoiled(struct g_consumer *cp); void g_wither_geom(struct g_geom *gp, int error); void g_wither_geom_close(struct g_geom *gp, int error); void g_wither_provider(struct g_provider *pp, int error); #if defined(DIAGNOSTIC) || defined(DDB) int g_valid_obj(void const *ptr); #endif #ifdef DIAGNOSTIC #define G_VALID_CLASS(foo) \ KASSERT(g_valid_obj(foo) == 1, ("%p is not a g_class", foo)) #define G_VALID_GEOM(foo) \ KASSERT(g_valid_obj(foo) == 2, ("%p is not a g_geom", foo)) #define G_VALID_CONSUMER(foo) \ KASSERT(g_valid_obj(foo) == 3, ("%p is not a g_consumer", foo)) #define G_VALID_PROVIDER(foo) \ KASSERT(g_valid_obj(foo) == 4, ("%p is not a g_provider", foo)) #else #define G_VALID_CLASS(foo) do { } while (0) #define G_VALID_GEOM(foo) do { } while (0) #define G_VALID_CONSUMER(foo) do { } while (0) #define G_VALID_PROVIDER(foo) do { } while (0) #endif int g_modevent(module_t, int, void *); /* geom_io.c */ struct bio * g_clone_bio(struct bio *); struct bio * g_duplicate_bio(struct bio *); void g_destroy_bio(struct bio *); void g_io_deliver(struct bio *bp, int error); int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr); int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp); int g_io_flush(struct g_consumer *cp); int g_register_classifier(struct g_classifier_hook *hook); void g_unregister_classifier(struct g_classifier_hook *hook); void g_io_request(struct bio *bp, struct g_consumer *cp); struct bio *g_new_bio(void); struct bio *g_alloc_bio(void); void g_reset_bio(struct bio *); void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error); int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length); int g_delete_data(struct g_consumer *cp, off_t offset, off_t length); void g_print_bio(struct bio *bp); /* geom_kern.c / geom_kernsim.c */ #ifdef _KERNEL extern struct sx topology_lock; struct g_kerneldump { off_t offset; off_t length; struct dumperinfo di; }; MALLOC_DECLARE(M_GEOM); static __inline void * g_malloc(int size, int flags) { void *p; p = malloc(size, M_GEOM, flags); return (p); } static __inline void g_free(void *ptr) { #ifdef DIAGNOSTIC if (sx_xlocked(&topology_lock)) { KASSERT(g_valid_obj(ptr) == 0, ("g_free(%p) of live object, type %d", ptr, g_valid_obj(ptr))); } #endif free(ptr, M_GEOM); } #define g_topology_lock() \ do { \ sx_xlock(&topology_lock); \ } while (0) #define g_topology_try_lock() sx_try_xlock(&topology_lock) #define g_topology_unlock() \ do { \ sx_xunlock(&topology_lock); \ } while (0) #define g_topology_assert() \ do { \ sx_assert(&topology_lock, SX_XLOCKED); \ } while (0) #define g_topology_assert_not() \ do { \ sx_assert(&topology_lock, SX_UNLOCKED); \ } while (0) #define g_topology_sleep(chan, timo) \ sx_sleep(chan, &topology_lock, 0, "gtopol", timo) #define DECLARE_GEOM_CLASS(class, name) \ static moduledata_t name##_mod = { \ #name, g_modevent, &class \ }; \ DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); int g_is_geom_thread(struct thread 
*td); #endif /* _KERNEL */ /* geom_ctl.c */ int gctl_set_param(struct gctl_req *req, const char *param, void const *ptr, int len); void gctl_set_param_err(struct gctl_req *req, const char *param, void const *ptr, int len); void *gctl_get_param(struct gctl_req *req, const char *param, int *len); char const *gctl_get_asciiparam(struct gctl_req *req, const char *param); void *gctl_get_paraml(struct gctl_req *req, const char *param, int len); int gctl_error(struct gctl_req *req, const char *fmt, ...) __printflike(2, 3); struct g_class *gctl_get_class(struct gctl_req *req, char const *arg); struct g_geom *gctl_get_geom(struct gctl_req *req, struct g_class *mpr, char const *arg); struct g_provider *gctl_get_provider(struct gctl_req *req, char const *arg); #endif /* _GEOM_GEOM_H_ */ Index: head/sys/geom/geom_aes.c =================================================================== --- head/sys/geom/geom_aes.c (revision 326269) +++ head/sys/geom/geom_aes.c (revision 326270) @@ -1,373 +1,375 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This method provides AES encryption with a compiled in key (default * all zeroes). * * XXX: This could probably save a lot of code by pretending to be a slicer. 
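/*
 * [Editor's orientation sketch -- not part of this commit, untested.]
 * geom.h above is the entire class/geom/consumer/provider surface, and the
 * AES class below is a complete transform.  For orientation, here is the
 * minimum shape of a pass-through transform class built only from interfaces
 * declared in geom.h.  The "SKEL" name is hypothetical, and a real taste
 * function must inspect on-disk metadata before claiming a provider; the
 * self-class check below only stops it from tasting its own output.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <geom/geom.h>

#define SKEL_CLASS_NAME "SKEL"

static void
g_skel_start(struct bio *bp)
{
	struct g_consumer *cp;
	struct bio *cbp;

	cp = LIST_FIRST(&bp->bio_to->geom->consumer);
	cbp = g_clone_bio(bp);		/* forward the request unchanged */
	if (cbp == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	cbp->bio_done = g_std_done;	/* completes the parent bio for us */
	g_io_request(cbp, cp);
}

static void
g_skel_orphan(struct g_consumer *cp)
{

	g_topology_assert();
	g_wither_geom(cp->geom, ENXIO);
}

static struct g_geom *
g_skel_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct g_provider *newpp;

	g_topology_assert();
	if (pp->geom->class == mp)	/* don't taste our own providers */
		return (NULL);
	gp = g_new_geomf(mp, "%s.skel", pp->name);
	cp = g_new_consumer(gp);
	if (g_attach(cp, pp) != 0) {
		g_destroy_consumer(cp);
		g_destroy_geom(gp);
		return (NULL);
	}
	newpp = g_new_providerf(gp, "%s", gp->name);
	newpp->mediasize = pp->mediasize;
	newpp->sectorsize = pp->sectorsize;
	g_error_provider(newpp, 0);	/* error 0 == make it usable */
	return (gp);
}

static struct g_class g_skel_class = {
	.name = SKEL_CLASS_NAME,
	.version = G_VERSION,
	.taste = g_skel_taste,
	.start = g_skel_start,
	.access = g_std_access,		/* pass access counts straight down */
	.orphan = g_skel_orphan,
	.spoiled = g_std_spoiled,
};
DECLARE_GEOM_CLASS(g_skel_class, g_skel);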
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define AES_CLASS_NAME "AES" #define MASTER_KEY_LENGTH (1024/8) static const u_char *aes_magic = "<>"; static const u_char *aes_magic_random = "<>"; static const u_char *aes_magic_test = "<>"; struct g_aes_softc { enum { KEY_ZERO, KEY_RANDOM, KEY_TEST } keying; u_int sectorsize; off_t mediasize; cipherInstance ci; u_char master_key[MASTER_KEY_LENGTH]; }; /* * Generate a sector key from the master key and the offset position. * * For KEY_ZERO we just return a key of all zeros. * * We feed the sector byte offset, 16 bytes of the master-key and * the sector byte offset once more to MD5. * The sector byte offset is converted to little-endian format first * to support multi-architecture operation. * We use 16 bytes from the master-key starting at the logical sector * number modulo the length of the master-key. If need be, we wrap * around to the start of the master-key. */ static void g_aes_makekey(struct g_aes_softc *sc, off_t off, keyInstance *ki, int dir) { MD5_CTX cx; u_int64_t u64; u_int u, u1; u_char *p, buf[16]; if (sc->keying == KEY_ZERO) { rijndael_makeKey(ki, dir, 128, sc->master_key); return; } MD5Init(&cx); u64 = htole64(off); MD5Update(&cx, (u_char *)&u64, sizeof(u64)); u = off / sc->sectorsize; u %= sizeof sc->master_key; p = sc->master_key + u; if (u + 16 <= sizeof(sc->master_key)) { MD5Update(&cx, p, 16); } else { u1 = sizeof sc->master_key - u; MD5Update(&cx, p, u1); MD5Update(&cx, sc->master_key, 16 - u1); u1 = 0; /* destroy evidence */ } u = 0; /* destroy evidence */ MD5Update(&cx, (u_char *)&u64, sizeof(u64)); u64 = 0; /* destroy evidence */ MD5Final(buf, &cx); bzero(&cx, sizeof cx); /* destroy evidence */ rijndael_makeKey(ki, dir, 128, buf); bzero(buf, sizeof buf); /* destroy evidence */ } static void g_aes_read_done(struct bio *bp) { struct g_geom *gp; struct g_aes_softc *sc; u_char *p, *b, *e, *sb; keyInstance dkey; off_t o; gp = bp->bio_from->geom; sc = gp->softc; sb = g_malloc(sc->sectorsize, M_WAITOK); b = bp->bio_data; e = bp->bio_data; e += bp->bio_length; o = bp->bio_offset - sc->sectorsize; for (p = b; p < e; p += sc->sectorsize) { g_aes_makekey(sc, o, &dkey, DIR_DECRYPT); rijndael_blockDecrypt(&sc->ci, &dkey, p, sc->sectorsize * 8, sb); bcopy(sb, p, sc->sectorsize); o += sc->sectorsize; } bzero(&dkey, sizeof dkey); /* destroy evidence */ bzero(sb, sc->sectorsize); /* destroy evidence */ g_free(sb); g_std_done(bp); } static void g_aes_write_done(struct bio *bp) { bzero(bp->bio_data, bp->bio_length); /* destroy evidence */ g_free(bp->bio_data); g_std_done(bp); } static void g_aes_start(struct bio *bp) { struct g_geom *gp; struct g_consumer *cp; struct g_aes_softc *sc; struct bio *bp2; u_char *p1, *p2, *b, *e; keyInstance ekey; off_t o; gp = bp->bio_to->geom; cp = LIST_FIRST(&gp->consumer); sc = gp->softc; switch (bp->bio_cmd) { case BIO_READ: bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_aes_read_done; bp2->bio_offset += sc->sectorsize; g_io_request(bp2, cp); break; case BIO_WRITE: bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_aes_write_done; bp2->bio_offset += sc->sectorsize; bp2->bio_data = g_malloc(bp->bio_length, M_WAITOK); b = bp->bio_data; e = bp->bio_data; e += bp->bio_length; p2 = bp2->bio_data; o = bp->bio_offset; for (p1 = b; p1 < e; p1 += sc->sectorsize) { g_aes_makekey(sc, o, &ekey, DIR_ENCRYPT);
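/*
 * (Editor's comment.)  rijndael_blockEncrypt() takes its length argument
 * in bits, hence sectorsize * 8; the ciphertext is written to p2, the
 * clone's private buffer, leaving the caller's plaintext intact.
 */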
rijndael_blockEncrypt(&sc->ci, &ekey, p1, sc->sectorsize * 8, p2); p2 += sc->sectorsize; o += sc->sectorsize; } bzero(&ekey, sizeof ekey); /* destroy evidence */ g_io_request(bp2, cp); break; case BIO_GETATTR: bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_std_done; bp2->bio_offset += sc->sectorsize; g_io_request(bp2, cp); break; default: g_io_deliver(bp, EOPNOTSUPP); return; } return; } static void g_aes_orphan(struct g_consumer *cp) { struct g_geom *gp; struct g_aes_softc *sc; g_trace(G_T_TOPOLOGY, "g_aes_orphan(%p/%s)", cp, cp->provider->name); g_topology_assert(); gp = cp->geom; sc = gp->softc; g_wither_geom(gp, ENXIO); bzero(sc, sizeof(struct g_aes_softc)); /* destroy evidence */ g_free(sc); return; } static int g_aes_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); /* On first open, grab an extra "exclusive" bit */ if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0) de++; /* ... and let go of it on last close */ if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1) de--; return (g_access(cp, dr, dw, de)); } static struct g_geom * g_aes_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_geom *gp; struct g_consumer *cp; struct g_aes_softc *sc; int error; u_int sectorsize; off_t mediasize; u_char *buf; g_trace(G_T_TOPOLOGY, "aes_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); gp = g_new_geomf(mp, "%s.aes", pp->name); cp = g_new_consumer(gp); g_attach(cp, pp); error = g_access(cp, 1, 0, 0); if (error) { g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } buf = NULL; g_topology_unlock(); do { if (gp->rank != 2) break; sectorsize = cp->provider->sectorsize; mediasize = cp->provider->mediasize; buf = g_read_data(cp, 0, sectorsize, NULL); if (buf == NULL) { break; } sc = g_malloc(sizeof(struct g_aes_softc), M_WAITOK | M_ZERO); if (!memcmp(buf, aes_magic, strlen(aes_magic))) { sc->keying = KEY_ZERO; } else if (!memcmp(buf, aes_magic_random, strlen(aes_magic_random))) { sc->keying = KEY_RANDOM; } else if (!memcmp(buf, aes_magic_test, strlen(aes_magic_test))) { sc->keying = KEY_TEST; } else { g_free(sc); break; } g_free(buf); gp->softc = sc; sc->sectorsize = sectorsize; sc->mediasize = mediasize - sectorsize; rijndael_cipherInit(&sc->ci, MODE_CBC, NULL); if (sc->keying == KEY_TEST) { int i; u_char *p; p = sc->master_key; for (i = 0; i < (int)sizeof sc->master_key; i ++) *p++ = i; } if (sc->keying == KEY_RANDOM) { int i; u_int32_t u; u_char *p; p = sc->master_key; for (i = 0; i < (int)sizeof sc->master_key; i += sizeof u) { u = arc4random(); *p++ = u; *p++ = u >> 8; *p++ = u >> 16; *p++ = u >> 24; } } g_topology_lock(); pp = g_new_providerf(gp, "%s", gp->name); pp->mediasize = mediasize - sectorsize; pp->sectorsize = sectorsize; g_error_provider(pp, 0); g_topology_unlock(); } while(0); g_topology_lock(); if (buf) g_free(buf); g_access(cp, -1, 0, 0); if (gp->softc != NULL) return (gp); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } static struct g_class g_aes_class = { .name = AES_CLASS_NAME, .version = G_VERSION, .taste = g_aes_taste, .start = g_aes_start, .orphan = g_aes_orphan, .spoiled = g_std_spoiled, .access = g_aes_access, }; DECLARE_GEOM_CLASS(g_aes_class, g_aes); Index: head/sys/geom/geom_bsd.c =================================================================== --- head/sys/geom/geom_bsd.c (revision 326269) +++ head/sys/geom/geom_bsd.c 
(revision 326270) @@ -1,614 +1,616 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This is the method for dealing with BSD disklabels. It has been * extensively (by my standards at least) commented, in the vain hope that * it will serve as the source in future copy&paste operations. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_bsd, "GEOM BSD disklabels support"); #define BSD_CLASS_NAME "BSD" #define ALPHA_LABEL_OFFSET 64 #define HISTORIC_LABEL_OFFSET 512 #define LABELSIZE (148 + 16 * MAXPARTITIONS) static int g_bsd_once; static void g_bsd_hotwrite(void *arg, int flag); /* * Our private data about one instance. All the rest is handled by the * slice code and stored in its softc, so this is just the stuff * specific to BSD disklabels. */ struct g_bsd_softc { off_t labeloffset; off_t mbroffset; off_t rawoffset; struct disklabel ondisk; u_char label[LABELSIZE]; u_char labelsum[16]; }; /* * Modify our slicer to match proposed disklabel, if possible. * This is where we make sure we don't do something stupid. */ static int g_bsd_modify(struct g_geom *gp, u_char *label) { int i, error; struct partition *ppp; struct g_slicer *gsp; struct g_consumer *cp; struct g_bsd_softc *ms; u_int secsize, u; off_t rawoffset, o; struct disklabel dl; MD5_CTX md5sum; g_topology_assert(); gsp = gp->softc; ms = gsp->softc; error = bsd_disklabel_le_dec(label, &dl, MAXPARTITIONS); if (error) { return (error); } /* Get dimensions of our device. */ cp = LIST_FIRST(&gp->consumer); secsize = cp->provider->sectorsize; /* ... or a smaller sector size. 
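* Editorial aside: for example, a label claiming 512-byte sectors is rejected on a 4096-byte-sector provider by the first check below, while a 4096-byte-sector label on a 512-byte-sector provider passes both checks because 4096 is a multiple of 512.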
*/ if (dl.d_secsize < secsize) { return (EINVAL); } /* ... or a non-multiple sector size. */ if (dl.d_secsize % secsize != 0) { return (EINVAL); } /* Historical braindamage... */ rawoffset = (off_t)dl.d_partitions[RAW_PART].p_offset * dl.d_secsize; for (i = 0; i < dl.d_npartitions; i++) { ppp = &dl.d_partitions[i]; if (ppp->p_size == 0) continue; o = (off_t)ppp->p_offset * dl.d_secsize; if (o < rawoffset) rawoffset = 0; } if (rawoffset != 0 && (off_t)rawoffset != ms->mbroffset) printf("WARNING: %s expected rawoffset %jd, found %jd\n", gp->name, (intmax_t)ms->mbroffset/dl.d_secsize, (intmax_t)rawoffset/dl.d_secsize); /* Don't munge open partitions. */ for (i = 0; i < dl.d_npartitions; i++) { ppp = &dl.d_partitions[i]; o = (off_t)ppp->p_offset * dl.d_secsize; if (o == 0) o = rawoffset; error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK, o - rawoffset, (off_t)ppp->p_size * dl.d_secsize, dl.d_secsize, "%s%c", gp->name, 'a' + i); if (error) return (error); } /* Look good, go for it... */ for (u = 0; u < gsp->nslice; u++) { ppp = &dl.d_partitions[u]; o = (off_t)ppp->p_offset * dl.d_secsize; if (o == 0) o = rawoffset; g_slice_config(gp, u, G_SLICE_CONFIG_SET, o - rawoffset, (off_t)ppp->p_size * dl.d_secsize, dl.d_secsize, "%s%c", gp->name, 'a' + u); } /* Update our softc */ ms->ondisk = dl; if (label != ms->label) bcopy(label, ms->label, LABELSIZE); ms->rawoffset = rawoffset; /* * In order to avoid recursively attaching to the same * on-disk label (it's usually visible through the 'c' * partition) we calculate an MD5 and ask if other BSD's * below us love that label. If they do, we don't. */ MD5Init(&md5sum); MD5Update(&md5sum, ms->label, sizeof(ms->label)); MD5Final(ms->labelsum, &md5sum); return (0); } /* * This is an internal helper function, called multiple times from the taste * function to try to locate a disklabel on the disk. More civilized formats * will not need this, as there is only one possible place on disk to look * for the magic spot. */ static int g_bsd_try(struct g_geom *gp, struct g_slicer *gsp, struct g_consumer *cp, int secsize, struct g_bsd_softc *ms, off_t offset) { int error; u_char *buf; struct disklabel *dl; off_t secoff; /* * We need to read entire aligned sectors, and we assume that the * disklabel does not span sectors, so one sector is enough. */ secoff = offset % secsize; buf = g_read_data(cp, offset - secoff, secsize, NULL); if (buf == NULL) return (ENOENT); /* Decode into our native format. */ dl = &ms->ondisk; error = bsd_disklabel_le_dec(buf + secoff, dl, MAXPARTITIONS); if (!error) bcopy(buf + secoff, ms->label, LABELSIZE); /* Remember to free the buffer g_read_data() gave us. */ g_free(buf); ms->labeloffset = offset; return (error); } /* * This function writes the current label to disk, possibly updating * the alpha SRM checksum. */ static int g_bsd_writelabel(struct g_geom *gp, u_char *bootcode) { off_t secoff; u_int secsize; struct g_consumer *cp; struct g_slicer *gsp; struct g_bsd_softc *ms; u_char *buf; uint64_t sum; int error, i; gsp = gp->softc; ms = gsp->softc; cp = LIST_FIRST(&gp->consumer); /* Get sector size, we need it to read data. 
*/ secsize = cp->provider->sectorsize; secoff = ms->labeloffset % secsize; if (bootcode == NULL) { buf = g_read_data(cp, ms->labeloffset - secoff, secsize, &error); if (buf == NULL) return (error); bcopy(ms->label, buf + secoff, sizeof(ms->label)); } else { buf = bootcode; bcopy(ms->label, buf + ms->labeloffset, sizeof(ms->label)); } if (ms->labeloffset == ALPHA_LABEL_OFFSET) { sum = 0; for (i = 0; i < 63; i++) sum += le64dec(buf + i * 8); le64enc(buf + 504, sum); } if (bootcode == NULL) { error = g_write_data(cp, ms->labeloffset - secoff, buf, secsize); g_free(buf); } else { error = g_write_data(cp, 0, bootcode, BBSIZE); } return(error); } /* * If the user tries to overwrite our disklabel through an open partition * or via a magicwrite config call, we end up here and try to prevent * footshooting as best we can. */ static void g_bsd_hotwrite(void *arg, int flag) { struct bio *bp; struct g_geom *gp; struct g_slicer *gsp; struct g_slice *gsl; struct g_bsd_softc *ms; u_char *p; int error; g_topology_assert(); /* * We should never get canceled, because that would amount to a removal * of the geom while there were outstanding I/O requests. */ KASSERT(flag != EV_CANCEL, ("g_bsd_hotwrite cancelled")); bp = arg; gp = bp->bio_to->geom; gsp = gp->softc; ms = gsp->softc; gsl = &gsp->slices[bp->bio_to->index]; p = (u_char*)bp->bio_data + ms->labeloffset - (bp->bio_offset + gsl->offset); error = g_bsd_modify(gp, p); if (error) { g_io_deliver(bp, EPERM); return; } g_slice_finish_hot(bp); } static int g_bsd_start(struct bio *bp) { struct g_geom *gp; struct g_bsd_softc *ms; struct g_slicer *gsp; gp = bp->bio_to->geom; gsp = gp->softc; ms = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr(bp, "BSD::labelsum", ms->labelsum, sizeof(ms->labelsum))) return (1); } return (0); } /* * Dump configuration information in XML format. * Notice that the function is called once for the geom and once for each * consumer and provider. We let g_slice_dumpconf() do most of the work. */ static void g_bsd_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_bsd_softc *ms; struct g_slicer *gsp; gsp = gp->softc; ms = gsp->softc; g_slice_dumpconf(sb, indent, gp, cp, pp); if (indent != NULL && pp == NULL && cp == NULL) { sbuf_printf(sb, "%s<labeloffset>%jd</labeloffset>\n", indent, (intmax_t)ms->labeloffset); sbuf_printf(sb, "%s<rawoffset>%jd</rawoffset>\n", indent, (intmax_t)ms->rawoffset); sbuf_printf(sb, "%s<mbroffset>%jd</mbroffset>\n", indent, (intmax_t)ms->mbroffset); } else if (pp != NULL) { if (indent == NULL) sbuf_printf(sb, " ty %d", ms->ondisk.d_partitions[pp->index].p_fstype); else sbuf_printf(sb, "%s<type>%d</type>\n", indent, ms->ondisk.d_partitions[pp->index].p_fstype); } } /* * The taste function is called from the event-handler, with the topology * lock already held and a provider to examine. The flags are unused. * * If flags == G_TF_NORMAL, the idea is to take a bite of the provider and * if we find valid, consistent magic on it, build a geom on it. * * There may be cases where the operator would like to put a BSD-geom on * providers which do not meet all of the requirements. This can be done * by instead passing the G_TF_INSIST flag, which will override these * checks. * * The final flags value is G_TF_TRANSPARENT, which instructs the method * to put a geom on top of the provider and configure it to be as transparent * as possible. This is not really relevant to the BSD method and therefore * not implemented here.
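* * Editorial aside: as a concrete example of the distinction, the code below refuses under G_TF_NORMAL to attach inside an MBR slice whose type is not 165 (FreeBSD) or inside a GPT partition that is not of the FreeBSD type, whereas G_TF_INSIST attaches to such providers anyway.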
*/ static struct uuid freebsd_slice = GPT_ENT_TYPE_FREEBSD; static struct g_geom * g_bsd_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_geom *gp; struct g_consumer *cp; int error, i; struct g_bsd_softc *ms; u_int secsize; struct g_slicer *gsp; u_char hash[16]; MD5_CTX md5sum; struct uuid uuid; g_trace(G_T_TOPOLOGY, "bsd_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); /* We don't implement transparent inserts. */ if (flags == G_TF_TRANSPARENT) return (NULL); /* * BSD labels are a subclass of the general "slicing" topology so * a lot of the work can be done by the common "slice" code. * Create a geom with space for MAXPARTITIONS providers, one consumer * and a softc structure for us. Specify the provider to attach * the consumer to and our "start" routine for special requests. * The provider is opened with mode (1,0,0) so we can do reads * from it. */ gp = g_slice_new(mp, MAXPARTITIONS, pp, &cp, &ms, sizeof(*ms), g_bsd_start); if (gp == NULL) return (NULL); /* Get the geom_slicer softc from the geom. */ gsp = gp->softc; /* * The do...while loop here allows us to have multiple escapes * using a simple "break". This improves code clarity without * ending up in deep nesting and without using goto or come from. */ do { /* * If the provider is an MBR we will only auto attach * to type 165 slices in the G_TF_NORMAL case; with * G_TF_INSIST we will attach to any other type. */ error = g_getattr("MBR::type", cp, &i); if (!error) { if (i != 165 && flags == G_TF_NORMAL) break; error = g_getattr("MBR::offset", cp, &ms->mbroffset); if (error) break; } /* Same thing if we are inside a GPT */ error = g_getattr("GPT::type", cp, &uuid); if (!error) { if (memcmp(&uuid, &freebsd_slice, sizeof(uuid)) != 0 && flags == G_TF_NORMAL) break; } /* Get sector size, we need it to read data. */ secsize = cp->provider->sectorsize; if (secsize < 512) break; /* First look for a label at the start of the second sector. */ error = g_bsd_try(gp, gsp, cp, secsize, ms, secsize); /* * If the sector size is not 512 the label can still be at * offset 512, not at the start of the second sector. At least * this is true for labels created by FreeBSD's bsdlabel(8). */ if (error && secsize != HISTORIC_LABEL_OFFSET) error = g_bsd_try(gp, gsp, cp, secsize, ms, HISTORIC_LABEL_OFFSET); /* Next, look for alpha labels */ if (error) error = g_bsd_try(gp, gsp, cp, secsize, ms, ALPHA_LABEL_OFFSET); /* If we didn't find a label, punt. */ if (error) break; /* * In order to avoid recursively attaching to the same * on-disk label (it's usually visible through the 'c' * partition) we calculate an MD5 and ask if other BSD's * below us love that label. If they do, we don't. */ MD5Init(&md5sum); MD5Update(&md5sum, ms->label, sizeof(ms->label)); MD5Final(ms->labelsum, &md5sum); error = g_getattr("BSD::labelsum", cp, &hash); if (!error && !bcmp(ms->labelsum, hash, sizeof(hash))) break; /* * Process the found disklabel, and modify our "slice" * instance to match it, if possible. */ error = g_bsd_modify(gp, ms->label); } while (0); /* Success or failure, we can close our provider now. */ g_access(cp, -1, 0, 0); /* If we have configured any providers, return the new geom.
*/ if (gsp->nprovider > 0) { g_slice_conf_hot(gp, 0, ms->labeloffset, LABELSIZE, G_SLICE_HOT_ALLOW, G_SLICE_HOT_DENY, G_SLICE_HOT_CALL); gsp->hot = g_bsd_hotwrite; if (!g_bsd_once) { g_bsd_once = 1; printf( "WARNING: geom_bsd (geom %s) is deprecated, " "use gpart instead.\n", gp->name); } return (gp); } /* * ...else push the "self-destruct" button, by spoiling our own * consumer. This triggers a call to g_slice_spoiled which will * dismantle what was set up. */ g_slice_spoiled(cp); return (NULL); } struct h0h0 { struct g_geom *gp; struct g_bsd_softc *ms; u_char *label; int error; }; static void g_bsd_callconfig(void *arg, int flag) { struct h0h0 *hp; hp = arg; hp->error = g_bsd_modify(hp->gp, hp->label); if (!hp->error) hp->error = g_bsd_writelabel(hp->gp, NULL); } /* * NB! curthread is the user process which GCTL'ed. */ static void g_bsd_config(struct gctl_req *req, struct g_class *mp, char const *verb) { u_char *label; int error; struct h0h0 h0h0; struct g_geom *gp; struct g_slicer *gsp; struct g_consumer *cp; struct g_bsd_softc *ms; g_topology_assert(); gp = gctl_get_geom(req, mp, "geom"); if (gp == NULL) return; cp = LIST_FIRST(&gp->consumer); gsp = gp->softc; ms = gsp->softc; if (!strcmp(verb, "read mbroffset")) { gctl_set_param_err(req, "mbroffset", &ms->mbroffset, sizeof(ms->mbroffset)); return; } else if (!strcmp(verb, "write label")) { label = gctl_get_paraml(req, "label", LABELSIZE); if (label == NULL) return; h0h0.gp = gp; h0h0.ms = gsp->softc; h0h0.label = label; h0h0.error = -1; /* XXX: Does this reference register with our selfdestruct code? */ error = g_access(cp, 1, 1, 1); if (error) { gctl_error(req, "could not access consumer"); return; } g_bsd_callconfig(&h0h0, 0); error = h0h0.error; g_access(cp, -1, -1, -1); } else if (!strcmp(verb, "write bootcode")) { label = gctl_get_paraml(req, "bootcode", BBSIZE); if (label == NULL) return; /* XXX: Does this reference register with our selfdestruct code? */ error = g_access(cp, 1, 1, 1); if (error) { gctl_error(req, "could not access consumer"); return; } error = g_bsd_writelabel(gp, label); g_access(cp, -1, -1, -1); } else { gctl_error(req, "Unknown verb parameter"); } return; } /* Finally, register with GEOM infrastructure. */ static struct g_class g_bsd_class = { .name = BSD_CLASS_NAME, .version = G_VERSION, .taste = g_bsd_taste, .ctlreq = g_bsd_config, .dumpconf = g_bsd_dumpconf, }; DECLARE_GEOM_CLASS(g_bsd_class, g_bsd); Index: head/sys/geom/geom_bsd_enc.c =================================================================== --- head/sys/geom/geom_bsd_enc.c (revision 326269) +++ head/sys/geom/geom_bsd_enc.c (revision 326270) @@ -1,196 +1,198 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Functions to encode and decode struct disklabel and struct partition into * a bytestream of little endianness and correct packing. * * NB! This file must be usable both in kernel and userland. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #ifdef _KERNEL #include #else #include #endif void bsd_partition_le_dec(u_char *ptr, struct partition *d) { d->p_size = le32dec(ptr + 0); d->p_offset = le32dec(ptr + 4); d->p_fsize = le32dec(ptr + 8); d->p_fstype = ptr[12]; d->p_frag = ptr[13]; d->p_cpg = le16dec(ptr + 14); } int bsd_disklabel_le_dec(u_char *ptr, struct disklabel *d, int maxpart) { int i; u_char *p, *pe; uint16_t sum; d->d_magic = le32dec(ptr + 0); if (d->d_magic != DISKMAGIC) return(EINVAL); d->d_magic2 = le32dec(ptr + 132); if (d->d_magic2 != DISKMAGIC) { return(EINVAL); } d->d_npartitions = le16dec(ptr + 138); if (d->d_npartitions > maxpart) { return(EINVAL); } pe = ptr + 148 + 16 * d->d_npartitions; sum = 0; for (p = ptr; p < pe; p += 2) sum ^= le16dec(p); if (sum != 0) { return(EINVAL); } d->d_type = le16dec(ptr + 4); d->d_subtype = le16dec(ptr + 6); bcopy(ptr + 8, d->d_typename, 16); bcopy(ptr + 24, d->d_packname, 16); d->d_secsize = le32dec(ptr + 40); d->d_nsectors = le32dec(ptr + 44); d->d_ntracks = le32dec(ptr + 48); d->d_ncylinders = le32dec(ptr + 52); d->d_secpercyl = le32dec(ptr + 56); d->d_secperunit = le32dec(ptr + 60); d->d_sparespertrack = le16dec(ptr + 64); d->d_sparespercyl = le16dec(ptr + 66); d->d_acylinders = le32dec(ptr + 68); d->d_rpm = le16dec(ptr + 72); d->d_interleave = le16dec(ptr + 74); d->d_trackskew = le16dec(ptr + 76); d->d_cylskew = le16dec(ptr + 78); d->d_headswitch = le32dec(ptr + 80); d->d_trkseek = le32dec(ptr + 84); d->d_flags = le32dec(ptr + 88); d->d_drivedata[0] = le32dec(ptr + 92); d->d_drivedata[1] = le32dec(ptr + 96); d->d_drivedata[2] = le32dec(ptr + 100); d->d_drivedata[3] = le32dec(ptr + 104); d->d_drivedata[4] = le32dec(ptr + 108); d->d_spare[0] = le32dec(ptr + 112); d->d_spare[1] = le32dec(ptr + 116); d->d_spare[2] = le32dec(ptr + 120); d->d_spare[3] = le32dec(ptr + 124); d->d_spare[4] = le32dec(ptr + 128); d->d_checksum = le16dec(ptr + 136); d->d_npartitions = le16dec(ptr + 138); d->d_bbsize = le32dec(ptr + 140); d->d_sbsize = le32dec(ptr + 144); for (i = 0; i < d->d_npartitions; i++) bsd_partition_le_dec(ptr + 148 + 16 * i, &d->d_partitions[i]); return(0); } void bsd_partition_le_enc(u_char *ptr, struct 
partition *d) { le32enc(ptr + 0, d->p_size); le32enc(ptr + 4, d->p_offset); le32enc(ptr + 8, d->p_fsize); ptr[12] = d->p_fstype; ptr[13] = d->p_frag; le16enc(ptr + 14, d->p_cpg); } void bsd_disklabel_le_enc(u_char *ptr, struct disklabel *d) { int i; u_char *p, *pe; uint16_t sum; le32enc(ptr + 0, d->d_magic); le16enc(ptr + 4, d->d_type); le16enc(ptr + 6, d->d_subtype); bcopy(d->d_typename, ptr + 8, 16); bcopy(d->d_packname, ptr + 24, 16); le32enc(ptr + 40, d->d_secsize); le32enc(ptr + 44, d->d_nsectors); le32enc(ptr + 48, d->d_ntracks); le32enc(ptr + 52, d->d_ncylinders); le32enc(ptr + 56, d->d_secpercyl); le32enc(ptr + 60, d->d_secperunit); le16enc(ptr + 64, d->d_sparespertrack); le16enc(ptr + 66, d->d_sparespercyl); le32enc(ptr + 68, d->d_acylinders); le16enc(ptr + 72, d->d_rpm); le16enc(ptr + 74, d->d_interleave); le16enc(ptr + 76, d->d_trackskew); le16enc(ptr + 78, d->d_cylskew); le32enc(ptr + 80, d->d_headswitch); le32enc(ptr + 84, d->d_trkseek); le32enc(ptr + 88, d->d_flags); le32enc(ptr + 92, d->d_drivedata[0]); le32enc(ptr + 96, d->d_drivedata[1]); le32enc(ptr + 100, d->d_drivedata[2]); le32enc(ptr + 104, d->d_drivedata[3]); le32enc(ptr + 108, d->d_drivedata[4]); le32enc(ptr + 112, d->d_spare[0]); le32enc(ptr + 116, d->d_spare[1]); le32enc(ptr + 120, d->d_spare[2]); le32enc(ptr + 124, d->d_spare[3]); le32enc(ptr + 128, d->d_spare[4]); le32enc(ptr + 132, d->d_magic2); le16enc(ptr + 136, 0); le16enc(ptr + 138, d->d_npartitions); le32enc(ptr + 140, d->d_bbsize); le32enc(ptr + 144, d->d_sbsize); for (i = 0; i < d->d_npartitions; i++) bsd_partition_le_enc(ptr + 148 + 16 * i, &d->d_partitions[i]); pe = ptr + 148 + 16 * d->d_npartitions; sum = 0; for (p = ptr; p < pe; p += 2) sum ^= le16dec(p); le16enc(ptr + 136, sum); } Index: head/sys/geom/geom_ccd.c =================================================================== --- head/sys/geom/geom_ccd.c (revision 326269) +++ head/sys/geom/geom_ccd.c (revision 326270) @@ -1,908 +1,910 @@ /*- + * SPDX-License-Identifier: BSD-4-Clause + * * Copyright (c) 2003 Poul-Henning Kamp. * Copyright (c) 1995 Jason R. Thorpe. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * All rights reserved. * Copyright (c) 1988 University of Utah. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project * by Jason R. Thorpe. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Dynamic configuration and disklabel support by: * Jason R. Thorpe * Numerical Aerodynamic Simulation Facility * Mail Stop 258-6 * NASA Ames Research Center * Moffett Field, CA 94035 * * from: Utah $Hdr: cd.c 1.6 90/11/28$ * @(#)cd.c 8.2 (Berkeley) 11/16/93 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include /* * Number of blocks to leave untouched in front of a component partition. * This is to avoid violating its disklabel area when it starts at the * beginning of the slice. */ #if !defined(CCD_OFFSET) #define CCD_OFFSET 16 #endif /* sc_flags */ #define CCDF_UNIFORM 0x02 /* use LCCD of sizes for uniform interleave */ #define CCDF_MIRROR 0x04 /* use mirroring */ #define CCDF_NO_OFFSET 0x08 /* do not leave space in front */ #define CCDF_LINUX 0x10 /* use Linux compatibility mode */ /* Mask of user-settable ccd flags. */ #define CCDF_USERMASK (CCDF_UNIFORM|CCDF_MIRROR) /* * Interleave description table. * Computed at boot time to speed irregular-interleave lookups. * The idea is that we interleave in "groups". First we interleave * evenly over all component disks up to the size of the smallest * component (the first group), then we interleave evenly over all * remaining disks up to the size of the next-smallest (second group), * and so on. * * Each table entry describes the interleave characteristics of one * of these groups. For example if a concatenated disk consisted of * three components of 5, 3, and 7 DEV_BSIZE blocks interleaved at * DEV_BSIZE (1), the table would have three entries: * * ndisk startblk startoff dev * 3 0 0 0, 1, 2 * 2 9 3 0, 2 * 1 13 5 2 * 0 - - - * * which says that the first nine blocks (0-8) are interleaved over * 3 disks (0, 1, 2) starting at block offset 0 on any component disk, * the next 4 blocks (9-12) are interleaved over 2 disks (0, 2) starting * at component block 3, and the remaining blocks (13-14) are on disk * 2 starting at offset 5. */ struct ccdiinfo { int ii_ndisk; /* # of disks range is interleaved over */ daddr_t ii_startblk; /* starting scaled block # for range */ daddr_t ii_startoff; /* starting component offset (block #) */ int *ii_index; /* ordered list of components in range */ }; /* * Component info table. * Describes a single component of a concatenated disk. */ struct ccdcinfo { daddr_t ci_size; /* size */ struct g_provider *ci_provider; /* provider */ struct g_consumer *ci_consumer; /* consumer */ }; /* * A concatenated disk is described by this structure.
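* * Editorial aside: a worked check of the interleave example above: the smallest component holds 3 blocks, so the first group interleaves 3 disks x 3 blocks = 9 blocks (0-8), leaving startblk 9 and startoff 3; the next-smallest holds 5, so the second group interleaves 2 disks x (5 - 3) = 4 blocks (9-12), leaving startblk 13 and startoff 5; the final 7 - 5 = 2 blocks (13-14) then live on disk 2 alone.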
*/ struct ccd_s { LIST_ENTRY(ccd_s) list; int sc_unit; /* logical unit number */ int sc_flags; /* flags */ daddr_t sc_size; /* size of ccd */ int sc_ileave; /* interleave */ u_int sc_ndisks; /* number of components */ struct ccdcinfo *sc_cinfo; /* component info */ struct ccdiinfo *sc_itable; /* interleave table */ u_int32_t sc_secsize; /* # bytes per sector */ int sc_pick; /* side of mirror picked */ daddr_t sc_blk[2]; /* mirror localization */ u_int32_t sc_offset; /* actual offset used */ }; static g_start_t g_ccd_start; static void ccdiodone(struct bio *bp); static void ccdinterleave(struct ccd_s *); static int ccdinit(struct gctl_req *req, struct ccd_s *); static int ccdbuffer(struct bio **ret, struct ccd_s *, struct bio *, daddr_t, caddr_t, long); static void g_ccd_orphan(struct g_consumer *cp) { /* * XXX: We don't do anything here. It is not obvious * XXX: what DTRT would be, so we do what the previous * XXX: code did: ignore it and let the user cope. */ } static int g_ccd_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp1, *cp2; int error; de += dr; de += dw; gp = pp->geom; error = ENXIO; LIST_FOREACH(cp1, &gp->consumer, consumer) { error = g_access(cp1, dr, dw, de); if (error) { LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) break; g_access(cp2, -dr, -dw, -de); } break; } } return (error); } /* * Free the softc and its substructures. */ static void g_ccd_freesc(struct ccd_s *sc) { struct ccdiinfo *ii; g_free(sc->sc_cinfo); if (sc->sc_itable != NULL) { for (ii = sc->sc_itable; ii->ii_ndisk > 0; ii++) if (ii->ii_index != NULL) g_free(ii->ii_index); g_free(sc->sc_itable); } g_free(sc); } static int ccdinit(struct gctl_req *req, struct ccd_s *cs) { struct ccdcinfo *ci; daddr_t size; int ix; daddr_t minsize; int maxsecsize; off_t mediasize; u_int sectorsize; cs->sc_size = 0; maxsecsize = 0; minsize = 0; if (cs->sc_flags & CCDF_LINUX) { cs->sc_offset = 0; cs->sc_ileave *= 2; if (cs->sc_flags & CCDF_MIRROR && cs->sc_ndisks != 2) gctl_error(req, "Mirror mode for Linux raids is " "only supported with 2 devices"); } else { if (cs->sc_flags & CCDF_NO_OFFSET) cs->sc_offset = 0; else cs->sc_offset = CCD_OFFSET; } for (ix = 0; ix < cs->sc_ndisks; ix++) { ci = &cs->sc_cinfo[ix]; mediasize = ci->ci_provider->mediasize; sectorsize = ci->ci_provider->sectorsize; if (sectorsize > maxsecsize) maxsecsize = sectorsize; size = mediasize / DEV_BSIZE - cs->sc_offset; /* Truncate to interleave boundary */ if (cs->sc_ileave > 1) size -= size % cs->sc_ileave; if (size == 0) { gctl_error(req, "Component %s has effective size zero", ci->ci_provider->name); return(ENODEV); } if (minsize == 0 || size < minsize) minsize = size; ci->ci_size = size; cs->sc_size += size; } /* * Don't allow the interleave to be smaller than * the biggest component sector. */ if ((cs->sc_ileave > 0) && (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) { gctl_error(req, "Interleave too small for sector size"); return(EINVAL); } /* * If uniform interleave is desired set all sizes to that of * the smallest component. This will guarantee that a single * interleave table is generated. * * Lost space must be taken into account when calculating the * overall size. Half the space is lost when CCDF_MIRROR is * specified. */ if (cs->sc_flags & CCDF_UNIFORM) { for (ix = 0; ix < cs->sc_ndisks; ix++) { ci = &cs->sc_cinfo[ix]; ci->ci_size = minsize; } cs->sc_size = cs->sc_ndisks * minsize; } if (cs->sc_flags & CCDF_MIRROR) { /* * Check to see if an even number of components * have been specified.
The interleave must also * be non-zero in order for us to be able to * guarantee the topology. */ if (cs->sc_ndisks % 2) { gctl_error(req, "Mirroring requires an even number of disks"); return(EINVAL); } if (cs->sc_ileave == 0) { gctl_error(req, "An interleave must be specified when mirroring"); return(EINVAL); } cs->sc_size = (cs->sc_ndisks/2) * minsize; } /* * Construct the interleave table. */ ccdinterleave(cs); /* * Create pseudo-geometry based on 1MB cylinders. It's * pretty close. */ cs->sc_secsize = maxsecsize; return (0); } static void ccdinterleave(struct ccd_s *cs) { struct ccdcinfo *ci, *smallci; struct ccdiinfo *ii; daddr_t bn, lbn; int ix; daddr_t size; /* * Allocate an interleave table. The worst case occurs when each * of N disks is of a different size, resulting in N interleave * tables. * * Chances are this is too big, but we don't care. */ size = (cs->sc_ndisks + 1) * sizeof(struct ccdiinfo); cs->sc_itable = g_malloc(size, M_WAITOK | M_ZERO); /* * Trivial case: no interleave (actually interleave of disk size). * Each table entry represents a single component in its entirety. * * An interleave of 0 may not be used with a mirror setup. */ if (cs->sc_ileave == 0) { bn = 0; ii = cs->sc_itable; for (ix = 0; ix < cs->sc_ndisks; ix++) { /* Allocate space for ii_index. */ ii->ii_index = g_malloc(sizeof(int), M_WAITOK); ii->ii_ndisk = 1; ii->ii_startblk = bn; ii->ii_startoff = 0; ii->ii_index[0] = ix; bn += cs->sc_cinfo[ix].ci_size; ii++; } ii->ii_ndisk = 0; return; } /* * The following isn't fast or pretty; it doesn't have to be. */ size = 0; bn = lbn = 0; for (ii = cs->sc_itable; ; ii++) { /* * Allocate space for ii_index. We might allocate more than * we use. */ ii->ii_index = g_malloc((sizeof(int) * cs->sc_ndisks), M_WAITOK); /* * Locate the smallest of the remaining components */ smallci = NULL; for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_ndisks]; ci++) { if (ci->ci_size > size && (smallci == NULL || ci->ci_size < smallci->ci_size)) { smallci = ci; } } /* * Nobody left, all done */ if (smallci == NULL) { ii->ii_ndisk = 0; g_free(ii->ii_index); ii->ii_index = NULL; break; } /* * Record starting logical block using an sc_ileave blocksize. */ ii->ii_startblk = bn / cs->sc_ileave; /* * Record starting component block using an sc_ileave * blocksize. This value is relative to the beginning of * a component disk. */ ii->ii_startoff = lbn; /* * Determine how many disks take part in this interleave * and record their indices. */ ix = 0; for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_ndisks]; ci++) { if (ci->ci_size >= smallci->ci_size) { ii->ii_index[ix++] = ci - cs->sc_cinfo; } } ii->ii_ndisk = ix; bn += ix * (smallci->ci_size - size); lbn = smallci->ci_size / cs->sc_ileave; size = smallci->ci_size; } } static void g_ccd_start(struct bio *bp) { long bcount, rcount; struct bio *cbp[2]; caddr_t addr; daddr_t bn; int err; struct ccd_s *cs; cs = bp->bio_to->geom->softc; /* * Block all GETATTR requests, we wouldn't know which of our * subdevices we should ship it off to. * XXX: this may not be the right policy. */ if(bp->bio_cmd == BIO_GETATTR) { g_io_deliver(bp, EINVAL); return; } /* * Translate the partition-relative block number to an absolute.
*/ bn = bp->bio_offset / cs->sc_secsize; /* * Allocate component buffers and fire off the requests */ addr = bp->bio_data; for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) { err = ccdbuffer(cbp, cs, bp, bn, addr, bcount); if (err) { bp->bio_completed += bcount; if (bp->bio_error == 0) bp->bio_error = err; if (bp->bio_completed == bp->bio_length) g_io_deliver(bp, bp->bio_error); return; } rcount = cbp[0]->bio_length; if (cs->sc_flags & CCDF_MIRROR) { /* * Mirroring. Writes go to both disks, reads are * taken from whichever disk seems most appropriate. * * We attempt to localize reads to the disk whose arm * is nearest the read request. We ignore seeks due * to writes when making this determination and we * also try to avoid hogging. */ if (cbp[0]->bio_cmd != BIO_READ) { g_io_request(cbp[0], cbp[0]->bio_from); g_io_request(cbp[1], cbp[1]->bio_from); } else { int pick = cs->sc_pick; daddr_t range = cs->sc_size / 16; if (bn < cs->sc_blk[pick] - range || bn > cs->sc_blk[pick] + range ) { cs->sc_pick = pick = 1 - pick; } cs->sc_blk[pick] = bn + btodb(rcount); g_io_request(cbp[pick], cbp[pick]->bio_from); } } else { /* * Not mirroring */ g_io_request(cbp[0], cbp[0]->bio_from); } bn += btodb(rcount); addr += rcount; } } /* * Build a component buffer header. */ static int ccdbuffer(struct bio **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount) { struct ccdcinfo *ci, *ci2 = NULL; struct bio *cbp; daddr_t cbn, cboff; off_t cbc; /* * Determine which component bn falls in. */ cbn = bn; cboff = 0; if (cs->sc_ileave == 0) { /* * Serially concatenated and neither a mirror nor a parity * config. This is a special case. */ daddr_t sblk; sblk = 0; for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++) sblk += ci->ci_size; cbn -= sblk; } else { struct ccdiinfo *ii; int ccdisk, off; /* * Calculate cbn, the logical superblock (sc_ileave chunks), * and cboff, a normal block offset (DEV_BSIZE chunks) relative * to cbn. */ cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */ cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */ /* * Figure out which interleave table to use. */ for (ii = cs->sc_itable; ii->ii_ndisk; ii++) { if (ii->ii_startblk > cbn) break; } ii--; /* * off is the logical superblock relative to the beginning * of this interleave block. */ off = cbn - ii->ii_startblk; /* * We must calculate which disk component to use (ccdisk), * and recalculate cbn to be the superblock relative to * the beginning of the component. This is typically done by * adding 'off' and ii->ii_startoff together. However, 'off' * must typically be divided by the number of components in * this interleave array to properly convert it from a * CCD-relative logical superblock number to a * component-relative superblock number. */ if (ii->ii_ndisk == 1) { /* * When we have just one disk, it can't be a mirror * or a parity config. */ ccdisk = ii->ii_index[0]; cbn = ii->ii_startoff + off; } else { if (cs->sc_flags & CCDF_MIRROR) { /* * We have forced a uniform mapping, resulting * in a single interleave array. We double * up on the first half of the available * components and our mirror is in the second * half. This only works with a single * interleave array because doubling up * doubles the number of sectors, so there * cannot be another interleave array because * the next interleave array's calculations * would be off.
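* * Editorial aside: for instance, with four uniform components the single interleave array has ii_ndisk = 4 and ndisk2 = 2, so data superblocks alternate over components ii_index[0] and ii_index[1] (off % ndisk2), each mirror copy goes to the component ndisk2 slots later (ccdisk + ndisk2), and the per-component superblock number advances as ii_startoff + off / ndisk2.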
*/ int ndisk2 = ii->ii_ndisk / 2; ccdisk = ii->ii_index[off % ndisk2]; cbn = ii->ii_startoff + off / ndisk2; ci2 = &cs->sc_cinfo[ccdisk + ndisk2]; } else { ccdisk = ii->ii_index[off % ii->ii_ndisk]; cbn = ii->ii_startoff + off / ii->ii_ndisk; } } ci = &cs->sc_cinfo[ccdisk]; /* * Convert cbn from a superblock to a normal block so it * can be used to calculate (along with cboff) the normal * block index into this particular disk. */ cbn *= cs->sc_ileave; } /* * Fill in the component buf structure. */ cbp = g_clone_bio(bp); if (cbp == NULL) return (ENOMEM); cbp->bio_done = g_std_done; cbp->bio_offset = dbtob(cbn + cboff + cs->sc_offset); cbp->bio_data = addr; if (cs->sc_ileave == 0) cbc = dbtob((off_t)(ci->ci_size - cbn)); else cbc = dbtob((off_t)(cs->sc_ileave - cboff)); cbp->bio_length = (cbc < bcount) ? cbc : bcount; cbp->bio_from = ci->ci_consumer; cb[0] = cbp; if (cs->sc_flags & CCDF_MIRROR) { cbp = g_clone_bio(bp); if (cbp == NULL) return (ENOMEM); cbp->bio_done = cb[0]->bio_done = ccdiodone; cbp->bio_offset = cb[0]->bio_offset; cbp->bio_data = cb[0]->bio_data; cbp->bio_length = cb[0]->bio_length; cbp->bio_from = ci2->ci_consumer; cbp->bio_caller1 = cb[0]; cb[0]->bio_caller1 = cbp; cb[1] = cbp; } return (0); } /* * Called only for mirrored operations. */ static void ccdiodone(struct bio *cbp) { struct bio *mbp, *pbp; mbp = cbp->bio_caller1; pbp = cbp->bio_parent; if (pbp->bio_cmd == BIO_READ) { if (cbp->bio_error == 0) { /* We will not be needing the partner bio */ if (mbp != NULL) { pbp->bio_inbed++; g_destroy_bio(mbp); } g_std_done(cbp); return; } if (mbp != NULL) { /* Try the partner bio instead */ mbp->bio_caller1 = NULL; pbp->bio_inbed++; g_destroy_bio(cbp); g_io_request(mbp, mbp->bio_from); /* * XXX: If this comes back OK, we should actually * try to write the good data on the failed mirror */ return; } g_std_done(cbp); return; } if (mbp != NULL) { mbp->bio_caller1 = NULL; pbp->bio_inbed++; if (cbp->bio_error != 0 && pbp->bio_error == 0) pbp->bio_error = cbp->bio_error; g_destroy_bio(cbp); return; } g_std_done(cbp); } static void g_ccd_create(struct gctl_req *req, struct g_class *mp) { int *unit, *ileave, *nprovider; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; struct ccd_s *sc; struct sbuf *sb; char buf[20]; int i, error; g_topology_assert(); unit = gctl_get_paraml(req, "unit", sizeof (*unit)); if (unit == NULL) { gctl_error(req, "unit parameter not given"); return; } ileave = gctl_get_paraml(req, "ileave", sizeof (*ileave)); if (ileave == NULL) { gctl_error(req, "ileave parameter not given"); return; } nprovider = gctl_get_paraml(req, "nprovider", sizeof (*nprovider)); if (nprovider == NULL) { gctl_error(req, "nprovider parameter not given"); return; } /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && sc->sc_unit == *unit) { gctl_error(req, "Unit %d already configured", *unit); return; } } if (*nprovider <= 0) { gctl_error(req, "Bogus nprovider argument (= %d)", *nprovider); return; } /* Check all providers are valid */ for (i = 0; i < *nprovider; i++) { sprintf(buf, "provider%d", i); pp = gctl_get_provider(req, buf); if (pp == NULL) return; } gp = g_new_geomf(mp, "ccd%d", *unit); sc = g_malloc(sizeof *sc, M_WAITOK | M_ZERO); gp->softc = sc; sc->sc_ndisks = *nprovider; /* Allocate space for the component info.
*/ sc->sc_cinfo = g_malloc(sc->sc_ndisks * sizeof(struct ccdcinfo), M_WAITOK | M_ZERO); /* Create consumers and attach to all providers */ for (i = 0; i < *nprovider; i++) { sprintf(buf, "provider%d", i); pp = gctl_get_provider(req, buf); cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("attach to %s failed", pp->name)); sc->sc_cinfo[i].ci_consumer = cp; sc->sc_cinfo[i].ci_provider = pp; } sc->sc_unit = *unit; sc->sc_ileave = *ileave; if (gctl_get_param(req, "no_offset", NULL)) sc->sc_flags |= CCDF_NO_OFFSET; if (gctl_get_param(req, "linux", NULL)) sc->sc_flags |= CCDF_LINUX; if (gctl_get_param(req, "uniform", NULL)) sc->sc_flags |= CCDF_UNIFORM; if (gctl_get_param(req, "mirror", NULL)) sc->sc_flags |= CCDF_MIRROR; if (sc->sc_ileave == 0 && (sc->sc_flags & CCDF_MIRROR)) { printf("%s: disabling mirror, interleave is 0\n", gp->name); sc->sc_flags &= ~(CCDF_MIRROR); } if ((sc->sc_flags & CCDF_MIRROR) && !(sc->sc_flags & CCDF_UNIFORM)) { printf("%s: mirror/parity forces uniform flag\n", gp->name); sc->sc_flags |= CCDF_UNIFORM; } error = ccdinit(req, sc); if (error != 0) { g_ccd_freesc(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return; } pp = g_new_providerf(gp, "%s", gp->name); pp->mediasize = sc->sc_size * (off_t)sc->sc_secsize; pp->sectorsize = sc->sc_secsize; g_error_provider(pp, 0); sb = sbuf_new_auto(); sbuf_printf(sb, "ccd%d: %d components ", sc->sc_unit, *nprovider); for (i = 0; i < *nprovider; i++) { sbuf_printf(sb, "%s%s", i == 0 ? "(" : ", ", sc->sc_cinfo[i].ci_provider->name); } sbuf_printf(sb, "), %jd blocks ", (off_t)pp->mediasize / DEV_BSIZE); if (sc->sc_ileave != 0) sbuf_printf(sb, "interleaved at %d blocks\n", sc->sc_ileave); else sbuf_printf(sb, "concatenated\n"); sbuf_finish(sb); gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } static int g_ccd_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct g_provider *pp; struct ccd_s *sc; g_topology_assert(); sc = gp->softc; pp = LIST_FIRST(&gp->provider); if (sc == NULL || pp == NULL) return (EBUSY); if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { gctl_error(req, "%s is open(r%dw%de%d)", gp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } g_ccd_freesc(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static void g_ccd_list(struct gctl_req *req, struct g_class *mp) { struct sbuf *sb; struct ccd_s *cs; struct g_geom *gp; int i, unit, *up; up = gctl_get_paraml(req, "unit", sizeof (*up)); if (up == NULL) { gctl_error(req, "unit parameter not given"); return; } unit = *up; sb = sbuf_new_auto(); LIST_FOREACH(gp, &mp->geom, geom) { cs = gp->softc; if (cs == NULL || (unit >= 0 && unit != cs->sc_unit)) continue; sbuf_printf(sb, "ccd%d\t\t%d\t%d\t", cs->sc_unit, cs->sc_ileave, cs->sc_flags & CCDF_USERMASK); for (i = 0; i < cs->sc_ndisks; ++i) { sbuf_printf(sb, "%s/dev/%s", i == 0 ? 
"" : " ", cs->sc_cinfo[i].ci_provider->name); } sbuf_printf(sb, "\n"); } sbuf_finish(sb); gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } static void g_ccd_config(struct gctl_req *req, struct g_class *mp, char const *verb) { struct g_geom *gp; g_topology_assert(); if (!strcmp(verb, "create geom")) { g_ccd_create(req, mp); } else if (!strcmp(verb, "destroy geom")) { gp = gctl_get_geom(req, mp, "geom"); if (gp != NULL) g_ccd_destroy_geom(req, mp, gp); } else if (!strcmp(verb, "list")) { g_ccd_list(req, mp); } else { gctl_error(req, "unknown verb"); } } static struct g_class g_ccd_class = { .name = "CCD", .version = G_VERSION, .ctlreq = g_ccd_config, .destroy_geom = g_ccd_destroy_geom, .start = g_ccd_start, .orphan = g_ccd_orphan, .access = g_ccd_access, }; DECLARE_GEOM_CLASS(g_ccd_class, g_ccd); Index: head/sys/geom/geom_ctl.c =================================================================== --- head/sys/geom/geom_ctl.c (revision 326269) +++ head/sys/geom/geom_ctl.c (revision 326270) @@ -1,513 +1,515 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_geom.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define GCTL_TABLE 1 #include #include static d_ioctl_t g_ctl_ioctl; static struct cdevsw g_ctl_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_ioctl = g_ctl_ioctl, .d_name = "g_ctl", }; void g_ctl_init(void) { make_dev_credf(MAKEDEV_ETERNAL, &g_ctl_cdevsw, 0, NULL, UID_ROOT, GID_OPERATOR, 0640, PATH_GEOM_CTL); KASSERT(GCTL_PARAM_RD == VM_PROT_READ, ("GCTL_PARAM_RD != VM_PROT_READ")); KASSERT(GCTL_PARAM_WR == VM_PROT_WRITE, ("GCTL_PARAM_WR != VM_PROT_WRITE")); } /* * Report an error back to the user in ascii format. Return nerror * or EINVAL if nerror isn't specified. */ int gctl_error(struct gctl_req *req, const char *fmt, ...) { va_list ap; if (req == NULL) return (EINVAL); /* We only record the first error */ if (sbuf_done(req->serror)) { if (!req->nerror) req->nerror = EEXIST; return (req->nerror); } if (!req->nerror) req->nerror = EINVAL; va_start(ap, fmt); sbuf_vprintf(req->serror, fmt, ap); va_end(ap); sbuf_finish(req->serror); if (g_debugflags & G_F_CTLDUMP) printf("gctl %p error \"%s\"\n", req, sbuf_data(req->serror)); return (req->nerror); } /* * Allocate space and copyin() something. * XXX: this should really be a standard function in the kernel. */ static void * geom_alloc_copyin(struct gctl_req *req, void *uaddr, size_t len) { void *ptr; ptr = g_malloc(len, M_WAITOK); req->nerror = copyin(uaddr, ptr, len); if (!req->nerror) return (ptr); g_free(ptr); return (NULL); } static void gctl_copyin(struct gctl_req *req) { struct gctl_req_arg *ap; char *p; u_int i; ap = geom_alloc_copyin(req, req->arg, req->narg * sizeof(*ap)); if (ap == NULL) { gctl_error(req, "bad control request"); req->arg = NULL; return; } /* Nothing has been copyin()'ed yet */ for (i = 0; i < req->narg; i++) { ap[i].flag &= ~(GCTL_PARAM_NAMEKERNEL|GCTL_PARAM_VALUEKERNEL); ap[i].flag &= ~GCTL_PARAM_CHANGED; ap[i].kvalue = NULL; } for (i = 0; i < req->narg; i++) { if (ap[i].nlen < 1 || ap[i].nlen > SPECNAMELEN) { gctl_error(req, "wrong param name length %d: %d", i, ap[i].nlen); break; } p = geom_alloc_copyin(req, ap[i].name, ap[i].nlen); if (p == NULL) break; if (p[ap[i].nlen - 1] != '\0') { gctl_error(req, "unterminated param name"); g_free(p); break; } ap[i].name = p; ap[i].flag |= GCTL_PARAM_NAMEKERNEL; if (ap[i].len <= 0) { gctl_error(req, "negative param length"); break; } p = geom_alloc_copyin(req, ap[i].value, ap[i].len); if (p == NULL) break; if ((ap[i].flag & GCTL_PARAM_ASCII) && p[ap[i].len - 1] != '\0') { gctl_error(req, "unterminated param value"); g_free(p); break; } ap[i].kvalue = p; ap[i].flag |= GCTL_PARAM_VALUEKERNEL; } req->arg = ap; return; } static void gctl_copyout(struct gctl_req *req) { int error, i; struct gctl_req_arg *ap; if (req->nerror) return; error = 0; ap = req->arg; for (i = 0; i < req->narg; i++, ap++) { if (!(ap->flag & GCTL_PARAM_CHANGED)) continue; error = copyout(ap->kvalue, ap->value, ap->len); if (!error) continue; req->nerror = error; return; } return; } static void gctl_free(struct gctl_req *req) { u_int i; sbuf_delete(req->serror); if (req->arg == NULL) return; for (i = 0; i < req->narg; i++) { if (req->arg[i].flag & GCTL_PARAM_NAMEKERNEL) g_free(req->arg[i].name); if ((req->arg[i].flag & GCTL_PARAM_VALUEKERNEL) && req->arg[i].len > 0) g_free(req->arg[i].kvalue); } g_free(req->arg); } static void gctl_dump(struct gctl_req *req) { struct gctl_req_arg *ap; u_int
i; int j; printf("Dump of gctl request at %p:\n", req); if (req->nerror > 0) { printf(" nerror:\t%d\n", req->nerror); if (sbuf_len(req->serror) > 0) printf(" error:\t\"%s\"\n", sbuf_data(req->serror)); } if (req->arg == NULL) return; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (!(ap->flag & GCTL_PARAM_NAMEKERNEL)) printf(" param:\t%d@%p", ap->nlen, ap->name); else printf(" param:\t\"%s\"", ap->name); printf(" [%s%s%d] = ", ap->flag & GCTL_PARAM_RD ? "R" : "", ap->flag & GCTL_PARAM_WR ? "W" : "", ap->len); if (!(ap->flag & GCTL_PARAM_VALUEKERNEL)) { printf(" =@ %p", ap->value); } else if (ap->flag & GCTL_PARAM_ASCII) { printf("\"%s\"", (char *)ap->kvalue); } else if (ap->len > 0) { for (j = 0; j < ap->len && j < 512; j++) printf(" %02x", ((u_char *)ap->kvalue)[j]); } else { printf(" = %p", ap->kvalue); } printf("\n"); } } int gctl_set_param(struct gctl_req *req, const char *param, void const *ptr, int len) { u_int i; struct gctl_req_arg *ap; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (strcmp(param, ap->name)) continue; if (!(ap->flag & GCTL_PARAM_WR)) return (EPERM); ap->flag |= GCTL_PARAM_CHANGED; if (ap->len < len) { bcopy(ptr, ap->kvalue, ap->len); return (ENOSPC); } bcopy(ptr, ap->kvalue, len); return (0); } return (EINVAL); } void gctl_set_param_err(struct gctl_req *req, const char *param, void const *ptr, int len) { switch (gctl_set_param(req, param, ptr, len)) { case EPERM: gctl_error(req, "No write access %s argument", param); break; case ENOSPC: gctl_error(req, "Wrong length %s argument", param); break; case EINVAL: gctl_error(req, "Missing %s argument", param); break; } } void * gctl_get_param(struct gctl_req *req, const char *param, int *len) { u_int i; void *p; struct gctl_req_arg *ap; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (strcmp(param, ap->name)) continue; if (!(ap->flag & GCTL_PARAM_RD)) continue; p = ap->kvalue; if (len != NULL) *len = ap->len; return (p); } return (NULL); } char const * gctl_get_asciiparam(struct gctl_req *req, const char *param) { u_int i; char const *p; struct gctl_req_arg *ap; for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; if (strcmp(param, ap->name)) continue; if (!(ap->flag & GCTL_PARAM_RD)) continue; p = ap->kvalue; if (ap->len < 1) { gctl_error(req, "No length argument (%s)", param); return (NULL); } if (p[ap->len - 1] != '\0') { gctl_error(req, "Unterminated argument (%s)", param); return (NULL); } return (p); } return (NULL); } void * gctl_get_paraml(struct gctl_req *req, const char *param, int len) { int i; void *p; p = gctl_get_param(req, param, &i); if (p == NULL) gctl_error(req, "Missing %s argument", param); else if (i != len) { p = NULL; gctl_error(req, "Wrong length %s argument", param); } return (p); } struct g_class * gctl_get_class(struct gctl_req *req, char const *arg) { char const *p; struct g_class *cp; p = gctl_get_asciiparam(req, arg); if (p == NULL) return (NULL); LIST_FOREACH(cp, &g_classes, class) { if (!strcmp(p, cp->name)) return (cp); } return (NULL); } struct g_geom * gctl_get_geom(struct gctl_req *req, struct g_class *mpr, char const *arg) { char const *p; struct g_class *mp; struct g_geom *gp; p = gctl_get_asciiparam(req, arg); if (p == NULL) return (NULL); LIST_FOREACH(mp, &g_classes, class) { if (mpr != NULL && mpr != mp) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (!strcmp(p, gp->name)) return (gp); } } gctl_error(req, "Geom not found: \"%s\"", p); return (NULL); } struct g_provider * gctl_get_provider(struct gctl_req *req, char const *arg) { char const *p; struct 
g_provider *pp; p = gctl_get_asciiparam(req, arg); if (p == NULL) return (NULL); pp = g_provider_by_name(p); if (pp != NULL) return (pp); gctl_error(req, "Provider not found: \"%s\"", p); return (NULL); } static void g_ctl_req(void *arg, int flag __unused) { struct g_class *mp; struct gctl_req *req; char const *verb; g_topology_assert(); req = arg; mp = gctl_get_class(req, "class"); if (mp == NULL) { gctl_error(req, "Class not found"); return; } if (mp->ctlreq == NULL) { gctl_error(req, "Class takes no requests"); return; } verb = gctl_get_param(req, "verb", NULL); if (verb == NULL) { gctl_error(req, "Verb missing"); return; } mp->ctlreq(req, mp, verb); g_topology_assert(); } static int g_ctl_ioctl_ctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct gctl_req *req; int nerror; req = (void *)data; req->nerror = 0; /* It is an error if we cannot return an error text */ if (req->lerror < 2) return (EINVAL); if (!useracc(req->error, req->lerror, VM_PROT_WRITE)) return (EINVAL); req->serror = sbuf_new_auto(); /* Check the version */ if (req->version != GCTL_VERSION) { gctl_error(req, "kernel and libgeom version mismatch."); req->arg = NULL; } else { /* Get things on board */ gctl_copyin(req); if (g_debugflags & G_F_CTLDUMP) gctl_dump(req); if (!req->nerror) { g_waitfor_event(g_ctl_req, req, M_WAITOK, NULL); gctl_copyout(req); } } if (sbuf_done(req->serror)) { copyout(sbuf_data(req->serror), req->error, imin(req->lerror, sbuf_len(req->serror) + 1)); } nerror = req->nerror; gctl_free(req); return (nerror); } static int g_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error; switch(cmd) { case GEOM_CTL: error = g_ctl_ioctl_ctl(dev, cmd, data, fflag, td); break; default: error = ENOIOCTL; break; } return (error); } Index: head/sys/geom/geom_ctl.h =================================================================== --- head/sys/geom/geom_ctl.h (revision 326269) +++ head/sys/geom/geom_ctl.h (revision 326270) @@ -1,82 +1,84 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _GEOM_GEOM_CTL_H_ #define _GEOM_GEOM_CTL_H_ #include /* * Version number. Used to check consistency between kernel and libgeom. */ #define GCTL_VERSION 2 struct gctl_req_arg { u_int nlen; char *name; off_t offset; int flag; int len; void *value; /* kernel only fields */ void *kvalue; }; #define GCTL_PARAM_RD 1 /* Must match VM_PROT_READ */ #define GCTL_PARAM_WR 2 /* Must match VM_PROT_WRITE */ #define GCTL_PARAM_RW (GCTL_PARAM_RD | GCTL_PARAM_WR) #define GCTL_PARAM_ASCII 4 /* These are used in the kernel only */ #define GCTL_PARAM_NAMEKERNEL 8 #define GCTL_PARAM_VALUEKERNEL 16 #define GCTL_PARAM_CHANGED 32 struct gctl_req { u_int version; u_int serial; u_int narg; struct gctl_req_arg *arg; u_int lerror; char *error; struct gctl_req_table *reqt; /* kernel only fields */ int nerror; struct sbuf *serror; }; #define GEOM_CTL _IOW('G', GCTL_VERSION, struct gctl_req) #define PATH_GEOM_CTL "geom.ctl" #endif /* _GEOM_GEOM_CTL_H_ */ Index: head/sys/geom/geom_dev.c =================================================================== --- head/sys/geom/geom_dev.c (revision 326269) +++ head/sys/geom/geom_dev.c (revision 326270) @@ -1,842 +1,844 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct g_dev_softc { struct mtx sc_mtx; struct cdev *sc_dev; struct cdev *sc_alias; int sc_open; int sc_active; }; static d_open_t g_dev_open; static d_close_t g_dev_close; static d_strategy_t g_dev_strategy; static d_ioctl_t g_dev_ioctl; static struct cdevsw g_dev_cdevsw = { .d_version = D_VERSION, .d_open = g_dev_open, .d_close = g_dev_close, .d_read = physread, .d_write = physwrite, .d_ioctl = g_dev_ioctl, .d_strategy = g_dev_strategy, .d_name = "g_dev", .d_flags = D_DISK | D_TRACKCLOSE, }; static g_init_t g_dev_init; static g_fini_t g_dev_fini; static g_taste_t g_dev_taste; static g_orphan_t g_dev_orphan; static g_attrchanged_t g_dev_attrchanged; static struct g_class g_dev_class = { .name = "DEV", .version = G_VERSION, .init = g_dev_init, .fini = g_dev_fini, .taste = g_dev_taste, .orphan = g_dev_orphan, .attrchanged = g_dev_attrchanged }; /* * We target 262144 (8 x 32768) sectors by default as this significantly * increases the throughput on commonly used SSD's with a marginal * increase in non-interruptible request latency. */ static uint64_t g_dev_del_max_sectors = 262144; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, dev, CTLFLAG_RW, 0, "GEOM_DEV stuff"); SYSCTL_QUAD(_kern_geom_dev, OID_AUTO, delete_max_sectors, CTLFLAG_RW, &g_dev_del_max_sectors, 0, "Maximum number of sectors in a single " "delete request sent to the provider. Larger requests are chunked " "so they can be interrupted. (0 = disable chunking)"); static char *dumpdev = NULL; static void g_dev_init(struct g_class *mp) { dumpdev = kern_getenv("dumpdev"); } static void g_dev_fini(struct g_class *mp) { freeenv(dumpdev); dumpdev = NULL; } static int g_dev_setdumpdev(struct cdev *dev, struct diocskerneldump_arg *kda, struct thread *td) { struct g_kerneldump kd; struct g_consumer *cp; int error, len; if (dev == NULL || kda == NULL) return (set_dumper(NULL, NULL, td, 0, 0, NULL, 0, NULL)); cp = dev->si_drv2; len = sizeof(kd); kd.offset = 0; kd.length = OFF_MAX; error = g_io_getattr("GEOM::kerneldump", cp, &len, &kd); if (error != 0) return (error); error = set_dumper(&kd.di, devtoname(dev), td, kda->kda_compression, kda->kda_encryption, kda->kda_key, kda->kda_encryptedkeysize, kda->kda_encryptedkey); if (error == 0) dev->si_flags |= SI_DUMPDEV; return (error); } static int init_dumpdev(struct cdev *dev) { struct diocskerneldump_arg kda; struct g_consumer *cp; const char *devprefix = "/dev/", *devname; int error; size_t len; bzero(&kda, sizeof(kda)); kda.kda_enable = 1; if (dumpdev == NULL) return (0); len = strlen(devprefix); devname = devtoname(dev); if (strcmp(devname, dumpdev) != 0 && (strncmp(dumpdev, devprefix, len) != 0 || strcmp(devname, dumpdev + len) != 0)) return (0); cp = (struct g_consumer *)dev->si_drv2; error = g_access(cp, 1, 0, 0); if (error != 0) return (error); error = g_dev_setdumpdev(dev, &kda, curthread); if (error == 0) { freeenv(dumpdev); dumpdev = NULL; } (void)g_access(cp, -1, 0, 0); return (error); } static void g_dev_destroy(void *arg, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_dev_softc *sc; char buf[SPECNAMELEN + 6]; g_topology_assert(); cp = arg; gp = cp->geom; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_destroy(%p(%s))", cp, gp->name); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify_f("GEOM", "DEV", 
"DESTROY", buf, M_WAITOK); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); } void g_dev_print(void) { struct g_geom *gp; char const *p = ""; LIST_FOREACH(gp, &g_dev_class.geom, geom) { printf("%s%s", p, gp->name); p = " "; } printf("\n"); } static void g_dev_set_physpath(struct g_consumer *cp) { struct g_dev_softc *sc; char *physpath; int error, physpath_len; if (g_access(cp, 1, 0, 0) != 0) return; sc = cp->private; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); g_access(cp, -1, 0, 0); if (error == 0 && strlen(physpath) != 0) { struct cdev *dev, *old_alias_dev; struct cdev **alias_devp; dev = sc->sc_dev; old_alias_dev = sc->sc_alias; alias_devp = (struct cdev **)&sc->sc_alias; make_dev_physpath_alias(MAKEDEV_WAITOK, alias_devp, dev, old_alias_dev, physpath); } else if (sc->sc_alias) { destroy_dev((struct cdev *)sc->sc_alias); sc->sc_alias = NULL; } g_free(physpath); } static void g_dev_set_media(struct g_consumer *cp) { struct g_dev_softc *sc; struct cdev *dev; char buf[SPECNAMELEN + 6]; sc = cp->private; dev = sc->sc_dev; snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify_f("DEVFS", "CDEV", "MEDIACHANGE", buf, M_WAITOK); devctl_notify_f("GEOM", "DEV", "MEDIACHANGE", buf, M_WAITOK); dev = sc->sc_alias; if (dev != NULL) { snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify_f("DEVFS", "CDEV", "MEDIACHANGE", buf, M_WAITOK); devctl_notify_f("GEOM", "DEV", "MEDIACHANGE", buf, M_WAITOK); } } static void g_dev_attrchanged(struct g_consumer *cp, const char *attr) { if (strcmp(attr, "GEOM::media") == 0) { g_dev_set_media(cp); return; } if (strcmp(attr, "GEOM::physpath") == 0) { g_dev_set_physpath(cp); return; } } struct g_provider * g_dev_getprovider(struct cdev *dev) { struct g_consumer *cp; g_topology_assert(); if (dev == NULL) return (NULL); if (dev->si_devsw != &g_dev_cdevsw) return (NULL); cp = dev->si_drv2; return (cp->provider); } static struct g_geom * g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_geom *gp; struct g_geom_alias *gap; struct g_consumer *cp; struct g_dev_softc *sc; int error; struct cdev *dev, *adev; char buf[SPECNAMELEN + 6]; g_trace(G_T_TOPOLOGY, "dev_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); gp = g_new_geomf(mp, "%s", pp->name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "g_dev", NULL, MTX_DEF); cp = g_new_consumer(gp); cp->private = sc; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); KASSERT(error == 0, ("g_dev_taste(%s) failed to g_attach, err=%d", pp->name, error)); error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &dev, &g_dev_cdevsw, NULL, UID_ROOT, GID_OPERATOR, 0640, "%s", gp->name); if (error != 0) { printf("%s: make_dev_p() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); return (NULL); } dev->si_flags |= SI_UNMAPPED; sc->sc_dev = dev; dev->si_iosize_max = MAXPHYS; dev->si_drv2 = cp; error = init_dumpdev(dev); if (error != 0) printf("%s: init_dumpdev() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_dev_attrchanged(cp, "GEOM::physpath"); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify_f("GEOM", "DEV", "CREATE", buf, M_WAITOK); /* * Now add all 
 * the aliases for this drive
 */
	LIST_FOREACH(gap, &pp->geom->aliases, ga_next) {
		error = make_dev_alias_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
		    &adev, dev, "%s", gap->ga_alias);
		if (error) {
			printf("%s: make_dev_alias_p() failed (name=%s, error=%d)\n",
			    __func__, gap->ga_alias, error);
			continue;
		}
		snprintf(buf, sizeof(buf), "cdev=%s", gap->ga_alias);
		devctl_notify_f("GEOM", "DEV", "CREATE", buf, M_WAITOK);
	}
	return (gp);
}

static int
g_dev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	struct g_consumer *cp;
	struct g_dev_softc *sc;
	int error, r, w, e;

	cp = dev->si_drv2;
	if (cp == NULL)
		return (ENXIO);		/* g_dev_taste() not done yet */
	g_trace(G_T_ACCESS, "g_dev_open(%s, %d, %d, %p)",
	    cp->geom->name, flags, fmt, td);

	r = flags & FREAD ? 1 : 0;
	w = flags & FWRITE ? 1 : 0;
#ifdef notyet
	e = flags & O_EXCL ? 1 : 0;
#else
	e = 0;
#endif

	/*
	 * This happens on an attempt to open a device node with O_EXEC.
	 */
	if (r + w + e == 0)
		return (EINVAL);

	if (w) {
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disks.
		 */
		error = securelevel_ge(td->td_ucred, 2);
		if (error)
			return (error);
	}
	g_topology_lock();
	error = g_access(cp, r, w, e);
	g_topology_unlock();
	if (error == 0) {
		sc = cp->private;
		mtx_lock(&sc->sc_mtx);
		if (sc->sc_open == 0 && sc->sc_active != 0)
			wakeup(&sc->sc_active);
		sc->sc_open += r + w + e;
		mtx_unlock(&sc->sc_mtx);
	}
	return (error);
}

static int
g_dev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	struct g_consumer *cp;
	struct g_dev_softc *sc;
	int error, r, w, e;

	cp = dev->si_drv2;
	if (cp == NULL)
		return (ENXIO);
	g_trace(G_T_ACCESS, "g_dev_close(%s, %d, %d, %p)",
	    cp->geom->name, flags, fmt, td);

	r = flags & FREAD ? -1 : 0;
	w = flags & FWRITE ? -1 : 0;
#ifdef notyet
	e = flags & O_EXCL ? -1 : 0;
#else
	e = 0;
#endif

	/*
	 * vgonel(9), caused by e.g. a forced unmount of devfs, calls
	 * VOP_CLOSE(9) on the devfs vnode without any FREAD or FWRITE
	 * flags, which would result in zero deltas and in turn a panic
	 * in g_access(9).
	 *
	 * Note that we cannot zero the counters (i.e. do "r = cp->acr"
	 * etc) instead, because the consumer might be opened in another
	 * devfs instance.
	 */
	if (r + w + e == 0)
		return (EINVAL);

	sc = cp->private;
	mtx_lock(&sc->sc_mtx);
	sc->sc_open += r + w + e;
	while (sc->sc_open == 0 && sc->sc_active != 0)
		msleep(&sc->sc_active, &sc->sc_mtx, 0, "PRIBIO", 0);
	mtx_unlock(&sc->sc_mtx);
	g_topology_lock();
	error = g_access(cp, r, w, e);
	g_topology_unlock();
	return (error);
}

/*
 * XXX: Until we have unmessed the ioctl situation, there is a race against
 * XXX: a concurrent orphanization.  We cannot close it by holding topology
 * XXX: since that would prevent us from doing our job, and stalling events
 * XXX: will break (actually: stall) the BSD disklabel hacks.
 */
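
/*
 * Userland toy (FreeBSD: FREAD/FWRITE are visible via <fcntl.h>) mirroring
 * the r/w/e delta bookkeeping of g_dev_open()/g_dev_close() above.  Each
 * open adds +1 deltas, each close the matching -1 deltas, and the
 * consumer's acr/acw/ace counters are simply the running sums.
 */
#include <fcntl.h>
#include <stdio.h>

static void
deltas(int flags, int sign, int *r, int *w, int *e)
{
        *r = (flags & FREAD) ? sign : 0;
        *w = (flags & FWRITE) ? sign : 0;
        *e = 0;         /* the O_EXCL mapping above is still "#ifdef notyet" */
}

int
main(void)
{
        int acr = 0, acw = 0, ace = 0, r, w, e;

        deltas(FREAD | FWRITE, 1, &r, &w, &e);  /* open(2) for read-write */
        acr += r; acw += w; ace += e;
        deltas(FREAD, 1, &r, &w, &e);           /* second open, read-only */
        acr += r; acw += w; ace += e;
        deltas(FREAD | FWRITE, -1, &r, &w, &e); /* first descriptor closed */
        acr += r; acw += w; ace += e;
        printf("acr=%d acw=%d ace=%d\n", acr, acw, ace);        /* 1 0 0 */
        return (0);
}
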
static int
g_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	struct g_consumer *cp;
	struct g_provider *pp;
	off_t offset, length, chunk, odd;
	int i, error;

	cp = dev->si_drv2;
	pp = cp->provider;

	error = 0;
	KASSERT(cp->acr || cp->acw,
	    ("Consumer with zero access count in g_dev_ioctl"));

	i = IOCPARM_LEN(cmd);
	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(u_int *)data = cp->provider->sectorsize;
		if (*(u_int *)data == 0)
			error = ENOENT;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = cp->provider->mediasize;
		if (*(off_t *)data == 0)
			error = ENOENT;
		break;
	case DIOCGFWSECTORS:
		error = g_io_getattr("GEOM::fwsectors", cp, &i, data);
		if (error == 0 && *(u_int *)data == 0)
			error = ENOENT;
		break;
	case DIOCGFWHEADS:
		error = g_io_getattr("GEOM::fwheads", cp, &i, data);
		if (error == 0 && *(u_int *)data == 0)
			error = ENOENT;
		break;
	case DIOCGFRONTSTUFF:
		error = g_io_getattr("GEOM::frontstuff", cp, &i, data);
		break;
#ifdef COMPAT_FREEBSD11
	case DIOCSKERNELDUMP_FREEBSD11:
	    {
		struct diocskerneldump_arg kda;

		bzero(&kda, sizeof(kda));
		kda.kda_encryption = KERNELDUMP_ENC_NONE;
		kda.kda_enable = (uint8_t)*(u_int *)data;
		if (kda.kda_enable == 0)
			error = g_dev_setdumpdev(NULL, NULL, td);
		else
			error = g_dev_setdumpdev(dev, &kda, td);
		break;
	    }
#endif
	case DIOCSKERNELDUMP:
	    {
		struct diocskerneldump_arg *kda;
		uint8_t *encryptedkey;

		kda = (struct diocskerneldump_arg *)data;
		if (kda->kda_enable == 0) {
			error = g_dev_setdumpdev(NULL, NULL, td);
			break;
		}

		if (kda->kda_encryption != KERNELDUMP_ENC_NONE) {
			if (kda->kda_encryptedkeysize <= 0 ||
			    kda->kda_encryptedkeysize >
			    KERNELDUMP_ENCKEY_MAX_SIZE) {
				return (EINVAL);
			}
			encryptedkey = malloc(kda->kda_encryptedkeysize,
			    M_TEMP, M_WAITOK);
			error = copyin(kda->kda_encryptedkey, encryptedkey,
			    kda->kda_encryptedkeysize);
		} else {
			encryptedkey = NULL;
		}
		if (error == 0) {
			kda->kda_encryptedkey = encryptedkey;
			error = g_dev_setdumpdev(dev, kda, td);
		}
		if (encryptedkey != NULL) {
			explicit_bzero(encryptedkey,
			    kda->kda_encryptedkeysize);
			free(encryptedkey, M_TEMP);
		}
		explicit_bzero(kda, sizeof(*kda));
		break;
	    }
	case DIOCGFLUSH:
		error = g_io_flush(cp);
		break;
	case DIOCGDELETE:
		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % cp->provider->sectorsize) != 0 ||
		    (length % cp->provider->sectorsize) != 0 || length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__,
			    offset, length);
			error = EINVAL;
			break;
		}
		while (length > 0) {
			chunk = length;
			if (g_dev_del_max_sectors != 0 &&
			    chunk > g_dev_del_max_sectors *
			    cp->provider->sectorsize) {
				chunk = g_dev_del_max_sectors *
				    cp->provider->sectorsize;
				if (cp->provider->stripesize > 0) {
					odd = (offset + chunk +
					    cp->provider->stripeoffset) %
					    cp->provider->stripesize;
					if (chunk > odd)
						chunk -= odd;
				}
			}
			error = g_delete_data(cp, offset, chunk);
			length -= chunk;
			offset += chunk;
			if (error)
				break;
			/*
			 * Since the request size can be large, the service
			 * time can likewise be large.  We make this ioctl
			 * interruptible by checking for signals for each bio.
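
/*
 * The chunk computation above, restated as a standalone helper (names are
 * invented): with the default 262144-sector limit and 512-byte sectors,
 * each BIO_DELETE is capped at 262144 * 512 = 128 MiB and then trimmed back
 * so the next chunk starts on a stripe boundary.
 */
#include <sys/types.h>

static off_t
delete_chunk(off_t offset, off_t length, off_t max_sectors, u_int sectorsize,
    off_t stripesize, off_t stripeoffset)
{
        off_t chunk, odd;

        chunk = length;
        if (max_sectors != 0 && chunk > max_sectors * sectorsize) {
                chunk = max_sectors * sectorsize;
                if (stripesize > 0) {
                        odd = (offset + chunk + stripeoffset) % stripesize;
                        if (chunk > odd)
                                chunk -= odd;
                }
        }
        /* delete_chunk(0, 1 << 30, 262144, 512, 128 * 1024, 0) == 128 MiB */
        return (chunk);
}
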
*/ if (SIGPENDING(td)) break; } break; case DIOCGIDENT: error = g_io_getattr("GEOM::ident", cp, &i, data); break; case DIOCGPROVIDERNAME: if (pp == NULL) return (ENOENT); strlcpy(data, pp->name, i); break; case DIOCGSTRIPESIZE: *(off_t *)data = cp->provider->stripesize; break; case DIOCGSTRIPEOFFSET: *(off_t *)data = cp->provider->stripeoffset; break; case DIOCGPHYSPATH: error = g_io_getattr("GEOM::physpath", cp, &i, data); if (error == 0 && *(char *)data == '\0') error = ENOENT; break; case DIOCGATTR: { struct diocgattr_arg *arg = (struct diocgattr_arg *)data; if (arg->len > sizeof(arg->value)) { error = EINVAL; break; } error = g_io_getattr(arg->name, cp, &arg->len, &arg->value); break; } case DIOCZONECMD: { struct disk_zone_args *zone_args =(struct disk_zone_args *)data; struct disk_zone_rep_entry *new_entries, *old_entries; struct disk_zone_report *rep; size_t alloc_size; old_entries = NULL; new_entries = NULL; rep = NULL; alloc_size = 0; if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) { rep = &zone_args->zone_params.report; alloc_size = rep->entries_allocated * sizeof(struct disk_zone_rep_entry); if (alloc_size != 0) new_entries = g_malloc(alloc_size, M_WAITOK| M_ZERO); old_entries = rep->entries; rep->entries = new_entries; } error = g_io_zonecmd(zone_args, cp); if ((zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) && (alloc_size != 0) && (error == 0)) { error = copyout(new_entries, old_entries, alloc_size); } if ((old_entries != NULL) && (rep != NULL)) rep->entries = old_entries; if (new_entries != NULL) g_free(new_entries); break; } default: if (cp->provider->geom->ioctl != NULL) { error = cp->provider->geom->ioctl(cp->provider, cmd, data, fflag, td); } else { error = ENOIOCTL; } } return (error); } static void g_dev_done(struct bio *bp2) { struct g_consumer *cp; struct g_dev_softc *sc; struct bio *bp; int destroy; cp = bp2->bio_from; sc = cp->private; bp = bp2->bio_parent; bp->bio_error = bp2->bio_error; bp->bio_completed = bp2->bio_completed; bp->bio_resid = bp->bio_length - bp2->bio_completed; if (bp2->bio_cmd == BIO_ZONE) bcopy(&bp2->bio_zone, &bp->bio_zone, sizeof(bp->bio_zone)); if (bp2->bio_error != 0) { g_trace(G_T_BIO, "g_dev_done(%p) had error %d", bp2, bp2->bio_error); bp->bio_flags |= BIO_ERROR; } else { g_trace(G_T_BIO, "g_dev_done(%p/%p) resid %ld completed %jd", bp2, bp, bp2->bio_resid, (intmax_t)bp2->bio_completed); } g_destroy_bio(bp2); destroy = 0; mtx_lock(&sc->sc_mtx); if ((--sc->sc_active) == 0) { if (sc->sc_open == 0) wakeup(&sc->sc_active); if (sc->sc_dev == NULL) destroy = 1; } mtx_unlock(&sc->sc_mtx); if (destroy) g_post_event(g_dev_destroy, cp, M_NOWAIT, NULL); biodone(bp); } static void g_dev_strategy(struct bio *bp) { struct g_consumer *cp; struct bio *bp2; struct cdev *dev; struct g_dev_softc *sc; KASSERT(bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE || bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_ZONE, ("Wrong bio_cmd bio=%p cmd=%d", bp, bp->bio_cmd)); dev = bp->bio_dev; cp = dev->si_drv2; sc = cp->private; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_strategy")); biotrack(bp, __func__); #ifdef INVARIANTS if ((bp->bio_offset % cp->provider->sectorsize) != 0 || (bp->bio_bcount % cp->provider->sectorsize) != 0) { bp->bio_resid = bp->bio_bcount; biofinish(bp, NULL, EINVAL); return; } #endif mtx_lock(&sc->sc_mtx); KASSERT(sc->sc_open > 0, ("Closed device in g_dev_strategy")); sc->sc_active++; mtx_unlock(&sc->sc_mtx); for (;;) { /* * XXX: This is not an ideal solution, but I believe it to * 
XXX: deadlock safely, all things considered. */ bp2 = g_clone_bio(bp); if (bp2 != NULL) break; pause("gdstrat", hz / 10); } KASSERT(bp2 != NULL, ("XXX: ENOMEM in a bad place")); bp2->bio_done = g_dev_done; g_trace(G_T_BIO, "g_dev_strategy(%p/%p) offset %jd length %jd data %p cmd %d", bp, bp2, (intmax_t)bp->bio_offset, (intmax_t)bp2->bio_length, bp2->bio_data, bp2->bio_cmd); g_io_request(bp2, cp); KASSERT(cp->acr || cp->acw, ("g_dev_strategy raced with g_dev_close and lost")); } /* * g_dev_callback() * * Called by devfs when asynchronous device destruction is completed. * - Mark that we have no attached device any more. * - If there are no outstanding requests, schedule geom destruction. * Otherwise destruction will be scheduled later by g_dev_done(). */ static void g_dev_callback(void *arg) { struct g_consumer *cp; struct g_dev_softc *sc; int destroy; cp = arg; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_callback(%p(%s))", cp, cp->geom->name); mtx_lock(&sc->sc_mtx); sc->sc_dev = NULL; sc->sc_alias = NULL; destroy = (sc->sc_active == 0); mtx_unlock(&sc->sc_mtx); if (destroy) g_post_event(g_dev_destroy, cp, M_WAITOK, NULL); } /* * g_dev_orphan() * * Called from below when the provider orphaned us. * - Clear any dump settings. * - Request asynchronous device destruction to prevent any more requests * from coming in. The provider is already marked with an error, so * anything which comes in the interim will be returned immediately. */ static void g_dev_orphan(struct g_consumer *cp) { struct cdev *dev; struct g_dev_softc *sc; g_topology_assert(); sc = cp->private; dev = sc->sc_dev; g_trace(G_T_TOPOLOGY, "g_dev_orphan(%p(%s))", cp, cp->geom->name); /* Reset any dump-area set on this device */ if (dev->si_flags & SI_DUMPDEV) (void)set_dumper(NULL, NULL, curthread, 0, 0, NULL, 0, NULL); /* Destroy the struct cdev *so we get no more requests */ destroy_dev_sched_cb(dev, g_dev_callback, cp); } DECLARE_GEOM_CLASS(g_dev_class, g_dev); Index: head/sys/geom/geom_disk.c =================================================================== --- head/sys/geom/geom_disk.c (revision 326269) +++ head/sys/geom/geom_disk.c (revision 326270) @@ -1,1067 +1,1069 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_geom.h"

#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include

struct g_disk_softc {
	struct mtx done_mtx;
	struct disk *dp;
	struct sysctl_ctx_list sysctl_ctx;
	struct sysctl_oid *sysctl_tree;
	char led[64];
	uint32_t state;
	struct mtx start_mtx;
};

static g_access_t g_disk_access;
static g_start_t g_disk_start;
static g_ioctl_t g_disk_ioctl;
static g_dumpconf_t g_disk_dumpconf;
static g_provgone_t g_disk_providergone;

static int g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS);

static struct g_class g_disk_class = {
	.name = G_DISK_CLASS_NAME,
	.version = G_VERSION,
	.start = g_disk_start,
	.access = g_disk_access,
	.ioctl = g_disk_ioctl,
	.providergone = g_disk_providergone,
	.dumpconf = g_disk_dumpconf,
};

SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, disk, CTLFLAG_RW, 0,
    "GEOM_DISK stuff");

DECLARE_GEOM_CLASS(g_disk_class, g_disk);

static int
g_disk_access(struct g_provider *pp, int r, int w, int e)
{
	struct disk *dp;
	struct g_disk_softc *sc;
	int error;

	g_trace(G_T_ACCESS, "g_disk_access(%s, %d, %d, %d)",
	    pp->name, r, w, e);
	g_topology_assert();
	sc = pp->private;
	if (sc == NULL || (dp = sc->dp) == NULL || dp->d_destroyed) {
		/*
		 * Allow decreasing access count even if disk is not
		 * available anymore.
		 */
		if (r <= 0 && w <= 0 && e <= 0)
			return (0);
		return (ENXIO);
	}
	r += pp->acr;
	w += pp->acw;
	e += pp->ace;
	error = 0;
	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
		if (dp->d_open != NULL) {
			error = dp->d_open(dp);
			if (bootverbose && error != 0)
				printf("Opened disk %s -> %d\n",
				    pp->name, error);
			if (error != 0)
				return (error);
		}
		pp->sectorsize = dp->d_sectorsize;
		if (dp->d_maxsize == 0) {
			printf("WARNING: Disk drive %s%d has no d_maxsize\n",
			    dp->d_name, dp->d_unit);
			dp->d_maxsize = DFLTPHYS;
		}
		if (dp->d_delmaxsize == 0) {
			if (bootverbose && dp->d_flags & DISKFLAG_CANDELETE) {
				printf("WARNING: Disk drive %s%d has no "
				    "d_delmaxsize\n", dp->d_name, dp->d_unit);
			}
			dp->d_delmaxsize = dp->d_maxsize;
		}
		pp->stripeoffset = dp->d_stripeoffset;
		pp->stripesize = dp->d_stripesize;
		dp->d_flags |= DISKFLAG_OPEN;
		/*
		 * Do not invoke the resize event when the initial size was
		 * zero: some disks report their size only after the first
		 * open.
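
/*
 * Hedged illustration of the case the comment above describes; the
 * mydisk_* driver is hypothetical.  Such a driver learns the capacity only
 * in d_open(), so the provider is created with mediasize zero and the code
 * above sizes it on the first open.
 */
struct mydisk_softc;                                    /* hypothetical */
off_t mydisk_read_capacity(struct mydisk_softc *);      /* hypothetical */

static int
mydisk_open(struct disk *dp)
{
        struct mydisk_softc *sc = dp->d_drv1;

        /* The hardware reports the media size only now. */
        dp->d_mediasize = mydisk_read_capacity(sc);
        return (0);
}
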
*/ if (pp->mediasize == 0) pp->mediasize = dp->d_mediasize; else g_resize_provider(pp, dp->d_mediasize); } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) { if (dp->d_close != NULL) { error = dp->d_close(dp); if (error != 0) printf("Closed disk %s -> %d\n", pp->name, error); } sc->state = G_STATE_ACTIVE; if (sc->led[0] != 0) led_set(sc->led, "0"); dp->d_flags &= ~DISKFLAG_OPEN; } return (error); } static void g_disk_kerneldump(struct bio *bp, struct disk *dp) { struct g_kerneldump *gkd; struct g_geom *gp; gkd = (struct g_kerneldump*)bp->bio_data; gp = bp->bio_to->geom; g_trace(G_T_TOPOLOGY, "g_disk_kerneldump(%s, %jd, %jd)", gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); if (dp->d_dump == NULL) { g_io_deliver(bp, ENODEV); return; } gkd->di.dumper = dp->d_dump; gkd->di.priv = dp; gkd->di.blocksize = dp->d_sectorsize; gkd->di.maxiosize = dp->d_maxsize; gkd->di.mediaoffset = gkd->offset; if ((gkd->offset + gkd->length) > dp->d_mediasize) gkd->length = dp->d_mediasize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void g_disk_setstate(struct bio *bp, struct g_disk_softc *sc) { const char *cmd; memcpy(&sc->state, bp->bio_data, sizeof(sc->state)); if (sc->led[0] != 0) { switch (sc->state) { case G_STATE_FAILED: cmd = "1"; break; case G_STATE_REBUILD: cmd = "f5"; break; case G_STATE_RESYNC: cmd = "f1"; break; default: cmd = "0"; break; } led_set(sc->led, cmd); } g_io_deliver(bp, 0); } static void g_disk_done(struct bio *bp) { struct bintime now; struct bio *bp2; struct g_disk_softc *sc; /* See "notes" for why we need a mutex here */ /* XXX: will witness accept a mix of Giant/unGiant drivers here ? */ bp2 = bp->bio_parent; sc = bp2->bio_to->private; bp->bio_completed = bp->bio_length - bp->bio_resid; binuptime(&now); mtx_lock(&sc->done_mtx); if (bp2->bio_error == 0) bp2->bio_error = bp->bio_error; bp2->bio_completed += bp->bio_completed; switch (bp->bio_cmd) { case BIO_ZONE: bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); /*FALLTHROUGH*/ case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: devstat_end_transaction_bio_bt(sc->dp->d_devstat, bp, &now); break; default: break; } bp2->bio_inbed++; if (bp2->bio_children == bp2->bio_inbed) { mtx_unlock(&sc->done_mtx); bp2->bio_resid = bp2->bio_bcount - bp2->bio_completed; g_io_deliver(bp2, bp2->bio_error); } else mtx_unlock(&sc->done_mtx); g_destroy_bio(bp); } static int g_disk_ioctl(struct g_provider *pp, u_long cmd, void * data, int fflag, struct thread *td) { struct disk *dp; struct g_disk_softc *sc; int error; sc = pp->private; dp = sc->dp; if (dp->d_ioctl == NULL) return (ENOIOCTL); error = dp->d_ioctl(dp, cmd, data, fflag, td); return (error); } static off_t g_disk_maxsize(struct disk *dp, struct bio *bp) { if (bp->bio_cmd == BIO_DELETE) return (dp->d_delmaxsize); return (dp->d_maxsize); } static int g_disk_maxsegs(struct disk *dp, struct bio *bp) { return ((g_disk_maxsize(dp, bp) / PAGE_SIZE) + 1); } static void g_disk_advance(struct disk *dp, struct bio *bp, off_t off) { bp->bio_offset += off; bp->bio_length -= off; if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *seg, *end; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; off += bp->bio_ma_offset; while (off >= seg->ds_len) { KASSERT((seg != end), ("vlist request runs off the end")); off -= seg->ds_len; seg++; } bp->bio_ma_offset = off; bp->bio_ma_n = end - seg; bp->bio_data = (void *)seg; } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma += off / 
PAGE_SIZE; bp->bio_ma_offset += off; bp->bio_ma_offset %= PAGE_SIZE; bp->bio_ma_n -= off / PAGE_SIZE; } else { bp->bio_data += off; } } static void g_disk_seg_limit(bus_dma_segment_t *seg, off_t *poffset, off_t *plength, int *ppages) { uintptr_t seg_page_base; uintptr_t seg_page_end; off_t offset; off_t length; int seg_pages; offset = *poffset; length = *plength; if (length > seg->ds_len - offset) length = seg->ds_len - offset; seg_page_base = trunc_page(seg->ds_addr + offset); seg_page_end = round_page(seg->ds_addr + offset + length); seg_pages = (seg_page_end - seg_page_base) >> PAGE_SHIFT; if (seg_pages > *ppages) { seg_pages = *ppages; length = (seg_page_base + (seg_pages << PAGE_SHIFT)) - (seg->ds_addr + offset); } *poffset = 0; *plength -= length; *ppages -= seg_pages; } static off_t g_disk_vlist_limit(struct disk *dp, struct bio *bp, bus_dma_segment_t **pendseg) { bus_dma_segment_t *seg, *end; off_t residual; off_t offset; int pages; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; residual = bp->bio_length; offset = bp->bio_ma_offset; pages = g_disk_maxsegs(dp, bp); while (residual != 0 && pages != 0) { KASSERT((seg != end), ("vlist limit runs off the end")); g_disk_seg_limit(seg, &offset, &residual, &pages); seg++; } if (pendseg != NULL) *pendseg = seg; return (residual); } static bool g_disk_limit(struct disk *dp, struct bio *bp) { bool limited = false; off_t maxsz; maxsz = g_disk_maxsize(dp, bp); /* * XXX: If we have a stripesize we should really use it here. * Care should be taken in the delete case if this is done * as deletes can be very sensitive to size given how they * are processed. */ if (bp->bio_length > maxsz) { bp->bio_length = maxsz; limited = true; } if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *firstseg, *endseg; off_t residual; firstseg = (bus_dma_segment_t*)bp->bio_data; residual = g_disk_vlist_limit(dp, bp, &endseg); if (residual != 0) { bp->bio_ma_n = endseg - firstseg; bp->bio_length -= residual; limited = true; } } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE); } return (limited); } static void g_disk_start(struct bio *bp) { struct bio *bp2, *bp3; struct disk *dp; struct g_disk_softc *sc; int error; off_t off; biotrack(bp, __func__); sc = bp->bio_to->private; if (sc == NULL || (dp = sc->dp) == NULL || dp->d_destroyed) { g_io_deliver(bp, ENXIO); return; } error = EJUSTRETURN; switch(bp->bio_cmd) { case BIO_DELETE: if (!(dp->d_flags & DISKFLAG_CANDELETE)) { error = EOPNOTSUPP; break; } /* fall-through */ case BIO_READ: case BIO_WRITE: KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0, ("unmapped bio not supported by disk %s", dp->d_name)); off = 0; bp3 = NULL; bp2 = g_clone_bio(bp); if (bp2 == NULL) { error = ENOMEM; break; } for (;;) { if (g_disk_limit(dp, bp2)) { off += bp2->bio_length; /* * To avoid a race, we need to grab the next bio * before we schedule this one. See "notes". 
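 */

/*
 * The BIO_UNMAPPED branch of g_disk_advance() above, checked with made-up
 * numbers: advancing by `off' bytes skips whole pages in the bio_ma page
 * array and folds the remainder into bio_ma_offset.
 */
#include <stdio.h>

#define PAGE_SIZE 4096

int
main(void)
{
        long ma_skip, ma_offset = 512, ma_n = 33, off = 10240;

        ma_skip = off / PAGE_SIZE;              /* bio_ma += 2 whole pages */
        ma_offset = (ma_offset + off) % PAGE_SIZE;  /* 10752 % 4096 = 2560 */
        ma_n -= ma_skip;                        /* pages left in the array */
        printf("skip=%ld offset=%ld pages-left=%ld\n",
            ma_skip, ma_offset, ma_n);
        return (0);
}
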
*/ bp3 = g_clone_bio(bp); if (bp3 == NULL) bp->bio_error = ENOMEM; } bp2->bio_done = g_disk_done; bp2->bio_pblkno = bp2->bio_offset / dp->d_sectorsize; bp2->bio_bcount = bp2->bio_length; bp2->bio_disk = dp; mtx_lock(&sc->start_mtx); devstat_start_transaction_bio(dp->d_devstat, bp2); mtx_unlock(&sc->start_mtx); dp->d_strategy(bp2); if (bp3 == NULL) break; bp2 = bp3; bp3 = NULL; g_disk_advance(dp, bp2, off); } break; case BIO_GETATTR: /* Give the driver a chance to override */ if (dp->d_getattr != NULL) { if (bp->bio_disk == NULL) bp->bio_disk = dp; error = dp->d_getattr(bp); if (error != -1) break; error = EJUSTRETURN; } if (g_handleattr_int(bp, "GEOM::candelete", (dp->d_flags & DISKFLAG_CANDELETE) != 0)) break; else if (g_handleattr_int(bp, "GEOM::fwsectors", dp->d_fwsectors)) break; else if (g_handleattr_int(bp, "GEOM::fwheads", dp->d_fwheads)) break; else if (g_handleattr_off_t(bp, "GEOM::frontstuff", 0)) break; else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident)) break; else if (g_handleattr_str(bp, "GEOM::descr", dp->d_descr)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor", dp->d_hba_vendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_device", dp->d_hba_device)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor", dp->d_hba_subvendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice", dp->d_hba_subdevice)) break; else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) g_disk_kerneldump(bp, dp); else if (!strcmp(bp->bio_attribute, "GEOM::setstate")) g_disk_setstate(bp, sc); else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate", dp->d_rotation_rate)) break; else error = ENOIOCTL; break; case BIO_FLUSH: g_trace(G_T_BIO, "g_disk_flushcache(%s)", bp->bio_to->name); if (!(dp->d_flags & DISKFLAG_CANFLUSHCACHE)) { error = EOPNOTSUPP; break; } /*FALLTHROUGH*/ case BIO_ZONE: if (bp->bio_cmd == BIO_ZONE) { if (!(dp->d_flags & DISKFLAG_CANZONE)) { error = EOPNOTSUPP; break; } g_trace(G_T_BIO, "g_disk_zone(%s)", bp->bio_to->name); } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_disk_done; bp2->bio_disk = dp; mtx_lock(&sc->start_mtx); devstat_start_transaction_bio(dp->d_devstat, bp2); mtx_unlock(&sc->start_mtx); dp->d_strategy(bp2); break; default: error = EOPNOTSUPP; break; } if (error != EJUSTRETURN) g_io_deliver(bp, error); return; } static void g_disk_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct bio *bp; struct disk *dp; struct g_disk_softc *sc; char *buf; int res = 0; sc = gp->softc; if (sc == NULL || (dp = sc->dp) == NULL) return; if (indent == NULL) { sbuf_printf(sb, " hd %u", dp->d_fwheads); sbuf_printf(sb, " sc %u", dp->d_fwsectors); return; } if (pp != NULL) { sbuf_printf(sb, "%s%u\n", indent, dp->d_fwheads); sbuf_printf(sb, "%s%u\n", indent, dp->d_fwsectors); /* * "rotationrate" is a little complicated, because the value * returned by the drive might not be the RPM; 0 and 1 are * special cases, and there's also a valid range. */ sbuf_printf(sb, "%s", indent); if (dp->d_rotation_rate == DISK_RR_UNKNOWN) /* Old drives */ sbuf_printf(sb, "unknown"); /* don't report RPM. 
*/ else if (dp->d_rotation_rate == DISK_RR_NON_ROTATING) sbuf_printf(sb, "0"); else if ((dp->d_rotation_rate >= DISK_RR_MIN) && (dp->d_rotation_rate <= DISK_RR_MAX)) sbuf_printf(sb, "%u", dp->d_rotation_rate); else sbuf_printf(sb, "invalid"); sbuf_printf(sb, "\n"); if (dp->d_getattr != NULL) { buf = g_malloc(DISK_IDENT_SIZE, M_WAITOK); bp = g_alloc_bio(); bp->bio_disk = dp; bp->bio_attribute = "GEOM::ident"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; res = dp->d_getattr(bp); sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", res == 0 ? buf: dp->d_ident); sbuf_printf(sb, "\n"); bp->bio_attribute = "GEOM::lunid"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", buf); sbuf_printf(sb, "\n"); } bp->bio_attribute = "GEOM::lunname"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", buf); sbuf_printf(sb, "\n"); } g_destroy_bio(bp); g_free(buf); } else { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", dp->d_ident); sbuf_printf(sb, "\n"); } sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", dp->d_descr); sbuf_printf(sb, "\n"); } } static void g_disk_resize(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_provider *pp; if (flag == EV_CANCEL) return; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (dp->d_destroyed || gp == NULL) return; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->sectorsize != 0 && pp->sectorsize != dp->d_sectorsize) g_wither_provider(pp, ENXIO); else g_resize_provider(pp, dp->d_mediasize); } } static void g_disk_create(void *arg, int flag) { struct g_geom *gp; struct g_provider *pp; struct disk *dp; struct g_disk_softc *sc; struct disk_alias *dap; char tmpstr[80]; if (flag == EV_CANCEL) return; g_topology_assert(); dp = arg; mtx_pool_lock(mtxpool_sleep, dp); dp->d_init_level = DISK_INIT_START; /* * If the disk has already gone away, we can just stop here and * call the user's callback to tell him we've cleaned things up. 
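 */

/*
 * Hedged driver-side sketch of the path into g_disk_resize() above; the
 * mydisk_* name is hypothetical.  After learning a new capacity the driver
 * updates d_mediasize and calls disk_resize() (see disk(9)), which queues
 * g_disk_resize() as a GEOM event.
 */
static void
mydisk_capacity_changed(struct disk *dp, off_t newsize)
{

        dp->d_mediasize = newsize;
        (void)disk_resize(dp, M_NOWAIT);
}
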
*/ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); if (dp->d_gone != NULL) dp->d_gone(dp); return; } mtx_pool_unlock(mtxpool_sleep, dp); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->start_mtx, "g_disk_start", NULL, MTX_DEF); mtx_init(&sc->done_mtx, "g_disk_done", NULL, MTX_DEF); sc->dp = dp; gp = g_new_geomf(&g_disk_class, "%s%d", dp->d_name, dp->d_unit); gp->softc = sc; LIST_FOREACH(dap, &dp->d_aliases, da_next) { snprintf(tmpstr, sizeof(tmpstr), "%s%d", dap->da_alias, dp->d_unit); g_geom_add_alias(gp, tmpstr); } pp = g_new_providerf(gp, "%s", gp->name); devstat_remove_entry(pp->stat); pp->stat = NULL; dp->d_devstat->id = pp; pp->mediasize = dp->d_mediasize; pp->sectorsize = dp->d_sectorsize; pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; if ((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0) pp->flags |= G_PF_ACCEPT_UNMAPPED; if ((dp->d_flags & DISKFLAG_DIRECT_COMPLETION) != 0) pp->flags |= G_PF_DIRECT_SEND; pp->flags |= G_PF_DIRECT_RECEIVE; if (bootverbose) printf("GEOM: new disk %s\n", gp->name); sysctl_ctx_init(&sc->sysctl_ctx); snprintf(tmpstr, sizeof(tmpstr), "GEOM disk %s", gp->name); sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_geom_disk), OID_AUTO, gp->name, CTLFLAG_RD, 0, tmpstr); if (sc->sysctl_tree != NULL) { SYSCTL_ADD_STRING(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "led", CTLFLAG_RWTUN, sc->led, sizeof(sc->led), "LED name"); SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "flags", CTLTYPE_STRING | CTLFLAG_RD, dp, 0, g_disk_sysctl_flags, "A", "Report disk flags"); } pp->private = sc; dp->d_geom = gp; g_error_provider(pp, 0); mtx_pool_lock(mtxpool_sleep, dp); dp->d_init_level = DISK_INIT_DONE; /* * If the disk has gone away at this stage, start the withering * process for it. */ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); g_wither_provider(pp, ENXIO); return; } mtx_pool_unlock(mtxpool_sleep, dp); } /* * We get this callback after all of the consumers have gone away, and just * before the provider is freed. If the disk driver provided a d_gone * callback, let them know that it is okay to free resources -- they won't * be getting any more accesses from GEOM. */ static void g_disk_providergone(struct g_provider *pp) { struct disk *dp; struct g_disk_softc *sc; sc = (struct g_disk_softc *)pp->private; dp = sc->dp; if (dp != NULL && dp->d_gone != NULL) dp->d_gone(dp); if (sc->sysctl_tree != NULL) { sysctl_ctx_free(&sc->sysctl_ctx); sc->sysctl_tree = NULL; } if (sc->led[0] != 0) { led_set(sc->led, "0"); sc->led[0] = 0; } pp->private = NULL; pp->geom->softc = NULL; mtx_destroy(&sc->done_mtx); mtx_destroy(&sc->start_mtx); g_free(sc); } static void g_disk_destroy(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_disk_softc *sc; struct disk_alias *dap, *daptmp; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (gp != NULL) { sc = gp->softc; if (sc != NULL) sc->dp = NULL; dp->d_geom = NULL; g_wither_geom(gp, ENXIO); } LIST_FOREACH_SAFE(dap, &dp->d_aliases, da_next, daptmp) g_free(dap); g_free(dp); } /* * We only allow printable characters in disk ident, * the rest is converted to 'x'. 
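 */

/*
 * Hedged sketch of the d_gone contract described above
 * g_disk_providergone(); nvd(4) follows roughly this shape, but the
 * mydisk_* names are made up.  Only once d_gone fires may the driver
 * release the resources backing the disk.
 */
static void
mydisk_gone(struct disk *dp)
{
        struct mydisk_softc *sc = dp->d_drv1;

        /* No more opens or I/O can reach us; tear everything down. */
        disk_destroy(dp);
        mydisk_free_softc(sc);          /* hypothetical cleanup helper */
}
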
*/ static void g_disk_ident_adjust(char *ident, size_t size) { char *p, tmp[4], newid[DISK_IDENT_SIZE]; newid[0] = '\0'; for (p = ident; *p != '\0'; p++) { if (isprint(*p)) { tmp[0] = *p; tmp[1] = '\0'; } else { snprintf(tmp, sizeof(tmp), "x%02hhx", *(unsigned char *)p); } if (strlcat(newid, tmp, sizeof(newid)) >= sizeof(newid)) break; } bzero(ident, size); strlcpy(ident, newid, size); } struct disk * disk_alloc(void) { struct disk *dp; dp = g_malloc(sizeof(struct disk), M_WAITOK | M_ZERO); LIST_INIT(&dp->d_aliases); return (dp); } void disk_create(struct disk *dp, int version) { if (version != DISK_VERSION) { printf("WARNING: Attempt to add disk %s%d %s", dp->d_name, dp->d_unit, " using incompatible ABI version of disk(9)\n"); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } if (dp->d_flags & DISKFLAG_RESERVED) { printf("WARNING: Attempt to add non-MPSAFE disk %s%d\n", dp->d_name, dp->d_unit); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy")); KASSERT(dp->d_name != NULL, ("disk_create need d_name")); KASSERT(*dp->d_name != 0, ("disk_create need d_name")); KASSERT(strlen(dp->d_name) < SPECNAMELEN - 4, ("disk name too long")); if (dp->d_devstat == NULL) dp->d_devstat = devstat_new_entry(dp->d_name, dp->d_unit, dp->d_sectorsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); dp->d_geom = NULL; dp->d_init_level = DISK_INIT_NONE; g_disk_ident_adjust(dp->d_ident, sizeof(dp->d_ident)); g_post_event(g_disk_create, dp, M_WAITOK, dp, NULL); } void disk_destroy(struct disk *dp) { g_cancel_event(dp); dp->d_destroyed = 1; if (dp->d_devstat != NULL) devstat_remove_entry(dp->d_devstat); g_post_event(g_disk_destroy, dp, M_WAITOK, NULL); } void disk_add_alias(struct disk *dp, const char *name) { struct disk_alias *dap; dap = (struct disk_alias *)g_malloc( sizeof(struct disk_alias) + strlen(name) + 1, M_WAITOK); strcpy((char *)(dap + 1), name); dap->da_alias = (const char *)(dap + 1); LIST_INSERT_HEAD(&dp->d_aliases, dap, da_next); } void disk_gone(struct disk *dp) { struct g_geom *gp; struct g_provider *pp; mtx_pool_lock(mtxpool_sleep, dp); dp->d_goneflag = 1; /* * If we're still in the process of creating this disk (the * g_disk_create() function is still queued, or is in * progress), the init level will not yet be DISK_INIT_DONE. * * If that is the case, g_disk_create() will see d_goneflag * and take care of cleaning things up. * * If the disk has already been created, we default to * withering the provider as usual below. * * If the caller has not set a d_gone() callback, he will * not be any worse off by returning here, because the geom * has not been fully setup in any case. 
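 */

/*
 * Minimal (hypothetical) registration tying the calls above together;
 * disk_create() queues g_disk_create() as an event, so the provider
 * appears asynchronously.
 */
static void mydisk_strategy(struct bio *bp);    /* hypothetical I/O entry */

static struct disk *
mydisk_attach(void *drv_softc)
{
        struct disk *dp;

        dp = disk_alloc();
        dp->d_name = "mydisk";                  /* devfs node: /dev/mydisk0 */
        dp->d_unit = 0;
        dp->d_strategy = mydisk_strategy;
        dp->d_sectorsize = 512;
        dp->d_mediasize = 64 * 1024 * 1024;     /* 64 MiB toy disk */
        dp->d_maxsize = MAXPHYS;
        dp->d_drv1 = drv_softc;
        disk_create(dp, DISK_VERSION);          /* ABI check shown above */
        return (dp);
}
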
*/ if (dp->d_init_level < DISK_INIT_DONE) { mtx_pool_unlock(mtxpool_sleep, dp); return; } mtx_pool_unlock(mtxpool_sleep, dp); gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_wither_provider(pp, ENXIO); } } } void disk_attr_changed(struct disk *dp, const char *attr, int flag) { struct g_geom *gp; struct g_provider *pp; char devnamebuf[128]; gp = dp->d_geom; if (gp != NULL) LIST_FOREACH(pp, &gp->provider, provider) (void)g_attr_changed(pp, attr, flag); snprintf(devnamebuf, sizeof(devnamebuf), "devname=%s%d", dp->d_name, dp->d_unit); devctl_notify("GEOM", "disk", attr, devnamebuf); } void disk_media_changed(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_changed(pp, flag); } } } void disk_media_gone(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_gone(pp, flag); } } } int disk_resize(struct disk *dp, int flag) { if (dp->d_destroyed || dp->d_geom == NULL) return (0); return (g_post_event(g_disk_resize, dp, flag, NULL)); } static void g_kern_disks(void *p, int flag __unused) { struct sbuf *sb; struct g_geom *gp; char *sp; sb = p; sp = ""; g_topology_assert(); LIST_FOREACH(gp, &g_disk_class.geom, geom) { sbuf_printf(sb, "%s%s", sp, gp->name); sp = " "; } sbuf_finish(sb); } static int g_disk_sysctl_flags(SYSCTL_HANDLER_ARGS) { struct disk *dp; struct sbuf *sb; int error; sb = sbuf_new_auto(); dp = (struct disk *)arg1; sbuf_printf(sb, "%b", dp->d_flags, "\20" "\2OPEN" "\3CANDELETE" "\4CANFLUSHCACHE" "\5UNMAPPEDBIO" "\6DIRECTCOMPLETION" "\10CANZONE"); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } static int sysctl_disks(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_kern_disks, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_disks, "A", "names of available disks"); Index: head/sys/geom/geom_disk.h =================================================================== --- head/sys/geom/geom_disk.h (revision 326269) +++ head/sys/geom/geom_disk.h (revision 326270) @@ -1,154 +1,156 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2003 Poul-Henning Kamp * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _GEOM_GEOM_DISK_H_ #define _GEOM_GEOM_DISK_H_ #define DISK_RR_UNKNOWN 0 #define DISK_RR_NON_ROTATING 1 #define DISK_RR_MIN 0x0401 #define DISK_RR_MAX 0xfffe #ifdef _KERNEL #include #include #include #include #define G_DISK_CLASS_NAME "DISK" struct disk; typedef int disk_open_t(struct disk *); typedef int disk_close_t(struct disk *); typedef void disk_strategy_t(struct bio *bp); typedef int disk_getattr_t(struct bio *bp); typedef void disk_gone_t(struct disk *); typedef int disk_ioctl_t(struct disk *, u_long cmd, void *data, int fflag, struct thread *td); /* NB: disk_ioctl_t SHALL be cast'able to d_ioctl_t */ struct g_geom; struct devstat; typedef enum { DISK_INIT_NONE, DISK_INIT_START, DISK_INIT_DONE } disk_init_level; struct disk_alias { LIST_ENTRY(disk_alias) da_next; const char *da_alias; }; struct disk { /* Fields which are private to geom_disk */ struct g_geom *d_geom; struct devstat *d_devstat; int d_goneflag; int d_destroyed; disk_init_level d_init_level; /* Shared fields */ u_int d_flags; const char *d_name; u_int d_unit; struct bio_queue_head *d_queue; struct mtx *d_lock; /* Disk methods */ disk_open_t *d_open; disk_close_t *d_close; disk_strategy_t *d_strategy; disk_ioctl_t *d_ioctl; dumper_t *d_dump; disk_getattr_t *d_getattr; disk_gone_t *d_gone; /* Info fields from driver to geom_disk.c. 
Valid when open */ u_int d_sectorsize; off_t d_mediasize; u_int d_fwsectors; u_int d_fwheads; u_int d_maxsize; off_t d_delmaxsize; u_int d_stripeoffset; u_int d_stripesize; char d_ident[DISK_IDENT_SIZE]; char d_descr[DISK_IDENT_SIZE]; uint16_t d_hba_vendor; uint16_t d_hba_device; uint16_t d_hba_subvendor; uint16_t d_hba_subdevice; uint16_t d_rotation_rate; /* Fields private to the driver */ void *d_drv1; /* Fields private to geom_disk, to be moved on next version bump */ LIST_HEAD(,disk_alias) d_aliases; }; #define DISKFLAG_RESERVED 0x1 /* Was NEEDSGIANT */ #define DISKFLAG_OPEN 0x2 #define DISKFLAG_CANDELETE 0x4 #define DISKFLAG_CANFLUSHCACHE 0x8 #define DISKFLAG_UNMAPPED_BIO 0x10 #define DISKFLAG_DIRECT_COMPLETION 0x20 #define DISKFLAG_CANZONE 0x80 struct disk *disk_alloc(void); void disk_create(struct disk *disk, int version); void disk_destroy(struct disk *disk); void disk_gone(struct disk *disk); void disk_attr_changed(struct disk *dp, const char *attr, int flag); void disk_media_changed(struct disk *dp, int flag); void disk_media_gone(struct disk *dp, int flag); int disk_resize(struct disk *dp, int flag); void disk_add_alias(struct disk *disk, const char *); #define DISK_VERSION_00 0x58561059 #define DISK_VERSION_01 0x5856105a #define DISK_VERSION_02 0x5856105b #define DISK_VERSION_03 0x5856105c #define DISK_VERSION_04 0x5856105d #define DISK_VERSION_05 0x5856105e #define DISK_VERSION DISK_VERSION_05 #endif /* _KERNEL */ #endif /* _GEOM_GEOM_DISK_H_ */ Index: head/sys/geom/geom_dump.c =================================================================== --- head/sys/geom/geom_dump.c (revision 326269) +++ head/sys/geom/geom_dump.c (revision 326270) @@ -1,324 +1,326 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
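
/*
 * Example use of the flag bits above (values illustrative): a driver that
 * implements BIO_DELETE advertises DISKFLAG_CANDELETE and can bound the
 * per-request delete size, which g_disk_maxsize() earlier consults.
 */
static void
mydisk_enable_trim(struct disk *dp)
{

        dp->d_flags |= DISKFLAG_CANDELETE;
        dp->d_delmaxsize = 8 * 1024 * 1024;     /* cap each BIO_DELETE at 8 MiB */
}
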
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include static void g_confdot_consumer(struct sbuf *sb, struct g_consumer *cp) { sbuf_printf(sb, "z%p [label=\"r%dw%de%d\"];\n", cp, cp->acr, cp->acw, cp->ace); if (cp->provider) sbuf_printf(sb, "z%p -> z%p;\n", cp, cp->provider); } static void g_confdot_provider(struct sbuf *sb, struct g_provider *pp) { sbuf_printf(sb, "z%p [shape=hexagon,label=\"%s\\nr%dw%de%d\\nerr#%d\\n" "sector=%u\\nstripe=%u\"];\n", pp, pp->name, pp->acr, pp->acw, pp->ace, pp->error, pp->sectorsize, pp->stripesize); } static void g_confdot_geom(struct sbuf *sb, struct g_geom *gp) { struct g_consumer *cp; struct g_provider *pp; sbuf_printf(sb, "z%p [shape=box,label=\"%s\\n%s\\nr#%d\"];\n", gp, gp->class->name, gp->name, gp->rank); LIST_FOREACH(cp, &gp->consumer, consumer) { g_confdot_consumer(sb, cp); sbuf_printf(sb, "z%p -> z%p;\n", gp, cp); } LIST_FOREACH(pp, &gp->provider, provider) { g_confdot_provider(sb, pp); sbuf_printf(sb, "z%p -> z%p;\n", pp, gp); } } static void g_confdot_class(struct sbuf *sb, struct g_class *mp) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) g_confdot_geom(sb, gp); } void g_confdot(void *p, int flag ) { struct g_class *mp; struct sbuf *sb; KASSERT(flag != EV_CANCEL, ("g_confdot was cancelled")); sb = p; g_topology_assert(); sbuf_printf(sb, "digraph geom {\n"); LIST_FOREACH(mp, &g_classes, class) g_confdot_class(sb, mp); sbuf_printf(sb, "}\n"); sbuf_finish(sb); } static void g_conftxt_geom(struct sbuf *sb, struct g_geom *gp, int level) { struct g_provider *pp; struct g_consumer *cp; if (gp->flags & G_GEOM_WITHER) return; LIST_FOREACH(pp, &gp->provider, provider) { sbuf_printf(sb, "%d %s %s %ju %u", level, gp->class->name, pp->name, (uintmax_t)pp->mediasize, pp->sectorsize); if (gp->dumpconf != NULL) gp->dumpconf(sb, NULL, gp, NULL, pp); sbuf_printf(sb, "\n"); LIST_FOREACH(cp, &pp->consumers, consumers) g_conftxt_geom(sb, cp->geom, level + 1); } } static void g_conftxt_class(struct sbuf *sb, struct g_class *mp) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) g_conftxt_geom(sb, gp, 0); } void g_conftxt(void *p, int flag) { struct g_class *mp; struct sbuf *sb; KASSERT(flag != EV_CANCEL, ("g_conftxt was cancelled")); sb = p; g_topology_assert(); LIST_FOREACH(mp, &g_classes, class) { if (!strcmp(mp->name, G_DISK_CLASS_NAME) || !strcmp(mp->name, "MD")) g_conftxt_class(sb, mp); } sbuf_finish(sb); } void g_conf_printf_escaped(struct sbuf *sb, const char *fmt, ...) 
{
	struct sbuf *s;
	const u_char *c;
	va_list ap;

	s = sbuf_new_auto();
	va_start(ap, fmt);
	sbuf_vprintf(s, fmt, ap);
	va_end(ap);
	sbuf_finish(s);

	for (c = sbuf_data(s); *c != '\0'; c++) {
		if (*c == '&' || *c == '<' || *c == '>' ||
		    *c == '\'' || *c == '"' || *c > 0x7e)
			sbuf_printf(sb, "&#x%X;", *c);
		else if (*c == '\t' || *c == '\n' || *c == '\r' || *c > 0x1f)
			sbuf_putc(sb, *c);
		else
			sbuf_putc(sb, '?');
	}
	sbuf_delete(s);
}

static void
g_conf_consumer(struct sbuf *sb, struct g_consumer *cp)
{

	sbuf_printf(sb, "\t<consumer id=\"%p\">\n", cp);
	sbuf_printf(sb, "\t  <geom ref=\"%p\"/>\n", cp->geom);
	if (cp->provider != NULL)
		sbuf_printf(sb, "\t  <provider ref=\"%p\"/>\n", cp->provider);
	sbuf_printf(sb, "\t  <mode>r%dw%de%d</mode>\n",
	    cp->acr, cp->acw, cp->ace);
	if (cp->geom->flags & G_GEOM_WITHER)
		;
	else if (cp->geom->dumpconf != NULL) {
		sbuf_printf(sb, "\t  <config>\n");
		cp->geom->dumpconf(sb, "\t    ", cp->geom, cp, NULL);
		sbuf_printf(sb, "\t  </config>\n");
	}
	sbuf_printf(sb, "\t</consumer>\n");
}

static void
g_conf_provider(struct sbuf *sb, struct g_provider *pp)
{

	sbuf_printf(sb, "\t<provider id=\"%p\">\n", pp);
	sbuf_printf(sb, "\t  <geom ref=\"%p\"/>\n", pp->geom);
	sbuf_printf(sb, "\t  <mode>r%dw%de%d</mode>\n",
	    pp->acr, pp->acw, pp->ace);
	sbuf_printf(sb, "\t  <name>");
	g_conf_printf_escaped(sb, "%s", pp->name);
	sbuf_printf(sb, "</name>\n");
	sbuf_printf(sb, "\t  <mediasize>%jd</mediasize>\n",
	    (intmax_t)pp->mediasize);
	sbuf_printf(sb, "\t  <sectorsize>%u</sectorsize>\n", pp->sectorsize);
	sbuf_printf(sb, "\t  <stripesize>%u</stripesize>\n", pp->stripesize);
	sbuf_printf(sb, "\t  <stripeoffset>%u</stripeoffset>\n",
	    pp->stripeoffset);
	if (pp->flags & G_PF_WITHER)
		sbuf_printf(sb, "\t  <wither/>\n");
	else if (pp->geom->flags & G_GEOM_WITHER)
		;
	else if (pp->geom->dumpconf != NULL) {
		sbuf_printf(sb, "\t  <config>\n");
		pp->geom->dumpconf(sb, "\t    ", pp->geom, NULL, pp);
		sbuf_printf(sb, "\t  </config>\n");
	}
	sbuf_printf(sb, "\t</provider>\n");
}

static void
g_conf_geom(struct sbuf *sb, struct g_geom *gp, struct g_provider *pp,
    struct g_consumer *cp)
{
	struct g_consumer *cp2;
	struct g_provider *pp2;
	struct g_geom_alias *gap;

	sbuf_printf(sb, "    <geom id=\"%p\">\n", gp);
	sbuf_printf(sb, "      <class ref=\"%p\"/>\n", gp->class);
	sbuf_printf(sb, "      <name>");
	g_conf_printf_escaped(sb, "%s", gp->name);
	sbuf_printf(sb, "</name>\n");
	sbuf_printf(sb, "      <rank>%d</rank>\n", gp->rank);
	if (gp->flags & G_GEOM_WITHER)
		sbuf_printf(sb, "      <wither/>\n");
	else if (gp->dumpconf != NULL) {
		sbuf_printf(sb, "      <config>\n");
		gp->dumpconf(sb, "\t", gp, NULL, NULL);
		sbuf_printf(sb, "      </config>\n");
	}
	LIST_FOREACH(cp2, &gp->consumer, consumer) {
		if (cp != NULL && cp != cp2)
			continue;
		g_conf_consumer(sb, cp2);
	}
	LIST_FOREACH(pp2, &gp->provider, provider) {
		if (pp != NULL && pp != pp2)
			continue;
		g_conf_provider(sb, pp2);
	}
	LIST_FOREACH(gap, &gp->aliases, ga_next) {
		sbuf_printf(sb, "      <alias>\n");
		g_conf_printf_escaped(sb, "%s", gap->ga_alias);
		sbuf_printf(sb, "      </alias>\n");
	}
	sbuf_printf(sb, "    </geom>\n");
}

static void
g_conf_class(struct sbuf *sb, struct g_class *mp, struct g_geom *gp,
    struct g_provider *pp, struct g_consumer *cp)
{
	struct g_geom *gp2;

	sbuf_printf(sb, "  <class id=\"%p\">\n", mp);
	sbuf_printf(sb, "    <name>");
	g_conf_printf_escaped(sb, "%s", mp->name);
	sbuf_printf(sb, "</name>\n");
	LIST_FOREACH(gp2, &mp->geom, geom) {
		if (gp != NULL && gp != gp2)
			continue;
		g_conf_geom(sb, gp2, pp, cp);
	}
	sbuf_printf(sb, "  </class>\n");
}

void
g_conf_specific(struct sbuf *sb, struct g_class *mp, struct g_geom *gp,
    struct g_provider *pp, struct g_consumer *cp)
{
	struct g_class *mp2;

	g_topology_assert();
	sbuf_printf(sb, "<mesh>\n");
	LIST_FOREACH(mp2, &g_classes, class) {
		if (mp != NULL && mp != mp2)
			continue;
		g_conf_class(sb, mp2, gp, pp, cp);
	}
	sbuf_printf(sb, "</mesh>\n");
	sbuf_finish(sb);
}

void
g_confxml(void *p, int flag)
{

	KASSERT(flag != EV_CANCEL, ("g_confxml was cancelled"));
	g_topology_assert();
	g_conf_specific(p, NULL, NULL, NULL, NULL);
}

void
g_trace(int level, const char *fmt, ...)
{ va_list ap; if (!(g_debugflags & level)) return; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("\n"); } Index: head/sys/geom/geom_event.c =================================================================== --- head/sys/geom/geom_event.c (revision 326269) +++ head/sys/geom/geom_event.c (revision 326270) @@ -1,440 +1,442 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * XXX: How do we in general know that objects referenced in events * have not been destroyed before we get around to handle the event ? 
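 */

/*
 * The ref[] array below is the answer to the question above: posters list
 * the objects an event will touch as trailing arguments to g_post_event(),
 * and destruction paths call g_cancel_event() on the same pointers (as
 * disk_destroy() did earlier), so stale events are cancelled instead of
 * dereferencing freed memory.  A sketch of the convention:
 */
static void
example_event(void *arg, int flag)
{
        struct g_provider *pp = arg;

        if (flag == EV_CANCEL)          /* the object went away first */
                return;
        g_topology_assert();
        /* ... safe to use pp here ... */
        (void)pp;
}

static void
example_post(struct g_provider *pp)
{

        /* pp is recorded in ep->ref[0]; the NULL ends the reference list. */
        g_post_event(example_event, pp, M_WAITOK, pp, NULL);
}

static void
example_teardown(struct g_provider *pp)
{

        g_cancel_event(pp);     /* drop any queued events referencing pp */
        /* ... continue destroying pp ... */
}
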
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include TAILQ_HEAD(event_tailq_head, g_event); static struct event_tailq_head g_events = TAILQ_HEAD_INITIALIZER(g_events); static u_int g_pending_events; static TAILQ_HEAD(,g_provider) g_doorstep = TAILQ_HEAD_INITIALIZER(g_doorstep); static struct mtx g_eventlock; static int g_wither_work; #define G_N_EVENTREFS 20 struct g_event { TAILQ_ENTRY(g_event) events; g_event_t *func; void *arg; int flag; void *ref[G_N_EVENTREFS]; }; #define EV_DONE 0x80000 #define EV_WAKEUP 0x40000 #define EV_CANCELED 0x20000 #define EV_INPROGRESS 0x10000 void g_waitidle(void) { g_topology_assert_not(); mtx_lock(&g_eventlock); while (!TAILQ_EMPTY(&g_events)) msleep(&g_pending_events, &g_eventlock, PPAUSE, "g_waitidle", hz/5); mtx_unlock(&g_eventlock); curthread->td_pflags &= ~TDP_GEOM; } #if 0 void g_waitidlelock(void) { g_topology_assert(); mtx_lock(&g_eventlock); while (!TAILQ_EMPTY(&g_events)) { g_topology_unlock(); msleep(&g_pending_events, &g_eventlock, PPAUSE, "g_waitidlel", hz/5); g_topology_lock(); } mtx_unlock(&g_eventlock); } #endif struct g_attrchanged_args { struct g_provider *pp; const char *attr; }; static void g_attr_changed_event(void *arg, int flag) { struct g_attrchanged_args *args; struct g_provider *pp; struct g_consumer *cp; struct g_consumer *next_cp; args = arg; pp = args->pp; g_topology_assert(); if (flag != EV_CANCEL && g_shutdown == 0) { /* * Tell all consumers of the change. */ LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, next_cp) { if (cp->geom->attrchanged != NULL) cp->geom->attrchanged(cp, args->attr); } } g_free(args); } int g_attr_changed(struct g_provider *pp, const char *attr, int flag) { struct g_attrchanged_args *args; int error; args = g_malloc(sizeof *args, flag); if (args == NULL) return (ENOMEM); args->pp = pp; args->attr = attr; error = g_post_event(g_attr_changed_event, args, flag, pp, NULL); if (error != 0) g_free(args); return (error); } void g_orphan_provider(struct g_provider *pp, int error) { /* G_VALID_PROVIDER(pp) We likely lack topology lock */ g_trace(G_T_TOPOLOGY, "g_orphan_provider(%p(%s), %d)", pp, pp->name, error); KASSERT(error != 0, ("g_orphan_provider(%p(%s), 0) error must be non-zero\n", pp, pp->name)); pp->error = error; mtx_lock(&g_eventlock); KASSERT(!(pp->flags & G_PF_ORPHAN), ("g_orphan_provider(%p(%s)), already an orphan", pp, pp->name)); pp->flags |= G_PF_ORPHAN; TAILQ_INSERT_TAIL(&g_doorstep, pp, orphan); mtx_unlock(&g_eventlock); wakeup(&g_wait_event); } /* * This function is called once on each provider which the event handler * finds on its g_doorstep. */ static void g_orphan_register(struct g_provider *pp) { struct g_consumer *cp, *cp2; int wf; g_topology_assert(); G_VALID_PROVIDER(pp); g_trace(G_T_TOPOLOGY, "g_orphan_register(%s)", pp->name); g_cancel_event(pp); wf = pp->flags & G_PF_WITHER; pp->flags &= ~G_PF_WITHER; /* * Tell all consumers the bad news. * Don't be surprised if they self-destruct. */ LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { KASSERT(cp->geom->orphan != NULL, ("geom %s has no orphan, class %s", cp->geom->name, cp->geom->class->name)); /* * XXX: g_dev_orphan method does deferred destroying * and it is possible, that other event could already * call the orphan method. Check consumer's flags to * do not schedule it twice. 
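 */

/*
 * Editorial example (not part of this commit): the G_CF_ORPHAN guard
 * below exists because orphan methods may defer destruction and so be
 * scheduled twice.  A minimal orphan method in the style several
 * classes use, sketched under the assumption that withering the whole
 * geom is the right response; the name is illustrative.
 */
static void
example_orphan(struct g_consumer *cp)
{

	g_topology_assert();
	/* The provider below us is gone; error out the whole geom. */
	g_wither_geom(cp->geom, ENXIO);
}

/*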
*/ if (cp->flags & G_CF_ORPHAN) continue; cp->flags |= G_CF_ORPHAN; cp->geom->orphan(cp); } if (LIST_EMPTY(&pp->consumers) && wf) g_destroy_provider(pp); else pp->flags |= wf; #ifdef notyet cp = LIST_FIRST(&pp->consumers); if (cp != NULL) return; if (pp->geom->flags & G_GEOM_WITHER) g_destroy_provider(pp); #endif } static int one_event(void) { struct g_event *ep; struct g_provider *pp; g_topology_assert(); mtx_lock(&g_eventlock); TAILQ_FOREACH(pp, &g_doorstep, orphan) { if (pp->nstart == pp->nend) break; } if (pp != NULL) { G_VALID_PROVIDER(pp); TAILQ_REMOVE(&g_doorstep, pp, orphan); mtx_unlock(&g_eventlock); g_orphan_register(pp); return (1); } ep = TAILQ_FIRST(&g_events); if (ep == NULL) { wakeup(&g_pending_events); return (0); } if (ep->flag & EV_INPROGRESS) { mtx_unlock(&g_eventlock); return (1); } ep->flag |= EV_INPROGRESS; mtx_unlock(&g_eventlock); g_topology_assert(); ep->func(ep->arg, 0); g_topology_assert(); mtx_lock(&g_eventlock); TAILQ_REMOVE(&g_events, ep, events); ep->flag &= ~EV_INPROGRESS; if (ep->flag & EV_WAKEUP) { ep->flag |= EV_DONE; mtx_unlock(&g_eventlock); wakeup(ep); } else { mtx_unlock(&g_eventlock); g_free(ep); } return (1); } void g_run_events() { for (;;) { g_topology_lock(); while (one_event()) ; mtx_assert(&g_eventlock, MA_OWNED); if (g_wither_work) { g_wither_work = 0; mtx_unlock(&g_eventlock); g_wither_washer(); g_topology_unlock(); } else { g_topology_unlock(); msleep(&g_wait_event, &g_eventlock, PRIBIO | PDROP, "-", TAILQ_EMPTY(&g_doorstep) ? 0 : hz / 10); } } /* NOTREACHED */ } void g_cancel_event(void *ref) { struct g_event *ep, *epn; struct g_provider *pp; u_int n; mtx_lock(&g_eventlock); TAILQ_FOREACH(pp, &g_doorstep, orphan) { if (pp != ref) continue; TAILQ_REMOVE(&g_doorstep, pp, orphan); break; } TAILQ_FOREACH_SAFE(ep, &g_events, events, epn) { if (ep->flag & EV_INPROGRESS) continue; for (n = 0; n < G_N_EVENTREFS; n++) { if (ep->ref[n] == NULL) break; if (ep->ref[n] != ref) continue; TAILQ_REMOVE(&g_events, ep, events); ep->func(ep->arg, EV_CANCEL); mtx_assert(&g_eventlock, MA_OWNED); if (ep->flag & EV_WAKEUP) { ep->flag |= (EV_DONE|EV_CANCELED); wakeup(ep); } else { g_free(ep); } break; } } if (TAILQ_EMPTY(&g_events)) wakeup(&g_pending_events); mtx_unlock(&g_eventlock); } static int g_post_event_x(g_event_t *func, void *arg, int flag, int wuflag, struct g_event **epp, va_list ap) { struct g_event *ep; void *p; u_int n; g_trace(G_T_TOPOLOGY, "g_post_event_x(%p, %p, %d, %d)", func, arg, flag, wuflag); KASSERT(wuflag == 0 || wuflag == EV_WAKEUP, ("Wrong wuflag in g_post_event_x(0x%x)", wuflag)); ep = g_malloc(sizeof *ep, flag | M_ZERO); if (ep == NULL) return (ENOMEM); ep->flag = wuflag; for (n = 0; n < G_N_EVENTREFS; n++) { p = va_arg(ap, void *); if (p == NULL) break; g_trace(G_T_TOPOLOGY, " ref %p", p); ep->ref[n] = p; } KASSERT(p == NULL, ("Too many references to event")); ep->func = func; ep->arg = arg; mtx_lock(&g_eventlock); TAILQ_INSERT_TAIL(&g_events, ep, events); mtx_unlock(&g_eventlock); wakeup(&g_wait_event); if (epp != NULL) *epp = ep; curthread->td_pflags |= TDP_GEOM; return (0); } int g_post_event(g_event_t *func, void *arg, int flag, ...) { va_list ap; int i; KASSERT(flag == M_WAITOK || flag == M_NOWAIT, ("Wrong flag to g_post_event")); va_start(ap, flag); i = g_post_event_x(func, arg, flag, 0, NULL, ap); va_end(ap); return (i); } void g_do_wither() { mtx_lock(&g_eventlock); g_wither_work = 1; mtx_unlock(&g_eventlock); wakeup(&g_wait_event); } /* * XXX: It might actually be useful to call this function with topology held. 
* XXX: This would ensure that the event gets created before anything else * XXX: changes. At present all users have a handle on things in some other * XXX: way, so this remains an XXX for now. */ int g_waitfor_event(g_event_t *func, void *arg, int flag, ...) { va_list ap; struct g_event *ep; int error; g_topology_assert_not(); KASSERT(flag == M_WAITOK || flag == M_NOWAIT, ("Wrong flag to g_post_event")); va_start(ap, flag); error = g_post_event_x(func, arg, flag, EV_WAKEUP, &ep, ap); va_end(ap); if (error) return (error); mtx_lock(&g_eventlock); while (!(ep->flag & EV_DONE)) msleep(ep, &g_eventlock, PRIBIO, "g_waitfor_event", hz); if (ep->flag & EV_CANCELED) error = EAGAIN; mtx_unlock(&g_eventlock); g_free(ep); return (error); } void g_event_init() { mtx_init(&g_eventlock, "GEOM orphanage", NULL, MTX_DEF); } Index: head/sys/geom/geom_flashmap.c =================================================================== --- head/sys/geom/geom_flashmap.c (revision 326269) +++ head/sys/geom/geom_flashmap.c (revision 326270) @@ -1,247 +1,249 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Semihalf * Copyright (c) 2009 Jakub Klama * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
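 */

/*
 * Editorial example, referring to g_waitfor_event() above (not part of
 * this commit): it is the synchronous variant of g_post_event(), for
 * callers that must sleep until the event thread has run the function.
 * A sketch modeled on the sysctl handlers later in this commit; the
 * function name is illustrative.
 */
static int
example_dump(struct sbuf *sb)
{
	int error;

	/* Runs g_confxml(sb, 0) in the event thread, then wakes us. */
	error = g_waitfor_event(g_confxml, sb, M_WAITOK, NULL);
	if (error == EAGAIN)
		;	/* The event was cancelled before it could run. */
	return (error);
}

/*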
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #define FLASHMAP_CLASS_NAME "Flashmap" struct g_flashmap_slice { off_t sl_start; off_t sl_end; const char *sl_name; STAILQ_ENTRY(g_flashmap_slice) sl_link; }; STAILQ_HEAD(g_flashmap_head, g_flashmap_slice); static struct { const char *type; flash_slicer_t slicer; } g_flashmap_slicers[] = { { "NAND::device", NULL }, { "CFI::device", NULL }, { "SPI::device", NULL }, { "MMC::device", NULL } }; static g_ioctl_t g_flashmap_ioctl; static g_taste_t g_flashmap_taste; static int g_flashmap_load(device_t dev, struct g_provider *pp, flash_slicer_t slicer, struct g_flashmap_head *head); static int g_flashmap_modify(struct g_geom *gp, const char *devname, int secsize, struct g_flashmap_head *slices); static void g_flashmap_print(struct g_flashmap_slice *slice); MALLOC_DECLARE(M_FLASHMAP); MALLOC_DEFINE(M_FLASHMAP, "geom_flashmap", "GEOM flash memory slicer class"); static void g_flashmap_print(struct g_flashmap_slice *slice) { printf("%08jx-%08jx: %s (%juKB)\n", (uintmax_t)slice->sl_start, (uintmax_t)slice->sl_end, slice->sl_name, (uintmax_t)(slice->sl_end - slice->sl_start) / 1024); } static int g_flashmap_modify(struct g_geom *gp, const char *devname, int secsize, struct g_flashmap_head *slices) { struct g_flashmap_slice *slice; int i, error; g_topology_assert(); i = 0; STAILQ_FOREACH(slice, slices, sl_link) { if (bootverbose) { printf("%s: slice ", devname); g_flashmap_print(slice); } error = g_slice_config(gp, i++, G_SLICE_CONFIG_CHECK, slice->sl_start, slice->sl_end - slice->sl_start + 1, secsize, FLASH_SLICES_FMT, gp->name, slice->sl_name); if (error) return (error); } i = 0; STAILQ_FOREACH(slice, slices, sl_link) { error = g_slice_config(gp, i++, G_SLICE_CONFIG_SET, slice->sl_start, slice->sl_end - slice->sl_start + 1, secsize, "%ss.%s", gp->name, slice->sl_name); if (error) return (error); } return (0); } static int g_flashmap_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td) { struct g_consumer *cp; struct g_geom *gp; if (cmd != NAND_IO_GET_CHIP_PARAM) return (ENOIOCTL); cp = LIST_FIRST(&pp->geom->consumer); if (cp == NULL) return (ENOIOCTL); gp = cp->provider->geom; if (gp->ioctl == NULL) return (ENOIOCTL); return (gp->ioctl(cp->provider, cmd, data, fflag, td)); } static struct g_geom * g_flashmap_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_geom *gp; struct g_consumer *cp; struct g_flashmap_head head; struct g_flashmap_slice *slice, *slice_temp; flash_slicer_t slicer; device_t dev; int i, size; g_trace(G_T_TOPOLOGY, "flashmap_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); if (flags == G_TF_NORMAL && strcmp(pp->geom->class->name, G_DISK_CLASS_NAME) != 0) return (NULL); gp = g_slice_new(mp, FLASH_SLICES_MAX_NUM, pp, &cp, NULL, 0, NULL); if (gp == NULL) return (NULL); STAILQ_INIT(&head); do { slicer = NULL; for (i = 0; i < nitems(g_flashmap_slicers); i++) { size = sizeof(device_t); if (g_io_getattr(g_flashmap_slicers[i].type, cp, &size, &dev) == 0) { slicer = g_flashmap_slicers[i].slicer; break; } } if (slicer == NULL) break; if (g_flashmap_load(dev, pp, slicer, &head) == 0) break; g_flashmap_modify(gp, cp->provider->name, cp->provider->sectorsize, &head); } while (0); g_access(cp, -1, 0, 0); STAILQ_FOREACH_SAFE(slice, &head, sl_link, slice_temp) free(slice, M_FLASHMAP); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } return (gp); } static int g_flashmap_load(device_t dev, 
struct g_provider *pp, flash_slicer_t slicer, struct g_flashmap_head *head) { struct flash_slice *slices; struct g_flashmap_slice *slice; int i, nslices = 0; slices = malloc(sizeof(struct flash_slice) * FLASH_SLICES_MAX_NUM, M_FLASHMAP, M_WAITOK | M_ZERO); if (slicer(dev, pp->name, slices, &nslices) == 0) { for (i = 0; i < nslices; i++) { slice = malloc(sizeof(struct g_flashmap_slice), M_FLASHMAP, M_WAITOK); slice->sl_name = slices[i].label; slice->sl_start = slices[i].base; slice->sl_end = slices[i].base + slices[i].size - 1; STAILQ_INSERT_TAIL(head, slice, sl_link); } } free(slices, M_FLASHMAP); return (nslices); } void flash_register_slicer(flash_slicer_t slicer, u_int type, bool force) { g_topology_lock(); if (g_flashmap_slicers[type].slicer == NULL || force == TRUE) g_flashmap_slicers[type].slicer = slicer; g_topology_unlock(); } static struct g_class g_flashmap_class = { .name = FLASHMAP_CLASS_NAME, .version = G_VERSION, .taste = g_flashmap_taste, .ioctl = g_flashmap_ioctl, }; DECLARE_GEOM_CLASS(g_flashmap_class, g_flashmap); MODULE_VERSION(g_flashmap, 0); Index: head/sys/geom/geom_fox.c =================================================================== --- head/sys/geom/geom_fox.c (revision 326269) +++ head/sys/geom/geom_fox.c (revision 326270) @@ -1,485 +1,487 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* This is a GEOM module for handling path selection for multi-path * storage devices. It is named "fox" because it, like they, prefer * to have multiple exits to choose from. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define FOX_CLASS_NAME "FOX" #define FOX_MAGIC "GEOM::FOX" static int g_fox_once; FEATURE(geom_fox, "GEOM FOX redundant path mitigation support"); struct g_fox_softc { off_t mediasize; u_int sectorsize; TAILQ_HEAD(, bio) queue; struct mtx lock; u_char magic[16]; struct g_consumer *path; struct g_consumer *opath; int waiting; int cr, cw, ce; }; /* * This function is called whenever we need to select a new path. 
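 */

/*
 * Editorial example, referring to flash_register_slicer() above (not
 * part of this commit): a slicer fills in a caller-provided array of
 * struct flash_slice and returns 0 on success, as g_flashmap_load()
 * expects.  All names and values below are illustrative assumptions;
 * the registration type constant is assumed to be one of the indices
 * matching the g_flashmap_slicers[] table.
 */
static int
example_slicer(device_t dev, const char *provider,
    struct flash_slice *slices, int *nslices)
{

	/* One 256KB "boot" slice at the start of the device. */
	slices[0].base = 0;
	slices[0].size = 256 * 1024;
	slices[0].label = "boot";
	*nslices = 1;
	return (0);
}

/* Registration, e.g. from a driver attach routine (assumed constant): */
/* flash_register_slicer(example_slicer, FLASH_SLICES_TYPE_CFI, false); */

/*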
*/ static void g_fox_select_path(void *arg, int flag) { struct g_geom *gp; struct g_fox_softc *sc; struct g_consumer *cp1; struct bio *bp; int error; g_topology_assert(); if (flag == EV_CANCEL) return; gp = arg; sc = gp->softc; if (sc->opath != NULL) { /* * First, close the old path entirely. */ printf("Closing old path (%s) on fox (%s)\n", sc->opath->provider->name, gp->name); cp1 = LIST_NEXT(sc->opath, consumer); g_access(sc->opath, -sc->cr, -sc->cw, -(sc->ce + 1)); /* * The attempt to reopen it with a exclusive count */ error = g_access(sc->opath, 0, 0, 1); if (error) { /* * Ok, ditch this consumer, we can't use it. */ printf("Drop old path (%s) on fox (%s)\n", sc->opath->provider->name, gp->name); g_detach(sc->opath); g_destroy_consumer(sc->opath); if (LIST_EMPTY(&gp->consumer)) { /* No consumers left */ g_wither_geom(gp, ENXIO); for (;;) { bp = TAILQ_FIRST(&sc->queue); if (bp == NULL) break; TAILQ_REMOVE(&sc->queue, bp, bio_queue); bp->bio_error = ENXIO; g_std_done(bp); } return; } } else { printf("Got e-bit on old path (%s) on fox (%s)\n", sc->opath->provider->name, gp->name); } sc->opath = NULL; } else { cp1 = LIST_FIRST(&gp->consumer); } if (cp1 == NULL) cp1 = LIST_FIRST(&gp->consumer); printf("Open new path (%s) on fox (%s)\n", cp1->provider->name, gp->name); error = g_access(cp1, sc->cr, sc->cw, sc->ce); if (error) { /* * If we failed, we take another trip through here */ printf("Open new path (%s) on fox (%s) failed, reselect.\n", cp1->provider->name, gp->name); sc->opath = cp1; g_post_event(g_fox_select_path, gp, M_WAITOK, gp, NULL); } else { printf("Open new path (%s) on fox (%s) succeeded\n", cp1->provider->name, gp->name); mtx_lock(&sc->lock); sc->path = cp1; sc->waiting = 0; for (;;) { bp = TAILQ_FIRST(&sc->queue); if (bp == NULL) break; TAILQ_REMOVE(&sc->queue, bp, bio_queue); g_io_request(bp, sc->path); } mtx_unlock(&sc->lock); } } static void g_fox_orphan(struct g_consumer *cp) { struct g_geom *gp; struct g_fox_softc *sc; int error, mark; g_topology_assert(); gp = cp->geom; sc = gp->softc; printf("Removing path (%s) from fox (%s)\n", cp->provider->name, gp->name); mtx_lock(&sc->lock); if (cp == sc->path) { sc->opath = NULL; sc->path = NULL; sc->waiting = 1; mark = 1; } else { mark = 0; } mtx_unlock(&sc->lock); g_access(cp, -cp->acr, -cp->acw, -cp->ace); error = cp->provider->error; g_detach(cp); g_destroy_consumer(cp); if (!LIST_EMPTY(&gp->consumer)) { if (mark) g_post_event(g_fox_select_path, gp, M_WAITOK, gp, NULL); return; } mtx_destroy(&sc->lock); g_free(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); } static void g_fox_done(struct bio *bp) { struct g_geom *gp; struct g_fox_softc *sc; int error; if (bp->bio_error == 0) { g_std_done(bp); return; } gp = bp->bio_from->geom; sc = gp->softc; if (bp->bio_from != sc->path) { g_io_request(bp, sc->path); return; } mtx_lock(&sc->lock); sc->opath = sc->path; sc->path = NULL; error = g_post_event(g_fox_select_path, gp, M_NOWAIT, gp, NULL); if (error) { bp->bio_error = ENOMEM; g_std_done(bp); } else { sc->waiting = 1; TAILQ_INSERT_TAIL(&sc->queue, bp, bio_queue); } mtx_unlock(&sc->lock); } static void g_fox_start(struct bio *bp) { struct g_geom *gp; struct bio *bp2; struct g_fox_softc *sc; int error; gp = bp->bio_to->geom; sc = gp->softc; if (sc == NULL) { g_io_deliver(bp, ENXIO); return; } switch(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); break; } bp2->bio_offset += sc->sectorsize; bp2->bio_done = g_fox_done; mtx_lock(&sc->lock); if 
(sc->path == NULL || !TAILQ_EMPTY(&sc->queue)) { if (sc->waiting == 0) { error = g_post_event(g_fox_select_path, gp, M_NOWAIT, gp, NULL); if (error) { g_destroy_bio(bp2); bp2 = NULL; g_io_deliver(bp, error); } else { sc->waiting = 1; } } if (bp2 != NULL) TAILQ_INSERT_TAIL(&sc->queue, bp2, bio_queue); } else { g_io_request(bp2, sc->path); } mtx_unlock(&sc->lock); break; default: g_io_deliver(bp, EOPNOTSUPP); break; } return; } static int g_fox_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_fox_softc *sc; struct g_consumer *cp1; int error; g_topology_assert(); gp = pp->geom; sc = gp->softc; if (sc == NULL) { if (dr <= 0 && dw <= 0 && de <= 0) return (0); else return (ENXIO); } if (sc->cr == 0 && sc->cw == 0 && sc->ce == 0) { /* * First open, open all consumers with an exclusive bit */ error = 0; LIST_FOREACH(cp1, &gp->consumer, consumer) { error = g_access(cp1, 0, 0, 1); if (error) { printf("FOX: access(%s,0,0,1) = %d\n", cp1->provider->name, error); break; } } if (error) { LIST_FOREACH(cp1, &gp->consumer, consumer) { if (cp1->ace) g_access(cp1, 0, 0, -1); } return (error); } } if (sc->path == NULL) g_fox_select_path(gp, 0); if (sc->path == NULL) error = ENXIO; else error = g_access(sc->path, dr, dw, de); if (error == 0) { sc->cr += dr; sc->cw += dw; sc->ce += de; if (sc->cr == 0 && sc->cw == 0 && sc->ce == 0) { /* * Last close, remove e-bit on all consumers */ LIST_FOREACH(cp1, &gp->consumer, consumer) g_access(cp1, 0, 0, -1); } } return (error); } static struct g_geom * g_fox_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_geom *gp, *gp2; struct g_provider *pp2; struct g_consumer *cp, *cp2; struct g_fox_softc *sc, *sc2; int error; u_int sectorsize; u_char *buf; g_trace(G_T_TOPOLOGY, "fox_taste(%s, %s)", mp->name, pp->name); g_topology_assert(); if (!strcmp(pp->geom->class->name, mp->name)) return (NULL); gp = g_new_geomf(mp, "%s.fox", pp->name); gp->softc = g_malloc(sizeof(struct g_fox_softc), M_WAITOK | M_ZERO); sc = gp->softc; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_access(cp, 1, 0, 0); if (error) { g_free(sc); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return(NULL); } do { sectorsize = cp->provider->sectorsize; g_topology_unlock(); buf = g_read_data(cp, 0, sectorsize, NULL); g_topology_lock(); if (buf == NULL) break; if (memcmp(buf, FOX_MAGIC, strlen(FOX_MAGIC))) break; /* * First we need to see if this a new path for an existing fox. */ LIST_FOREACH(gp2, &mp->geom, geom) { sc2 = gp2->softc; if (sc2 == NULL) continue; if (memcmp(buf + 16, sc2->magic, sizeof sc2->magic)) continue; break; } if (gp2 != NULL) { /* * It was. Create a new consumer for that fox, * attach it, and if the fox is open, open this * path with an exclusive count of one. */ printf("Adding path (%s) to fox (%s)\n", pp->name, gp2->name); cp2 = g_new_consumer(gp2); g_attach(cp2, pp); pp2 = LIST_FIRST(&gp2->provider); if (pp2->acr > 0 || pp2->acw > 0 || pp2->ace > 0) { error = g_access(cp2, 0, 0, 1); if (error) { /* * This is bad, or more likely, * the user is doing something stupid */ printf( "WARNING: New path (%s) to fox(%s) not added: %s\n%s", cp2->provider->name, gp2->name, "Could not get exclusive bit.", "WARNING: This indicates a risk of data inconsistency." 
); g_detach(cp2); g_destroy_consumer(cp2); } } break; } printf("Creating new fox (%s)\n", pp->name); sc->path = cp; memcpy(sc->magic, buf + 16, sizeof sc->magic); pp2 = g_new_providerf(gp, "%s", gp->name); pp2->mediasize = sc->mediasize = pp->mediasize - pp->sectorsize; pp2->sectorsize = sc->sectorsize = pp->sectorsize; printf("fox %s lock %p\n", gp->name, &sc->lock); mtx_init(&sc->lock, "fox queue", NULL, MTX_DEF); TAILQ_INIT(&sc->queue); g_error_provider(pp2, 0); } while (0); if (buf != NULL) g_free(buf); g_access(cp, -1, 0, 0); if (!LIST_EMPTY(&gp->provider)) { if (!g_fox_once) { g_fox_once = 1; printf( "WARNING: geom_fox (geom %s) is deprecated, " "use gmultipath instead.\n", gp->name); } return (gp); } g_free(gp->softc); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } static int g_fox_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct g_fox_softc *sc; g_topology_assert(); sc = gp->softc; mtx_destroy(&sc->lock); g_free(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static struct g_class g_fox_class = { .name = FOX_CLASS_NAME, .version = G_VERSION, .taste = g_fox_taste, .destroy_geom = g_fox_destroy_geom, .start = g_fox_start, .spoiled = g_fox_orphan, .orphan = g_fox_orphan, .access= g_fox_access, }; DECLARE_GEOM_CLASS(g_fox_class, g_fox); Index: head/sys/geom/geom_int.h =================================================================== --- head/sys/geom/geom_int.h (revision 326269) +++ head/sys/geom/geom_int.h (revision 326270) @@ -1,85 +1,87 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ LIST_HEAD(class_list_head, g_class); TAILQ_HEAD(g_tailq_head, g_geom); extern int g_collectstats; #define G_STATS_PROVIDERS 1 /* Collect I/O stats for providers */ #define G_STATS_CONSUMERS 2 /* Collect I/O stats for consumers */ extern int g_debugflags; /* * 1 G_T_TOPOLOGY * 2 G_T_BIO * 4 G_T_ACCESS * 8 (unused) * 16 Allow footshooting on rank#1 providers * 32 G_T_DETAILS */ #define G_F_DISKIOCTL 64 #define G_F_CTLDUMP 128 /* geom_dump.c */ void g_confxml(void *, int flag); void g_conf_specific(struct sbuf *sb, struct g_class *mp, struct g_geom *gp, struct g_provider *pp, struct g_consumer *cp); void g_conf_printf_escaped(struct sbuf *sb, const char *fmt, ...); void g_confdot(void *, int flag); void g_conftxt(void *, int flag); /* geom_event.c */ void g_event_init(void); void g_run_events(void); void g_do_wither(void); /* geom_subr.c */ extern struct class_list_head g_classes; extern char *g_wait_event, *g_wait_sim, *g_wait_up, *g_wait_down; void g_wither_washer(void); /* geom_io.c */ void g_io_init(void); void g_io_schedule_down(struct thread *tp); void g_io_schedule_up(struct thread *tp); /* geom_kern.c / geom_kernsim.c */ void g_init(void); extern int g_shutdown; extern int g_notaste; /* geom_ctl.c */ void g_ctl_init(void); Index: head/sys/geom/geom_io.c =================================================================== --- head/sys/geom/geom_io.c (revision 326269) +++ head/sys/geom/geom_io.c (revision 326270) @@ -1,1057 +1,1059 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
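 */

/*
 * Editorial example, referring to the g_debugflags bits listed in
 * geom_int.h above (not part of this commit): g_trace(), defined
 * earlier in this commit, only emits output when one of its level bits
 * is set in the kern.geom.debugflags sysctl.  The function name is
 * illustrative.
 */
static void
example_access_trace(struct g_provider *pp, int dr, int dw, int de)
{

	/* Printed only when bit 4 (G_T_ACCESS) is set via sysctl. */
	g_trace(G_T_ACCESS, "example_access(%s): r%dw%de%d",
	    pp->name, dr, dw, de);
}

/*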
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int g_io_transient_map_bio(struct bio *bp); static struct g_bioq g_bio_run_down; static struct g_bioq g_bio_run_up; /* * Pace is a hint that we've had some trouble recently allocating * bios, so we should back off trying to send I/O down the stack * a bit to let the problem resolve. When pacing, we also turn * off direct dispatch to also reduce memory pressure from I/Os * there, at the expxense of some added latency while the memory * pressures exist. See g_io_schedule_down() for more details * and limitations. */ static volatile u_int pace; static uma_zone_t biozone; /* * The head of the list of classifiers used in g_io_request. * Use g_register_classifier() and g_unregister_classifier() * to add/remove entries to the list. * Classifiers are invoked in registration order. */ static TAILQ_HEAD(g_classifier_tailq, g_classifier_hook) g_classifier_tailq = TAILQ_HEAD_INITIALIZER(g_classifier_tailq); #include static void g_bioq_lock(struct g_bioq *bq) { mtx_lock(&bq->bio_queue_lock); } static void g_bioq_unlock(struct g_bioq *bq) { mtx_unlock(&bq->bio_queue_lock); } #if 0 static void g_bioq_destroy(struct g_bioq *bq) { mtx_destroy(&bq->bio_queue_lock); } #endif static void g_bioq_init(struct g_bioq *bq) { TAILQ_INIT(&bq->bio_queue); mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF); } static struct bio * g_bioq_first(struct g_bioq *bq) { struct bio *bp; bp = TAILQ_FIRST(&bq->bio_queue); if (bp != NULL) { KASSERT((bp->bio_flags & BIO_ONQUEUE), ("Bio not on queue bp=%p target %p", bp, bq)); bp->bio_flags &= ~BIO_ONQUEUE; TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue); bq->bio_queue_length--; } return (bp); } struct bio * g_new_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_new_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return (bp); } struct bio * g_alloc_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_WAITOK | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return (bp); } void g_destroy_bio(struct bio *bp) { #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif uma_zfree(biozone, bp); } struct bio * g_clone_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO); if (bp2 != NULL) { bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. * BIO_UNMAPPED and BIO_VLIST should be inherited, to properly * indicate which way the buffer is passed. * Other bio flags are not suitable for cloning. 
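 */

/*
 * Editorial example of the cloning pattern this flag masking supports
 * (not part of this commit): a start() method clones the parent bio,
 * adjusts only the fields this layer translates, and passes the clone
 * down.  EXAMPLE_HEADER_SIZE and the function name are hypothetical.
 */
#define	EXAMPLE_HEADER_SIZE	512	/* hypothetical on-disk header */

static void
example_start(struct bio *bp)
{
	struct bio *bp2;

	bp2 = g_clone_bio(bp);		/* M_NOWAIT; may fail */
	if (bp2 == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	/* BIO_ORDERED/BIO_UNMAPPED/BIO_VLIST were already inherited. */
	bp2->bio_offset += EXAMPLE_HEADER_SIZE;
	bp2->bio_done = g_std_done;
	g_io_request(bp2, LIST_FIRST(&bp->bio_to->geom->consumer));
}

/*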
*/ bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; if (bp->bio_cmd == BIO_ZONE) bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); /* Inherit classification info from the parent */ bp2->bio_classifier1 = bp->bio_classifier1; bp2->bio_classifier2 = bp->bio_classifier2; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) bp2->bio_track_bp = bp->bio_track_bp; #endif bp->bio_children++; } #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return(bp2); } struct bio * g_duplicate_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST); bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return(bp2); } void g_reset_bio(struct bio *bp) { bzero(bp, sizeof(*bp)); } void g_io_init() { g_bioq_init(&g_bio_run_down); g_bioq_init(&g_bio_run_up); biozone = uma_zcreate("g_bio", sizeof (struct bio), NULL, NULL, NULL, NULL, 0, 0); } int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_getattr(%s)", attr); bp = g_alloc_bio(); bp->bio_cmd = BIO_GETATTR; bp->bio_done = NULL; bp->bio_attribute = attr; bp->bio_length = *len; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "ggetattr"); *len = bp->bio_completed; g_destroy_bio(bp); return (error); } int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd); bp = g_alloc_bio(); bp->bio_cmd = BIO_ZONE; bp->bio_done = NULL; /* * XXX KDM need to handle report zone data. 
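 */

/*
 * Editorial example, referring to g_io_getattr() above (not part of
 * this commit): attributes are string-keyed, synchronous queries
 * against a provider.  A sketch using the "CFI::device" attribute that
 * geom_flashmap probes earlier in this commit; the function name is
 * illustrative.
 */
static device_t
example_backing_device(struct g_consumer *cp)
{
	device_t dev;
	int size;

	size = sizeof(dev);
	if (g_io_getattr("CFI::device", cp, &size, &dev) != 0)
		return (NULL);		/* Attribute not supported. */
	return (dev);
}

/*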
*/ bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args)); if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) bp->bio_length = zone_args->zone_params.report.entries_allocated * sizeof(struct disk_zone_rep_entry); else bp->bio_length = 0; g_io_request(bp, cp); error = biowait(bp, "gzone"); bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args)); g_destroy_bio(bp); return (error); } int g_io_flush(struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name); bp = g_alloc_bio(); bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_done = NULL; bp->bio_attribute = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gflush"); g_destroy_bio(bp); return (error); } static int g_io_check(struct bio *bp) { struct g_consumer *cp; struct g_provider *pp; off_t excess; int error; biotrack(bp, __func__); cp = bp->bio_from; pp = bp->bio_to; /* Fail if access counters dont allow the operation */ switch(bp->bio_cmd) { case BIO_READ: case BIO_GETATTR: if (cp->acr == 0) return (EPERM); break; case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: if (cp->acw == 0) return (EPERM); break; case BIO_ZONE: if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) || (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) { if (cp->acr == 0) return (EPERM); } else if (cp->acw == 0) return (EPERM); break; default: return (EPERM); } /* if provider is marked for error, don't disturb. */ if (pp->error) return (pp->error); if (cp->flags & G_CF_ORPHAN) return (ENXIO); switch(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: /* Zero sectorsize or mediasize is probably a lack of media. */ if (pp->sectorsize == 0 || pp->mediasize == 0) return (ENXIO); /* Reject I/O not on sector boundary */ if (bp->bio_offset % pp->sectorsize) return (EINVAL); /* Reject I/O not integral sector long */ if (bp->bio_length % pp->sectorsize) return (EINVAL); /* Reject requests before or past the end of media. */ if (bp->bio_offset < 0) return (EIO); if (bp->bio_offset > pp->mediasize) return (EIO); /* Truncate requests to the end of providers media. */ excess = bp->bio_offset + bp->bio_length; if (excess > bp->bio_to->mediasize) { KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE == bp->bio_ma_n, ("excess bio %p too short", bp)); excess -= bp->bio_to->mediasize; bp->bio_length -= excess; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE; } if (excess > 0) CTR3(KTR_GEOM, "g_down truncated bio " "%p provider %s by %d", bp, bp->bio_to->name, excess); } /* Deliver zero length transfers right here. */ if (bp->bio_length == 0) { CTR2(KTR_GEOM, "g_down terminated 0-length " "bp %p provider %s", bp, bp->bio_to->name); return (0); } if ((bp->bio_flags & BIO_UNMAPPED) != 0 && (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { if ((error = g_io_transient_map_bio(bp)) >= 0) return (error); } break; default: break; } return (EJUSTRETURN); } /* * bio classification support. * * g_register_classifier() and g_unregister_classifier() * are used to add/remove a classifier from the list. * The list is protected using the g_bio_run_down lock, * because the classifiers are called in this path. * * g_io_request() passes bio's that are not already classified * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers(). 
* Classifiers can store their result in the two fields * bio_classifier1 and bio_classifier2. * A classifier that updates one of the fields should * return a non-zero value. * If no classifier updates the field, g_run_classifiers() sets * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls. */ int g_register_classifier(struct g_classifier_hook *hook) { g_bioq_lock(&g_bio_run_down); TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link); g_bioq_unlock(&g_bio_run_down); return (0); } void g_unregister_classifier(struct g_classifier_hook *hook) { struct g_classifier_hook *entry; g_bioq_lock(&g_bio_run_down); TAILQ_FOREACH(entry, &g_classifier_tailq, link) { if (entry == hook) { TAILQ_REMOVE(&g_classifier_tailq, hook, link); break; } } g_bioq_unlock(&g_bio_run_down); } static void g_run_classifiers(struct bio *bp) { struct g_classifier_hook *hook; int classified = 0; biotrack(bp, __func__); TAILQ_FOREACH(hook, &g_classifier_tailq, link) classified |= hook->func(hook->arg, bp); if (!classified) bp->bio_classifier1 = BIO_NOTCLASSIFIED; } void g_io_request(struct bio *bp, struct g_consumer *cp) { struct g_provider *pp; struct mtx *mtxp; int direct, error, first; uint8_t cmd; biotrack(bp, __func__); KASSERT(cp != NULL, ("NULL cp in g_io_request")); KASSERT(bp != NULL, ("NULL bp in g_io_request")); pp = cp->provider; KASSERT(pp != NULL, ("consumer not attached in g_io_request")); #ifdef DIAGNOSTIC KASSERT(bp->bio_driver1 == NULL, ("bio_driver1 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_driver2 == NULL, ("bio_driver2 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_pflags == 0, ("bio_pflags used by the consumer (geom %s)", cp->geom->name)); /* * Remember consumer's private fields, so we can detect if they were * modified by the provider. */ bp->_bio_caller1 = bp->bio_caller1; bp->_bio_caller2 = bp->bio_caller2; bp->_bio_cflags = bp->bio_cflags; #endif cmd = bp->bio_cmd; if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) { KASSERT(bp->bio_data != NULL, ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_DELETE || cmd == BIO_FLUSH) { KASSERT(bp->bio_data == NULL, ("non-NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) { KASSERT(bp->bio_offset % cp->provider->sectorsize == 0, ("wrong offset %jd for sectorsize %u", bp->bio_offset, cp->provider->sectorsize)); KASSERT(bp->bio_length % cp->provider->sectorsize == 0, ("wrong length %jd for sectorsize %u", bp->bio_length, cp->provider->sectorsize)); } g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd); bp->bio_from = cp; bp->bio_to = pp; bp->bio_error = 0; bp->bio_completed = 0; KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&bp->bio_t0); else getbinuptime(&bp->bio_t0); #ifdef GET_STACK_USAGE direct = (cp->flags & G_CF_DIRECT_SEND) != 0 && (pp->flags & G_PF_DIRECT_RECEIVE) != 0 && !g_is_geom_thread(curthread) && ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) && pace == 0; if (direct) { /* Block direct execution if less then half of stack left. 
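 */

/*
 * Editorial example of the classifier interface documented above (not
 * part of this commit): a hook's func/arg fields are those invoked by
 * g_run_classifiers(); the names here are illustrative.  A classifier
 * returns non-zero only when it stored a result in bio_classifier1/2.
 */
static int
example_classify(void *arg, struct bio *bp)
{

	if (bp->bio_cmd != BIO_READ)
		return (0);		/* Not ours; try the next hook. */
	bp->bio_classifier1 = arg;	/* Stash our cookie. */
	return (1);
}

static struct g_classifier_hook example_hook = {
	.func	= example_classify,
	.arg	= &example_hook,	/* Any per-hook cookie. */
};

/* Registered once, e.g.: g_register_classifier(&example_hook); */

/*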
*/ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) { g_bioq_lock(&g_bio_run_down); g_run_classifiers(bp); g_bioq_unlock(&g_bio_run_down); } /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. */ mtxp = mtx_pool_find(mtxpool_sleep, pp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_start_transaction(pp->stat, &bp->bio_t0); if (g_collectstats & G_STATS_CONSUMERS) devstat_start_transaction(cp->stat, &bp->bio_t0); pp->nstart++; cp->nstart++; mtx_unlock(mtxp); if (direct) { error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p " "provider %s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); return; } bp->bio_to->geom->start(bp); } else { g_bioq_lock(&g_bio_run_down); first = TAILQ_EMPTY(&g_bio_run_down.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_down.bio_queue_length++; g_bioq_unlock(&g_bio_run_down); /* Pass it on down. */ if (first) wakeup(&g_wait_down); } } void g_io_deliver(struct bio *bp, int error) { struct bintime now; struct g_consumer *cp; struct g_provider *pp; struct mtx *mtxp; int direct, first; biotrack(bp, __func__); KASSERT(bp != NULL, ("NULL bp in g_io_deliver")); pp = bp->bio_to; KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver")); cp = bp->bio_from; if (cp == NULL) { bp->bio_error = error; bp->bio_done(bp); return; } KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver")); KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver")); #ifdef DIAGNOSTIC /* * Some classes - GJournal in particular - can modify bio's * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO * flag means it's an expected behaviour for that particular geom. */ if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) { KASSERT(bp->bio_caller1 == bp->_bio_caller1, ("bio_caller1 used by the provider %s", pp->name)); KASSERT(bp->bio_caller2 == bp->_bio_caller2, ("bio_caller2 used by the provider %s", pp->name)); KASSERT(bp->bio_cflags == bp->_bio_cflags, ("bio_cflags used by the provider %s", pp->name)); } #endif KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0")); KASSERT(bp->bio_completed <= bp->bio_length, ("bio_completed can't be greater than bio_length")); g_trace(G_T_BIO, "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); /* * XXX: next two doesn't belong here */ bp->bio_bcount = bp->bio_length; bp->bio_resid = bp->bio_bcount - bp->bio_completed; #ifdef GET_STACK_USAGE direct = (pp->flags & G_PF_DIRECT_SEND) && (cp->flags & G_CF_DIRECT_RECEIVE) && !g_is_geom_thread(curthread); if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. 
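 */

/*
 * Editorial sketch of the pool-mutex idiom used here (not part of this
 * commit): no lock is embedded in the provider or consumer; a mutex is
 * borrowed from a global pool, selected by hashing the object pointer,
 * so all updaters of one object converge on the same lock.  The
 * function name is illustrative.
 */
static void
example_account(struct g_provider *pp)
{
	struct mtx *mtxp;

	mtxp = mtx_pool_find(mtxpool_sleep, pp);
	mtx_lock(mtxp);
	pp->nstart++;			/* Any per-object update. */
	mtx_unlock(mtxp);
}

/*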
*/ if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&now); mtxp = mtx_pool_find(mtxpool_sleep, cp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_end_transaction_bio_bt(pp->stat, bp, &now); if (g_collectstats & G_STATS_CONSUMERS) devstat_end_transaction_bio_bt(cp->stat, bp, &now); cp->nend++; pp->nend++; mtx_unlock(mtxp); if (error != ENOMEM) { bp->bio_error = error; if (direct) { biodone(bp); } else { g_bioq_lock(&g_bio_run_up); first = TAILQ_EMPTY(&g_bio_run_up.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_up.bio_queue_length++; g_bioq_unlock(&g_bio_run_up); if (first) wakeup(&g_wait_up); } return; } if (bootverbose) printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name); bp->bio_children = 0; bp->bio_inbed = 0; bp->bio_driver1 = NULL; bp->bio_driver2 = NULL; bp->bio_pflags = 0; g_io_request(bp, cp); pace = 1; return; } SYSCTL_DECL(_kern_geom); static long transient_maps; SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, &transient_maps, 0, "Total count of the transient mapping requests"); u_int transient_map_retries = 10; SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW, &transient_map_retries, 0, "Max count of retries used before giving up on creating transient map"); int transient_map_hard_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD, &transient_map_hard_failures, 0, "Failures to establish the transient mapping due to retry attempts " "exhausted"); int transient_map_soft_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD, &transient_map_soft_failures, 0, "Count of retried failures to establish the transient mapping"); int inflight_transient_maps; SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, &inflight_transient_maps, 0, "Current count of the active transient maps"); static int g_io_transient_map_bio(struct bio *bp) { vm_offset_t addr; long size; u_int retried; KASSERT(unmapped_buf_allowed, ("unmapped disabled")); size = round_page(bp->bio_ma_offset + bp->bio_length); KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp)); addr = 0; retried = 0; atomic_add_long(&transient_maps, 1); retry: if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) { if (transient_map_retries != 0 && retried >= transient_map_retries) { CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s", bp, bp->bio_to->name); atomic_add_int(&transient_map_hard_failures, 1); return (EDEADLK/* XXXKIB */); } else { /* * Naive attempt to quisce the I/O to get more * in-flight requests completed and defragment * the transient_arena. 
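 */

/*
 * Editorial worked example for the sizing in g_io_transient_map_bio(),
 * assuming 4KB pages (values are illustrative): a 12KB (0x3000 byte)
 * request whose buffer starts 0x234 bytes into its first page needs
 * round_page(0x234 + 0x3000) = 0x4000 bytes of KVA, i.e. bio_ma_n == 4
 * pages, and after pmap_qenter() the usable data starts at
 * addr + bio_ma_offset:
 *
 *	size = round_page(0x234 + 0x3000);	// == 0x4000, 4 pages
 *	bp->bio_data = (caddr_t)addr + 0x234;	// == bio_ma_offset
 */

/*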
*/ CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", bp, bp->bio_to->name, retried); pause("g_d_tra", hz / 10); retried++; atomic_add_int(&transient_map_soft_failures, 1); goto retry; } } atomic_add_int(&inflight_transient_maps, 1); pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; bp->bio_flags |= BIO_TRANSIENT_MAPPING; bp->bio_flags &= ~BIO_UNMAPPED; return (EJUSTRETURN); } void g_io_schedule_down(struct thread *tp __unused) { struct bio *bp; int error; for(;;) { g_bioq_lock(&g_bio_run_down); bp = g_bioq_first(&g_bio_run_down); if (bp == NULL) { CTR0(KTR_GEOM, "g_down going to sleep"); msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } CTR0(KTR_GEOM, "g_down has work to do"); g_bioq_unlock(&g_bio_run_down); biotrack(bp, __func__); if (pace != 0) { /* * There has been at least one memory allocation * failure since the last I/O completed. Pause 1ms to * give the system a chance to free up memory. We only * do this once because a large number of allocations * can fail in the direct dispatch case and there's no * relationship between the number of these failures and * the length of the outage. If there's still an outage, * we'll pause again and again until it's * resolved. Older versions paused longer and once per * allocation failure. This was OK for a single threaded * g_down, but with direct dispatch would lead to max of * 10 IOPs for minutes at a time when transient memory * issues prevented allocation for a batch of requests * from the upper layers. * * XXX This pacing is really lame. It needs to be solved * by other methods. This is OK only because the worst * case scenario is so rare. In the worst case scenario * all memory is tied up waiting for I/O to complete * which can never happen since we can't allocate bios * for that I/O. 
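 */

/*
 * Editorial summary of the pacing handshake, collected from the two
 * sides that implement it (not part of this commit): g_io_deliver()
 * above sets the hint when an I/O completes with ENOMEM, and the
 * g_down loop below consumes it:
 *
 *	pace = 1;			// producer: allocation trouble
 *	...
 *	if (pace != 0) {		// consumer: back off ~1ms, once
 *		pause("g_down", min(hz / 1000, 1));
 *		pace = 0;
 *	}
 */

/*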
*/ CTR0(KTR_GEOM, "g_down pacing self"); pause("g_down", min(hz/1000, 1)); pace = 0; } CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp, bp->bio_to->name); error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider " "%s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); continue; } THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); bp->bio_to->geom->start(bp); THREAD_SLEEPING_OK(); } } void g_io_schedule_up(struct thread *tp __unused) { struct bio *bp; for(;;) { g_bioq_lock(&g_bio_run_up); bp = g_bioq_first(&g_bio_run_up); if (bp == NULL) { CTR0(KTR_GEOM, "g_up going to sleep"); msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } g_bioq_unlock(&g_bio_run_up); THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off " "%jd len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); biodone(bp); THREAD_SLEEPING_OK(); } } void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error) { struct bio *bp; void *ptr; int errorc; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_read_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_READ; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; ptr = g_malloc(length, M_WAITOK); bp->bio_data = ptr; g_io_request(bp, cp); errorc = biowait(bp, "gread"); if (error != NULL) *error = errorc; g_destroy_bio(bp); if (errorc) { g_free(ptr); ptr = NULL; } return (ptr); } int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_write_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_WRITE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "gwrite"); g_destroy_bio(bp); return (error); } int g_delete_data(struct g_consumer *cp, off_t offset, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize, ("g_delete_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_DELETE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gdelete"); g_destroy_bio(bp); return (error); } void g_print_bio(struct bio *bp) { const char *pname, *cmd = NULL; if (bp->bio_to != NULL) pname = bp->bio_to->name; else pname = "[unknown]"; switch (bp->bio_cmd) { case BIO_GETATTR: cmd = "GETATTR"; printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute); return; case BIO_FLUSH: cmd = "FLUSH"; printf("%s[%s]", pname, cmd); return; case BIO_ZONE: { char *subcmd = NULL; cmd = "ZONE"; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: subcmd = "OPEN"; break; case DISK_ZONE_CLOSE: subcmd = "CLOSE"; break; case DISK_ZONE_FINISH: subcmd = "FINISH"; break; case DISK_ZONE_RWP: subcmd = "RWP"; break; case DISK_ZONE_REPORT_ZONES: subcmd = "REPORT ZONES"; break; case DISK_ZONE_GET_PARAMS: subcmd = "GET PARAMS"; break; default: subcmd = "UNKNOWN"; break; } printf("%s[%s,%s]", pname, cmd, subcmd); return; } case BIO_READ: cmd = "READ"; break; case BIO_WRITE: cmd = "WRITE"; break; case BIO_DELETE: cmd = "DELETE"; break; default: cmd = "UNKNOWN"; printf("%s[%s()]", pname, cmd); return; } 
printf("%s[%s(offset=%jd, length=%jd)]", pname, cmd, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); } Index: head/sys/geom/geom_kern.c =================================================================== --- head/sys/geom/geom_kern.c (revision 326269) +++ head/sys/geom/geom_kern.c (revision 326270) @@ -1,229 +1,231 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_GEOM, "GEOM", "Geom data structures"); struct sx topology_lock; static struct proc *g_proc; static struct thread *g_up_td; static struct thread *g_down_td; static struct thread *g_event_td; int g_debugflags; int g_collectstats = 1; int g_shutdown; int g_notaste; /* * G_UP and G_DOWN are the two threads which push I/O through the * stack. * * Things are procesed in a FIFO order, but these threads could be * part of I/O prioritization by deciding which bios/bioqs to service * in what order. * * We have only one thread in each direction, it is believed that until * a very non-trivial workload in the UP/DOWN path this will be enough, * but more than one can actually be run without problems. * * Holding the "mymutex" is a debugging feature: It prevents people * from sleeping in the UP/DOWN I/O path by mistake or design (doing * so almost invariably result in deadlocks since it stalls all I/O * processing in the given direction. 
*/ static void g_up_procbody(void *arg) { thread_lock(g_up_td); sched_prio(g_up_td, PRIBIO); thread_unlock(g_up_td); for(;;) { g_io_schedule_up(g_up_td); } } static void g_down_procbody(void *arg) { thread_lock(g_down_td); sched_prio(g_down_td, PRIBIO); thread_unlock(g_down_td); for(;;) { g_io_schedule_down(g_down_td); } } static void g_event_procbody(void *arg) { thread_lock(g_event_td); sched_prio(g_event_td, PRIBIO); thread_unlock(g_event_td); g_run_events(); /* NOTREACHED */ } int g_is_geom_thread(struct thread *td) { return (td == g_up_td || td == g_down_td || td == g_event_td); } static void geom_shutdown(void *foo __unused) { g_shutdown = 1; } void g_init(void) { g_trace(G_T_TOPOLOGY, "g_ignition"); sx_init(&topology_lock, "GEOM topology"); g_io_init(); g_event_init(); g_ctl_init(); kproc_kthread_add(g_event_procbody, NULL, &g_proc, &g_event_td, RFHIGHPID, 0, "geom", "g_event"); kproc_kthread_add(g_up_procbody, NULL, &g_proc, &g_up_td, RFHIGHPID, 0, "geom", "g_up"); kproc_kthread_add(g_down_procbody, NULL, &g_proc, &g_down_td, RFHIGHPID, 0, "geom", "g_down"); EVENTHANDLER_REGISTER(shutdown_pre_sync, geom_shutdown, NULL, SHUTDOWN_PRI_FIRST); } static int sysctl_kern_geom_conftxt(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_conftxt, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } static int sysctl_kern_geom_confdot(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_confdot, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } static int sysctl_kern_geom_confxml(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_confxml, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } SYSCTL_NODE(_kern, OID_AUTO, geom, CTLFLAG_RW, 0, "GEOMetry management"); SYSCTL_PROC(_kern_geom, OID_AUTO, confxml, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_kern_geom_confxml, "", "Dump the GEOM config in XML"); SYSCTL_PROC(_kern_geom, OID_AUTO, confdot, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_kern_geom_confdot, "", "Dump the GEOM config in dot"); SYSCTL_PROC(_kern_geom, OID_AUTO, conftxt, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_kern_geom_conftxt, "", "Dump the GEOM config in txt"); SYSCTL_INT(_kern_geom, OID_AUTO, debugflags, CTLFLAG_RWTUN, &g_debugflags, 0, "Set various trace levels for GEOM debugging"); SYSCTL_INT(_kern_geom, OID_AUTO, notaste, CTLFLAG_RW, &g_notaste, 0, "Prevent GEOM tasting"); SYSCTL_INT(_kern_geom, OID_AUTO, collectstats, CTLFLAG_RW, &g_collectstats, 0, "Control statistics collection on GEOM providers and consumers"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_class, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_class), "sizeof(struct g_class)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_geom, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_geom), "sizeof(struct g_geom)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_provider, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_provider), "sizeof(struct g_provider)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_consumer, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_consumer), "sizeof(struct g_consumer)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, g_bioq, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct g_bioq), "sizeof(struct g_bioq)"); Index: head/sys/geom/geom_map.c =================================================================== --- head/sys/geom/geom_map.c 
(revision 326269) +++ head/sys/geom/geom_map.c (revision 326270) @@ -1,407 +1,409 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010-2011 Aleksandr Rybalko * based on geom_redboot.c * Copyright (c) 2009 Sam Leffler, Errno Consulting * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MAP_CLASS_NAME "MAP" #define MAP_MAXSLICE 64 #define MAP_MAX_MARKER_LEN 64 struct g_map_softc { off_t offset[MAP_MAXSLICE]; /* offset in flash */ off_t size[MAP_MAXSLICE]; /* image size in bytes */ off_t entry[MAP_MAXSLICE]; off_t dsize[MAP_MAXSLICE]; uint8_t readonly[MAP_MAXSLICE]; g_access_t *parent_access; }; static int g_map_access(struct g_provider *pp, int dread, int dwrite, int dexcl) { struct g_geom *gp; struct g_slicer *gsp; struct g_map_softc *sc; gp = pp->geom; gsp = gp->softc; sc = gsp->softc; if (dwrite > 0 && sc->readonly[pp->index]) return (EPERM); return (sc->parent_access(pp, dread, dwrite, dexcl)); } static int g_map_start(struct bio *bp) { struct g_provider *pp; struct g_geom *gp; struct g_map_softc *sc; struct g_slicer *gsp; int idx; pp = bp->bio_to; idx = pp->index; gp = pp->geom; gsp = gp->softc; sc = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr_int(bp, MAP_CLASS_NAME "::entry", sc->entry[idx])) { return (1); } if (g_handleattr_int(bp, MAP_CLASS_NAME "::dsize", sc->dsize[idx])) { return (1); } } return (0); } static void g_map_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct g_map_softc *sc; struct g_slicer *gsp; gsp = gp->softc; sc = gsp->softc; g_slice_dumpconf(sb, indent, gp, cp, pp); if (pp != NULL) { if (indent == NULL) { sbuf_printf(sb, " entry %jd", (intmax_t)sc->entry[pp->index]); sbuf_printf(sb, " dsize %jd", (intmax_t)sc->dsize[pp->index]); } else { sbuf_printf(sb, "%s<entry>%jd</entry>\n", indent, (intmax_t)sc->entry[pp->index]); sbuf_printf(sb, "%s<dsize>%jd</dsize>\n", indent, (intmax_t)sc->dsize[pp->index]); } } } static int find_marker(struct g_consumer *cp, const char *line, off_t *offset) {
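	/*
	 * Descriptive note (editor's addition): "line" is either a plain
	 * number ("0x10000") or a "search:<start>:<step>:<key>" spec.  In
	 * the latter case the media is scanned from <start> in increments
	 * of <step> for <key>, where a '.' in the key matches any byte.
	 */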
off_t search_start, search_offset, search_step; size_t sectorsize; uint8_t *buf; char *op, key[MAP_MAX_MARKER_LEN], search_key[MAP_MAX_MARKER_LEN]; int ret, c; /* Try to convert to a number first */ *offset = strtouq(line, &op, 0); if (*op == '\0') return (0); bzero(search_key, MAP_MAX_MARKER_LEN); sectorsize = cp->provider->sectorsize; #ifdef __LP64__ ret = sscanf(line, "search:%li:%li:%63c", &search_start, &search_step, search_key); #else ret = sscanf(line, "search:%qi:%qi:%63c", &search_start, &search_step, search_key); #endif if (ret < 3) return (1); if (bootverbose) { printf("MAP: search %s for key \"%s\" from 0x%jx, step 0x%jx\n", cp->geom->name, search_key, (intmax_t)search_start, (intmax_t)search_step); } /* error if search_key is empty */ if (strlen(search_key) < 1) return (1); /* sscanf succeeded; start the marker search */ for (search_offset = search_start; search_offset < cp->provider->mediasize; search_offset += search_step) { g_topology_unlock(); buf = g_read_data(cp, rounddown(search_offset, sectorsize), roundup(strlen(search_key), sectorsize), NULL); g_topology_lock(); /* * Don't bother doing the rest if buf == NULL, e.g. dereferencing * it to assemble 'key'. */ if (buf == NULL) continue; /* Wildcard: replace '.' with the corresponding byte from the data */ /* TODO: add support for the wildcard escape '\.' */ strncpy(key, search_key, MAP_MAX_MARKER_LEN); for (c = 0; c < MAP_MAX_MARKER_LEN && key[c]; c++) { if (key[c] == '.') { key[c] = ((char *)(buf + (search_offset % sectorsize)))[c]; } } /* Assume buf != NULL here */ if (memcmp(buf + search_offset % sectorsize, key, strlen(search_key)) == 0) { g_free(buf); /* Marker found, so return its offset */ *offset = search_offset; return (0); } g_free(buf); } /* Marker not found */ return (1); } static int g_map_parse_part(struct g_class *mp, struct g_provider *pp, struct g_consumer *cp, struct g_geom *gp, struct g_map_softc *sc, int i) { const char *value, *name; char *op; off_t start, end, offset, size, dsize; int readonly, ret; /* hint.map.0.at="cfid0" - bind to cfid0 media */ if (resource_string_value("map", i, "at", &value) != 0) return (1); /* Check if this is the correct provider */ if (strcmp(pp->name, value) != 0) return (1); /* * hint.map.0.name="uboot" - name of partition, will be available * as "/dev/map/uboot" */ if (resource_string_value("map", i, "name", &name) != 0) { if (bootverbose) printf("MAP: hint.map.%d has no name\n", i); return (1); } /* * hint.map.0.start="0x00010000" - partition starts at 0x00010000, * or hint.map.0.start="search:0x00010000:0x200:marker text" - * search for the text "marker text", beginning at 0x10000 in steps * of 0x200, until the marker is found or the end of the media is * reached */ if (resource_string_value("map", i, "start", &value) != 0) { if (bootverbose) printf("MAP: \"%s\" has no start value\n", name); return (1); } if (find_marker(cp, value, &start) != 0) { if (bootverbose) { printf("MAP: \"%s\" can't parse/use start value\n", name); } return (1); } /* like "start" */ if (resource_string_value("map", i, "end", &value) != 0) { if (bootverbose) printf("MAP: \"%s\" has no end value\n", name); return (1); } if (find_marker(cp, value, &end) != 0) { if (bootverbose) { printf("MAP: \"%s\" can't parse/use end value\n", name); } return (1); } /* the optional "readonly" variable disables write access */ if (resource_int_value("map", i, "readonly", &readonly) != 0) readonly = 0; /* offset of partition data from the beginning of the partition */ if (resource_string_value("map", i, "offset", &value) == 0) { offset = strtouq(value, &op, 0); if (*op != '\0') { if (bootverbose) {
printf("MAP: \"%s\" can't parse offset\n", name); } return (1); } } else { offset = 0; } /* partition data size */ if (resource_string_value("map", i, "dsize", &value) == 0) { dsize = strtouq(value, &op, 0); if (*op != '\0') { if (bootverbose) { printf("MAP: \"%s\" can't parse dsize\n", name); } return (1); } } else { dsize = 0; } size = end - start; if (dsize == 0) dsize = size - offset; /* end is 0 or size is 0, No MAP - so next */ if (end < start) { if (bootverbose) { printf("MAP: \"%s\", \"end\" less than " "\"start\"\n", name); } return (1); } if (offset + dsize > size) { if (bootverbose) { printf("MAP: \"%s\", \"dsize\" bigger than " "partition - offset\n", name); } return (1); } ret = g_slice_config(gp, i, G_SLICE_CONFIG_SET, start + offset, dsize, cp->provider->sectorsize, "map/%s", name); if (ret != 0) { if (bootverbose) { printf("MAP: g_slice_config returns %d for \"%s\"\n", ret, name); } return (1); } if (bootverbose) { printf("MAP: %s: %jxx%jx, data=%jxx%jx " "\"/dev/map/%s\"\n", cp->geom->name, (intmax_t)start, (intmax_t)size, (intmax_t)offset, (intmax_t)dsize, name); } sc->offset[i] = start; sc->size[i] = size; sc->entry[i] = offset; sc->dsize[i] = dsize; sc->readonly[i] = readonly ? 1 : 0; return (0); } static struct g_geom * g_map_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_map_softc *sc; struct g_consumer *cp; struct g_geom *gp; int i; g_trace(G_T_TOPOLOGY, "map_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); if (strcmp(pp->geom->class->name, MAP_CLASS_NAME) == 0) return (NULL); gp = g_slice_new(mp, MAP_MAXSLICE, pp, &cp, &sc, sizeof(*sc), g_map_start); if (gp == NULL) return (NULL); /* interpose our access method */ sc->parent_access = gp->access; gp->access = g_map_access; for (i = 0; i < MAP_MAXSLICE; i++) g_map_parse_part(mp, pp, cp, gp, sc, i); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { if (bootverbose) printf("MAP: No valid partition found at %s\n", pp->name); g_slice_spoiled(cp); return (NULL); } return (gp); } static void g_map_config(struct gctl_req *req, struct g_class *mp, const char *verb) { struct g_geom *gp; g_topology_assert(); gp = gctl_get_geom(req, mp, "geom"); if (gp == NULL) return; gctl_error(req, "Unknown verb"); } static struct g_class g_map_class = { .name = MAP_CLASS_NAME, .version = G_VERSION, .taste = g_map_taste, .dumpconf = g_map_dumpconf, .ctlreq = g_map_config, }; DECLARE_GEOM_CLASS(g_map_class, g_map); Index: head/sys/geom/geom_mbr.c =================================================================== --- head/sys/geom/geom_mbr.c (revision 326269) +++ head/sys/geom/geom_mbr.c (revision 326270) @@ -1,528 +1,530 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_mbr, "GEOM DOS/MBR partitioning support"); #define MBR_CLASS_NAME "MBR" #define MBREXT_CLASS_NAME "MBREXT" static int g_mbr_once = 0; static struct dos_partition historical_bogus_partition_table[NDOSPART] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, }, }; static struct dos_partition historical_bogus_partition_table_fixed[NDOSPART] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0x80, 0, 1, 0, DOSPTYP_386BSD, 254, 255, 255, 0, 50000, }, }; static void g_mbr_print(int i, struct dos_partition *dp) { printf("[%d] f:%02x typ:%d", i, dp->dp_flag, dp->dp_typ); printf(" s(CHS):%d/%d/%d", DPCYL(dp->dp_scyl, dp->dp_ssect), dp->dp_shd, DPSECT(dp->dp_ssect)); printf(" e(CHS):%d/%d/%d", DPCYL(dp->dp_ecyl, dp->dp_esect), dp->dp_ehd, DPSECT(dp->dp_esect)); printf(" s:%d l:%d\n", dp->dp_start, dp->dp_size); } struct g_mbr_softc { int type [NDOSPART]; u_int sectorsize; u_char sec0[512]; u_char slicesum[16]; }; /* * XXX: Add gctl_req arg and give good error msgs. * XXX: Check that length argument does not bring boot code inside any slice. */ static int g_mbr_modify(struct g_geom *gp, struct g_mbr_softc *ms, u_char *sec0, int len __unused) { int i, error; off_t l[NDOSPART]; struct dos_partition ndp[NDOSPART], *dp; MD5_CTX md5sum; g_topology_assert(); if (sec0[0x1fe] != 0x55 && sec0[0x1ff] != 0xaa) return (EBUSY); dp = ndp; for (i = 0; i < NDOSPART; i++) { dos_partition_dec( sec0 + DOSPARTOFF + i * sizeof(struct dos_partition), dp + i); } if ((!bcmp(dp, historical_bogus_partition_table, sizeof historical_bogus_partition_table)) || (!bcmp(dp, historical_bogus_partition_table_fixed, sizeof historical_bogus_partition_table_fixed))) { /* * We will not allow people to write these from "the inside", * since properly self-destructing takes too much code. If * people really want to do this, they cannot have any * providers of this geom open, and in that case they can just * as easily overwrite the MBR in the parent device. */ return(EBUSY); } for (i = 0; i < NDOSPART; i++) { /* * A Protective MBR (PMBR) has a single partition of * type 0xEE spanning the whole disk. Such an MBR * protects a GPT on the disk from MBR tools that * don't know anything about GPT.
We're interpreting * it a bit more loosely: any partition of type 0xEE * is to be skipped as it doesn't contain any data * that we should care about. We still allow other * partitions to be present in the MBR. A PMBR will * be handled correctly anyway. */ if (dp[i].dp_typ == DOSPTYP_PMBR) l[i] = 0; else if (dp[i].dp_flag != 0 && dp[i].dp_flag != 0x80) l[i] = 0; else if (dp[i].dp_typ == 0) l[i] = 0; else l[i] = (off_t)dp[i].dp_size * ms->sectorsize; error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK, (off_t)dp[i].dp_start * ms->sectorsize, l[i], ms->sectorsize, "%ss%d", gp->name, 1 + i); if (error) return (error); } for (i = 0; i < NDOSPART; i++) { ms->type[i] = dp[i].dp_typ; g_slice_config(gp, i, G_SLICE_CONFIG_SET, (off_t)dp[i].dp_start * ms->sectorsize, l[i], ms->sectorsize, "%ss%d", gp->name, 1 + i); } bcopy(sec0, ms->sec0, 512); /* * Calculate MD5 from the first sector and use it to avoid * recursive slice creation. */ MD5Init(&md5sum); MD5Update(&md5sum, ms->sec0, sizeof(ms->sec0)); MD5Final(ms->slicesum, &md5sum); return (0); } static int g_mbr_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td) { struct g_geom *gp; struct g_mbr_softc *ms; struct g_slicer *gsp; struct g_consumer *cp; int error, opened; gp = pp->geom; gsp = gp->softc; ms = gsp->softc; opened = 0; error = 0; switch(cmd) { case DIOCSMBR: { if (!(fflag & FWRITE)) return (EPERM); g_topology_lock(); cp = LIST_FIRST(&gp->consumer); if (cp->acw == 0) { error = g_access(cp, 0, 1, 0); if (error == 0) opened = 1; } if (!error) error = g_mbr_modify(gp, ms, data, 512); if (!error) error = g_write_data(cp, 0, data, 512); if (opened) g_access(cp, 0, -1 , 0); g_topology_unlock(); return(error); } default: return (ENOIOCTL); } } static int g_mbr_start(struct bio *bp) { struct g_provider *pp; struct g_geom *gp; struct g_mbr_softc *mp; struct g_slicer *gsp; int idx; pp = bp->bio_to; idx = pp->index; gp = pp->geom; gsp = gp->softc; mp = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr_int(bp, "MBR::type", mp->type[idx])) return (1); if (g_handleattr_off_t(bp, "MBR::offset", gsp->slices[idx].offset)) return (1); if (g_handleattr(bp, "MBR::slicesum", mp->slicesum, sizeof(mp->slicesum))) return (1); } return (0); } static void g_mbr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct g_mbr_softc *mp; struct g_slicer *gsp; gsp = gp->softc; mp = gsp->softc; g_slice_dumpconf(sb, indent, gp, cp, pp); if (pp != NULL) { if (indent == NULL) sbuf_printf(sb, " ty %d", mp->type[pp->index]); else sbuf_printf(sb, "%s<type>%d</type>\n", indent, mp->type[pp->index]); } } static struct g_geom * g_mbr_taste(struct g_class *mp, struct g_provider *pp, int insist) { struct g_geom *gp; struct g_consumer *cp; int error; struct g_mbr_softc *ms; u_int fwsectors, sectorsize; u_char *buf; u_char hash[16]; MD5_CTX md5sum; g_trace(G_T_TOPOLOGY, "mbr_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); if (!strcmp(pp->geom->class->name, MBR_CLASS_NAME)) return (NULL); gp = g_slice_new(mp, NDOSPART, pp, &cp, &ms, sizeof *ms, g_mbr_start); if (gp == NULL) return (NULL); g_topology_unlock(); do { error = g_getattr("GEOM::fwsectors", cp, &fwsectors); if (error) fwsectors = 17; sectorsize = cp->provider->sectorsize; if (sectorsize < 512) break; ms->sectorsize = sectorsize; buf = g_read_data(cp, 0, sectorsize, NULL); if (buf == NULL) break; /* * Calculate MD5 from the first sector and use it to avoid * recursive slice creation.
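 * If the parent provider already reports an identical "MBR::slicesum"
 * attribute, we are tasting a slice of ourselves and must not slice it
 * again.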
*/ bcopy(buf, ms->sec0, 512); MD5Init(&md5sum); MD5Update(&md5sum, ms->sec0, sizeof(ms->sec0)); MD5Final(ms->slicesum, &md5sum); error = g_getattr("MBR::slicesum", cp, &hash); if (!error && !bcmp(ms->slicesum, hash, sizeof(hash))) { g_free(buf); break; } g_topology_lock(); g_mbr_modify(gp, ms, buf, 512); g_topology_unlock(); g_free(buf); break; } while (0); g_topology_lock(); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } if (!g_mbr_once) { g_mbr_once = 1; printf( "WARNING: geom_mbr (geom %s) is deprecated, " "use gpart instead.\n", gp->name); } return (gp); } static void g_mbr_config(struct gctl_req *req, struct g_class *mp, const char *verb) { struct g_geom *gp; struct g_consumer *cp; struct g_mbr_softc *ms; struct g_slicer *gsp; int opened = 0, error = 0; void *data; int len; g_topology_assert(); gp = gctl_get_geom(req, mp, "geom"); if (gp == NULL) return; if (strcmp(verb, "write MBR")) { gctl_error(req, "Unknown verb"); return; } gsp = gp->softc; ms = gsp->softc; data = gctl_get_param(req, "data", &len); if (data == NULL) return; if (len < 512 || (len % 512)) { gctl_error(req, "Wrong request length"); return; } cp = LIST_FIRST(&gp->consumer); if (cp->acw == 0) { error = g_access(cp, 0, 1, 0); if (error == 0) opened = 1; } if (!error) error = g_mbr_modify(gp, ms, data, len); if (error) gctl_error(req, "conflict with open slices"); if (!error) error = g_write_data(cp, 0, data, len); if (error) gctl_error(req, "sector zero write failed"); if (opened) g_access(cp, 0, -1 , 0); return; } static struct g_class g_mbr_class = { .name = MBR_CLASS_NAME, .version = G_VERSION, .taste = g_mbr_taste, .dumpconf = g_mbr_dumpconf, .ctlreq = g_mbr_config, .ioctl = g_mbr_ioctl, }; DECLARE_GEOM_CLASS(g_mbr_class, g_mbr); #define NDOSEXTPART 32 struct g_mbrext_softc { int type [NDOSEXTPART]; }; static int g_mbrext_start(struct bio *bp) { struct g_provider *pp; struct g_geom *gp; struct g_mbrext_softc *mp; struct g_slicer *gsp; int idx; pp = bp->bio_to; idx = pp->index; gp = pp->geom; gsp = gp->softc; mp = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr_int(bp, "MBR::type", mp->type[idx])) return (1); } return (0); } static void g_mbrext_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct g_mbrext_softc *mp; struct g_slicer *gsp; g_slice_dumpconf(sb, indent, gp, cp, pp); gsp = gp->softc; mp = gsp->softc; if (pp != NULL) { if (indent == NULL) sbuf_printf(sb, " ty %d", mp->type[pp->index]); else sbuf_printf(sb, "%s<type>%d</type>\n", indent, mp->type[pp->index]); } } static struct g_geom * g_mbrext_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_geom *gp; struct g_consumer *cp; int error, i, slice; struct g_mbrext_softc *ms; off_t off; u_char *buf; struct dos_partition dp[4]; u_int fwsectors, sectorsize; g_trace(G_T_TOPOLOGY, "g_mbrext_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); if (strcmp(pp->geom->class->name, MBR_CLASS_NAME)) return (NULL); gp = g_slice_new(mp, NDOSEXTPART, pp, &cp, &ms, sizeof *ms, g_mbrext_start); if (gp == NULL) return (NULL); g_topology_unlock(); off = 0; slice = 0; do { error = g_getattr("MBR::type", cp, &i); if (error || (i != DOSPTYP_EXT && i != DOSPTYP_EXTLBA)) break; error = g_getattr("GEOM::fwsectors", cp, &fwsectors); if (error) fwsectors = 17; sectorsize = cp->provider->sectorsize; if (sectorsize != 512) break; for (;;) { buf = g_read_data(cp, off, sectorsize, NULL); if (buf == NULL) break; if (buf[0x1fe] !=
0x55 && buf[0x1ff] != 0xaa) { g_free(buf); break; } for (i = 0; i < NDOSPART; i++) dos_partition_dec( buf + DOSPARTOFF + i * sizeof(struct dos_partition), dp + i); g_free(buf); if (0 && bootverbose) { printf("MBREXT Slice %d on %s:\n", slice + 5, gp->name); g_mbr_print(0, dp); g_mbr_print(1, dp + 1); } if ((dp[0].dp_flag & 0x7f) == 0 && dp[0].dp_size != 0 && dp[0].dp_typ != 0) { g_topology_lock(); g_slice_config(gp, slice, G_SLICE_CONFIG_SET, (((off_t)dp[0].dp_start) << 9ULL) + off, ((off_t)dp[0].dp_size) << 9ULL, sectorsize, "%*.*s%d", (int)strlen(gp->name) - 1, (int)strlen(gp->name) - 1, gp->name, slice + 5); g_topology_unlock(); ms->type[slice] = dp[0].dp_typ; slice++; } if (dp[1].dp_flag != 0) break; if (dp[1].dp_typ != DOSPTYP_EXT && dp[1].dp_typ != DOSPTYP_EXTLBA) break; if (dp[1].dp_size == 0) break; off = ((off_t)dp[1].dp_start) << 9ULL; } break; } while (0); g_topology_lock(); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } return (gp); } static struct g_class g_mbrext_class = { .name = MBREXT_CLASS_NAME, .version = G_VERSION, .taste = g_mbrext_taste, .dumpconf = g_mbrext_dumpconf, }; DECLARE_GEOM_CLASS(g_mbrext_class, g_mbrext); Index: head/sys/geom/geom_mbr_enc.c =================================================================== --- head/sys/geom/geom_mbr_enc.c (revision 326269) +++ head/sys/geom/geom_mbr_enc.c (revision 326270) @@ -1,73 +1,75 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Functions to encode or decode struct dos_partition into a bytestream * of correct endianness and packing. These functions do no validation * or sanity checking, they only pack/unpack the fields correctly. * * NB! This file must be usable both in kernel and userland. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include void dos_partition_dec(void const *pp, struct dos_partition *d) { unsigned char const *p = pp; d->dp_flag = p[0]; d->dp_shd = p[1]; d->dp_ssect = p[2]; d->dp_scyl = p[3]; d->dp_typ = p[4]; d->dp_ehd = p[5]; d->dp_esect = p[6]; d->dp_ecyl = p[7]; d->dp_start = le32dec(p + 8); d->dp_size = le32dec(p + 12); } void dos_partition_enc(void *pp, struct dos_partition *d) { unsigned char *p = pp; p[0] = d->dp_flag; p[1] = d->dp_shd; p[2] = d->dp_ssect; p[3] = d->dp_scyl; p[4] = d->dp_typ; p[5] = d->dp_ehd; p[6] = d->dp_esect; p[7] = d->dp_ecyl; le32enc(p + 8, d->dp_start); le32enc(p + 12, d->dp_size); } Index: head/sys/geom/geom_redboot.c =================================================================== --- head/sys/geom/geom_redboot.c (revision 326269) +++ head/sys/geom/geom_redboot.c (revision 326270) @@ -1,357 +1,359 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009 Sam Leffler, Errno Consulting * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define REDBOOT_CLASS_NAME "REDBOOT" struct fis_image_desc { uint8_t name[16]; /* null-terminated name */ uint32_t offset; /* offset in flash */ uint32_t addr; /* address in memory */ uint32_t size; /* image size in bytes */ uint32_t entry; /* offset in image for entry point */ uint32_t dsize; /* data size in bytes */ uint8_t pad[256-(16+7*sizeof(uint32_t)+sizeof(void*))]; struct fis_image_desc *next; /* linked list (in memory) */ uint32_t dsum; /* descriptor checksum */ uint32_t fsum; /* checksum over image data */ }; #define FISDIR_NAME "FIS directory" #define REDBCFG_NAME "RedBoot config" #define REDBOOT_NAME "RedBoot" #define REDBOOT_MAXSLICE 64 #define REDBOOT_MAXOFF \ (REDBOOT_MAXSLICE*sizeof(struct fis_image_desc)) struct g_redboot_softc { uint32_t entry[REDBOOT_MAXSLICE]; uint32_t dsize[REDBOOT_MAXSLICE]; uint8_t readonly[REDBOOT_MAXSLICE]; g_access_t *parent_access; }; static void g_redboot_print(int i, struct fis_image_desc *fd) { printf("[%2d] \"%-15.15s\" %08x:%08x", i, fd->name, fd->offset, fd->size); printf(" addr %08x entry %08x\n", fd->addr, fd->entry); printf(" dsize 0x%x dsum 0x%x fsum 0x%x\n", fd->dsize, fd->dsum, fd->fsum); } static int g_redboot_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td) { return (ENOIOCTL); } static int g_redboot_access(struct g_provider *pp, int dread, int dwrite, int dexcl) { struct g_geom *gp = pp->geom; struct g_slicer *gsp = gp->softc; struct g_redboot_softc *sc = gsp->softc; if (dwrite > 0 && sc->readonly[pp->index]) return (EPERM); return (sc->parent_access(pp, dread, dwrite, dexcl)); } static int g_redboot_start(struct bio *bp) { struct g_provider *pp; struct g_geom *gp; struct g_redboot_softc *sc; struct g_slicer *gsp; int idx; pp = bp->bio_to; idx = pp->index; gp = pp->geom; gsp = gp->softc; sc = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr_int(bp, REDBOOT_CLASS_NAME "::entry", sc->entry[idx])) return (1); if (g_handleattr_int(bp, REDBOOT_CLASS_NAME "::dsize", sc->dsize[idx])) return (1); } return (0); } static void g_redboot_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct g_redboot_softc *sc; struct g_slicer *gsp; gsp = gp->softc; sc = gsp->softc; g_slice_dumpconf(sb, indent, gp, cp, pp); if (pp != NULL) { if (indent == NULL) { sbuf_printf(sb, " entry %d", sc->entry[pp->index]); sbuf_printf(sb, " dsize %d", sc->dsize[pp->index]); } else { sbuf_printf(sb, "%s%d\n", indent, sc->entry[pp->index]); sbuf_printf(sb, "%s%d\n", indent, sc->dsize[pp->index]); } } } #include static int nameok(const char name[16]) { int i; /* descriptor names are null-terminated printable ascii */ for (i = 0; i < 15; i++) if (!isprint(name[i])) break; return (name[i] == '\0'); } static struct fis_image_desc * parse_fis_directory(u_char *buf, size_t bufsize, off_t offset, uint32_t offmask) { #define match(a,b) (bcmp(a, b, sizeof(b)-1) == 0) struct fis_image_desc *fd, *efd; struct fis_image_desc *fisdir, *redbcfg; struct fis_image_desc *head, **tail; int i; fd = (struct fis_image_desc *)buf; efd = fd + (bufsize / sizeof(struct fis_image_desc)); #if 0 /* * Find the start of the FIS table. 
*/ while (fd < efd && fd->name[0] != 0xff) fd++; if (fd == efd) return (NULL); if (bootverbose) printf("RedBoot FIS table starts at 0x%jx\n", offset + fd - (struct fis_image_desc *) buf); #endif /* * Scan forward collecting entries in a list. */ fisdir = redbcfg = NULL; *(tail = &head) = NULL; for (i = 0; fd < efd; i++, fd++) { if (fd->name[0] == 0xff) continue; if (match(fd->name, FISDIR_NAME)) fisdir = fd; else if (match(fd->name, REDBCFG_NAME)) redbcfg = fd; if (nameok(fd->name)) { /* * NB: flash address includes platform mapping; * strip it so we have only a flash offset. */ fd->offset &= offmask; if (bootverbose) g_redboot_print(i, fd); *tail = fd; *(tail = &fd->next) = NULL; } } if (fisdir == NULL) { if (bootverbose) printf("No RedBoot FIS table located at %lu\n", (long) offset); return (NULL); } if (redbcfg != NULL && fisdir->offset + fisdir->size == redbcfg->offset) { /* * Merged FIS/RedBoot config directory. */ if (bootverbose) printf("FIS/RedBoot merged at 0x%jx (not yet)\n", offset + fisdir->offset); /* XXX */ } return head; #undef match } static struct g_geom * g_redboot_taste(struct g_class *mp, struct g_provider *pp, int insist) { struct g_geom *gp; struct g_consumer *cp; struct g_redboot_softc *sc; int error, sectorsize, i; struct fis_image_desc *fd, *head; uint32_t offmask; u_int blksize; /* NB: flash block size stored as stripesize */ u_char *buf; off_t offset; const char *value; char *op; offset = 0; if (resource_string_value("redboot", 0, "fisoffset", &value) == 0) { offset = strtouq(value, &op, 0); if (*op != '\0') { offset = 0; } } g_trace(G_T_TOPOLOGY, "redboot_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); if (!strcmp(pp->geom->class->name, REDBOOT_CLASS_NAME)) return (NULL); /* XXX only taste flash providers */ if (strncmp(pp->name, "cfi", 3) && strncmp(pp->name, "flash/spi", 9)) return (NULL); gp = g_slice_new(mp, REDBOOT_MAXSLICE, pp, &cp, &sc, sizeof(*sc), g_redboot_start); if (gp == NULL) return (NULL); /* interpose our access method */ sc->parent_access = gp->access; gp->access = g_redboot_access; sectorsize = cp->provider->sectorsize; blksize = cp->provider->stripesize; if (powerof2(cp->provider->mediasize)) offmask = cp->provider->mediasize-1; else offmask = 0xffffffff; /* XXX */ if (bootverbose) printf("%s: mediasize %ld secsize %d blksize %d offmask 0x%x\n", __func__, (long) cp->provider->mediasize, sectorsize, blksize, offmask); if (sectorsize < sizeof(struct fis_image_desc) || (sectorsize % sizeof(struct fis_image_desc))) return (NULL); g_topology_unlock(); head = NULL; if(offset == 0) offset = cp->provider->mediasize - blksize; again: buf = g_read_data(cp, offset, blksize, NULL); if (buf != NULL) head = parse_fis_directory(buf, blksize, offset, offmask); if (head == NULL && offset != 0) { if (buf != NULL) g_free(buf); offset = 0; /* check the front */ goto again; } g_topology_lock(); if (head == NULL) { if (buf != NULL) g_free(buf); return NULL; } /* * Craft a slice for each entry. 
*/ for (fd = head, i = 0; fd != NULL; fd = fd->next) { if (fd->name[0] == '\0') continue; error = g_slice_config(gp, i, G_SLICE_CONFIG_SET, fd->offset, fd->size, sectorsize, "redboot/%s", fd->name); if (error) printf("%s: g_slice_config returns %d for \"%s\"\n", __func__, error, fd->name); sc->entry[i] = fd->entry; sc->dsize[i] = fd->dsize; /* disallow writing hard-to-recover entries */ sc->readonly[i] = (strcmp(fd->name, FISDIR_NAME) == 0) || (strcmp(fd->name, REDBOOT_NAME) == 0); i++; } g_free(buf); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } return (gp); } static void g_redboot_config(struct gctl_req *req, struct g_class *mp, const char *verb) { struct g_geom *gp; g_topology_assert(); gp = gctl_get_geom(req, mp, "geom"); if (gp == NULL) return; gctl_error(req, "Unknown verb"); } static struct g_class g_redboot_class = { .name = REDBOOT_CLASS_NAME, .version = G_VERSION, .taste = g_redboot_taste, .dumpconf = g_redboot_dumpconf, .ctlreq = g_redboot_config, .ioctl = g_redboot_ioctl, }; DECLARE_GEOM_CLASS(g_redboot_class, g_redboot); Index: head/sys/geom/geom_slice.c =================================================================== --- head/sys/geom/geom_slice.c (revision 326269) +++ head/sys/geom/geom_slice.c (revision 326270) @@ -1,560 +1,562 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include static g_access_t g_slice_access; static g_start_t g_slice_start; static struct g_slicer * g_slice_alloc(unsigned nslice, unsigned scsize) { struct g_slicer *gsp; gsp = g_malloc(sizeof *gsp, M_WAITOK | M_ZERO); if (scsize > 0) gsp->softc = g_malloc(scsize, M_WAITOK | M_ZERO); else gsp->softc = NULL; gsp->slices = g_malloc(nslice * sizeof(struct g_slice), M_WAITOK | M_ZERO); gsp->nslice = nslice; return (gsp); } static void g_slice_free(struct g_geom *gp) { struct g_slicer *gsp; gsp = gp->softc; gp->softc = NULL; /* * We can get multiple spoiled events before wither-washer * detaches our consumer, so this can get called multiple * times. */ if (gsp == NULL) return; g_free(gsp->slices); if (gsp->hotspot != NULL) g_free(gsp->hotspot); if (gsp->softc != NULL) g_free(gsp->softc); g_free(gsp); } static int g_slice_access(struct g_provider *pp, int dr, int dw, int de) { int error; u_int u; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp2; struct g_slicer *gsp; struct g_slice *gsl, *gsl2; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); KASSERT (cp != NULL, ("g_slice_access but no consumer")); gsp = gp->softc; if (dr > 0 || dw > 0 || de > 0) { gsl = &gsp->slices[pp->index]; for (u = 0; u < gsp->nslice; u++) { gsl2 = &gsp->slices[u]; if (gsl2->length == 0) continue; if (u == pp->index) continue; if (gsl->offset + gsl->length <= gsl2->offset) continue; if (gsl2->offset + gsl2->length <= gsl->offset) continue; /* overlap */ pp2 = gsl2->provider; if ((pp->acw + dw) > 0 && pp2->ace > 0) return (EPERM); if ((pp->ace + de) > 0 && pp2->acw > 0) return (EPERM); } } /* On first open, grab an extra "exclusive" bit */ if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0) de++; /* ... and let go of it on last close */ if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1) de--; error = g_access(cp, dr, dw, de); /* * Free the softc if all providers have been closed and this geom * is being removed. */ if (error == 0 && (gp->flags & G_GEOM_WITHER) != 0 && (cp->acr + cp->acw + cp->ace) == 0) g_slice_free(gp); return (error); } /* * XXX: It should be possible to specify here if we should finish all of the * XXX: bio, or only the non-hot bits. This would get messy if there were * XXX: two hot spots in the same bio, so for now we simply finish off the * XXX: entire bio. Modifying hot data on the way to disk is frowned on * XXX: so making that considerably harder is not a bad idea anyway. 
*/ void g_slice_finish_hot(struct bio *bp) { struct bio *bp2; struct g_geom *gp; struct g_consumer *cp; struct g_slicer *gsp; struct g_slice *gsl; int idx; KASSERT(bp->bio_to != NULL, ("NULL bio_to in g_slice_finish_hot(%p)", bp)); KASSERT(bp->bio_from != NULL, ("NULL bio_from in g_slice_finish_hot(%p)", bp)); gp = bp->bio_to->geom; gsp = gp->softc; cp = LIST_FIRST(&gp->consumer); KASSERT(cp != NULL, ("NULL consumer in g_slice_finish_hot(%p)", bp)); idx = bp->bio_to->index; gsl = &gsp->slices[idx]; bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } if (bp2->bio_offset + bp2->bio_length > gsl->length) bp2->bio_length = gsl->length - bp2->bio_offset; bp2->bio_done = g_std_done; bp2->bio_offset += gsl->offset; g_io_request(bp2, cp); return; } static void g_slice_done(struct bio *bp) { KASSERT(bp->bio_cmd == BIO_GETATTR && strcmp(bp->bio_attribute, "GEOM::ident") == 0, ("bio_cmd=0x%x bio_attribute=%s", bp->bio_cmd, bp->bio_attribute)); if (bp->bio_error == 0 && bp->bio_data[0] != '\0') { char idx[8]; /* Add index to the ident received. */ snprintf(idx, sizeof(idx), "s%d", bp->bio_parent->bio_to->index); if (strlcat(bp->bio_data, idx, bp->bio_length) >= bp->bio_length) { bp->bio_error = EFAULT; } } g_std_done(bp); } static void g_slice_start(struct bio *bp) { struct bio *bp2; struct g_provider *pp; struct g_geom *gp; struct g_consumer *cp; struct g_slicer *gsp; struct g_slice *gsl; struct g_slice_hot *ghp; int idx, error; u_int m_index; off_t t; pp = bp->bio_to; gp = pp->geom; gsp = gp->softc; cp = LIST_FIRST(&gp->consumer); idx = pp->index; gsl = &gsp->slices[idx]; switch(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: if (bp->bio_offset > gsl->length) { g_io_deliver(bp, EINVAL); /* XXX: EWHAT ? */ return; } /* * Check if we collide with any hot spaces, and call the * method once if so. */ t = bp->bio_offset + gsl->offset; for (m_index = 0; m_index < gsp->nhotspot; m_index++) { ghp = &gsp->hotspot[m_index]; if (t >= ghp->offset + ghp->length) continue; if (t + bp->bio_length <= ghp->offset) continue; switch(bp->bio_cmd) { case BIO_READ: idx = ghp->ract; break; case BIO_WRITE: idx = ghp->wact; break; case BIO_DELETE: idx = ghp->dact; break; } switch(idx) { case G_SLICE_HOT_ALLOW: /* Fall out and continue normal processing */ continue; case G_SLICE_HOT_DENY: g_io_deliver(bp, EROFS); return; case G_SLICE_HOT_START: error = gsp->start(bp); if (error && error != EJUSTRETURN) g_io_deliver(bp, error); return; case G_SLICE_HOT_CALL: error = g_post_event(gsp->hot, bp, M_NOWAIT, gp, NULL); if (error) g_io_deliver(bp, error); return; } break; } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } if (bp2->bio_offset + bp2->bio_length > gsl->length) bp2->bio_length = gsl->length - bp2->bio_offset; bp2->bio_done = g_std_done; bp2->bio_offset += gsl->offset; g_io_request(bp2, cp); return; case BIO_GETATTR: /* Give the real method a chance to override */ if (gsp->start != NULL && gsp->start(bp)) return; if (!strcmp("GEOM::ident", bp->bio_attribute)) { bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_slice_done; g_io_request(bp2, cp); return; } if (!strcmp("GEOM::kerneldump", bp->bio_attribute)) { struct g_kerneldump *gkd; gkd = (struct g_kerneldump *)bp->bio_data; gkd->offset += gsp->slices[idx].offset; if (gkd->length > gsp->slices[idx].length) gkd->length = gsp->slices[idx].length; /* now, pass it on downwards... 
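 * so that lower layers can clamp the dump window to their own extent.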
*/ } /* FALLTHROUGH */ case BIO_FLUSH: bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_std_done; g_io_request(bp2, cp); break; default: g_io_deliver(bp, EOPNOTSUPP); return; } } void g_slice_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_slicer *gsp; gsp = gp->softc; if (indent == NULL) { sbuf_printf(sb, " i %u", pp->index); sbuf_printf(sb, " o %ju", (uintmax_t)gsp->slices[pp->index].offset); return; } if (pp != NULL) { sbuf_printf(sb, "%s<index>%u</index>\n", indent, pp->index); sbuf_printf(sb, "%s<length>%ju</length>\n", indent, (uintmax_t)gsp->slices[pp->index].length); sbuf_printf(sb, "%s<seclength>%ju</seclength>\n", indent, (uintmax_t)gsp->slices[pp->index].length / 512); sbuf_printf(sb, "%s<offset>%ju</offset>\n", indent, (uintmax_t)gsp->slices[pp->index].offset); sbuf_printf(sb, "%s<secoffset>%ju</secoffset>\n", indent, (uintmax_t)gsp->slices[pp->index].offset / 512); } } int g_slice_config(struct g_geom *gp, u_int idx, int how, off_t offset, off_t length, u_int sectorsize, const char *fmt, ...) { struct g_provider *pp, *pp2; struct g_slicer *gsp; struct g_slice *gsl; va_list ap; struct sbuf *sb; int acc; g_trace(G_T_TOPOLOGY, "g_slice_config(%s, %d, %d)", gp->name, idx, how); g_topology_assert(); gsp = gp->softc; if (idx >= gsp->nslice) return(EINVAL); gsl = &gsp->slices[idx]; pp = gsl->provider; if (pp != NULL) acc = pp->acr + pp->acw + pp->ace; else acc = 0; if (acc != 0 && how != G_SLICE_CONFIG_FORCE) { if (length < gsl->length) return(EBUSY); if (offset != gsl->offset) return(EBUSY); } /* XXX: check offset + length <= MEDIASIZE */ if (how == G_SLICE_CONFIG_CHECK) return (0); gsl->length = length; gsl->offset = offset; gsl->sectorsize = sectorsize; if (length == 0) { if (pp == NULL) return (0); if (bootverbose) printf("GEOM: Deconfigure %s\n", pp->name); g_wither_provider(pp, ENXIO); gsl->provider = NULL; gsp->nprovider--; return (0); } if (pp != NULL) { if (bootverbose) printf("GEOM: Reconfigure %s, start %jd length %jd end %jd\n", pp->name, (intmax_t)offset, (intmax_t)length, (intmax_t)(offset + length - 1)); g_resize_provider(pp, gsl->length); return (0); } sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); pp = g_new_providerf(gp, "%s", sbuf_data(sb)); pp2 = LIST_FIRST(&gp->consumer)->provider; pp->stripesize = pp2->stripesize; pp->stripeoffset = pp2->stripeoffset + offset; if (pp->stripesize > 0) pp->stripeoffset %= pp->stripesize; if (gsp->nhotspot == 0) { pp->flags |= pp2->flags & G_PF_ACCEPT_UNMAPPED; pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; } if (0 && bootverbose) printf("GEOM: Configure %s, start %jd length %jd end %jd\n", pp->name, (intmax_t)offset, (intmax_t)length, (intmax_t)(offset + length - 1)); pp->index = idx; pp->mediasize = gsl->length; pp->sectorsize = gsl->sectorsize; gsl->provider = pp; gsp->nprovider++; g_error_provider(pp, 0); sbuf_delete(sb); return(0); } /* * Configure "hotspots". A hotspot is a piece of the parent device which * this particular slicer cares about for some reason, typically because * it contains meta-data used to configure the slicer. * A hotspot is identified by its index number. The offset and length are * relative to the parent device, and the three "?act" fields specify * what action to take on BIO_READ, BIO_DELETE and BIO_WRITE. * * XXX: There may be a race relative to g_slice_start() here, if an existing * XXX: hotspot is changed while I/O is happening. Should this become a problem * XXX: we can protect the hotspot stuff with a mutex.
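 */

/*
 * Illustrative sketch (editor's addition): a slicer keeping its label in
 * sector 1 could register a hotspot over that sector while tasting, letting
 * reads pass, denying deletes and routing writes to its "hot" event method.
 * The function name and layout are hypothetical.
 */
#if 0
static int
example_conf_label_hotspot(struct g_geom *gp, u_int sectorsize)
{

	/* Hotspot 0 covers one sector starting at offset "sectorsize". */
	return (g_slice_conf_hot(gp, 0, sectorsize, sectorsize,
	    G_SLICE_HOT_ALLOW, G_SLICE_HOT_DENY, G_SLICE_HOT_CALL));
}
#endif

/*
 * g_slice_conf_hot() itself follows.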
*/ int g_slice_conf_hot(struct g_geom *gp, u_int idx, off_t offset, off_t length, int ract, int dact, int wact) { struct g_slicer *gsp; struct g_slice_hot *gsl, *gsl2; struct g_consumer *cp; struct g_provider *pp; g_trace(G_T_TOPOLOGY, "g_slice_conf_hot(%s, idx: %d, off: %jd, len: %jd)", gp->name, idx, (intmax_t)offset, (intmax_t)length); g_topology_assert(); gsp = gp->softc; /* Deny unmapped I/O and direct dispatch if hotspots are used. */ if (gsp->nhotspot == 0) { LIST_FOREACH(pp, &gp->provider, provider) pp->flags &= ~(G_PF_ACCEPT_UNMAPPED | G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE); LIST_FOREACH(cp, &gp->consumer, consumer) cp->flags &= ~(G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE); } gsl = gsp->hotspot; if(idx >= gsp->nhotspot) { gsl2 = g_malloc((idx + 1) * sizeof *gsl2, M_WAITOK | M_ZERO); if (gsp->hotspot != NULL) bcopy(gsp->hotspot, gsl2, gsp->nhotspot * sizeof *gsl2); gsp->hotspot = gsl2; if (gsp->hotspot != NULL) g_free(gsl); gsl = gsl2; gsp->nhotspot = idx + 1; } gsl[idx].offset = offset; gsl[idx].length = length; KASSERT(!((ract | dact | wact) & G_SLICE_HOT_START) || gsp->start != NULL, ("G_SLICE_HOT_START but no slice->start")); /* XXX: check that we _have_ a start function if HOT_START specified */ gsl[idx].ract = ract; gsl[idx].dact = dact; gsl[idx].wact = wact; return (0); } void g_slice_orphan(struct g_consumer *cp) { struct g_geom *gp; g_topology_assert(); gp = cp->geom; g_trace(G_T_TOPOLOGY, "%s(%p/%s)", __func__, cp, gp->name); g_wither_geom(gp, ENXIO); /* * We can safely free the softc now if there are no accesses, * otherwise g_slice_access() will do that after the last close. */ if ((cp->acr + cp->acw + cp->ace) == 0) g_slice_free(gp); } void g_slice_spoiled(struct g_consumer *cp) { g_trace(G_T_TOPOLOGY, "%s(%p/%s)", __func__, cp, cp->geom->name); cp->flags |= G_CF_ORPHAN; g_slice_orphan(cp); } int g_slice_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { g_slice_spoiled(LIST_FIRST(&gp->consumer)); return (0); } struct g_geom * g_slice_new(struct g_class *mp, u_int slices, struct g_provider *pp, struct g_consumer **cpp, void *extrap, int extra, g_slice_start_t *start) { struct g_geom *gp; struct g_slicer *gsp; struct g_consumer *cp; void **vp; int error; g_topology_assert(); vp = (void **)extrap; gp = g_new_geomf(mp, "%s", pp->name); gsp = g_slice_alloc(slices, extra); gsp->start = start; gp->softc = gsp; gp->start = g_slice_start; gp->access = g_slice_access; gp->orphan = g_slice_orphan; gp->spoiled = g_slice_spoiled; if (gp->dumpconf == NULL) gp->dumpconf = g_slice_dumpconf; if (gp->class->destroy_geom == NULL) gp->class->destroy_geom = g_slice_destroy_geom; cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) error = g_access(cp, 1, 0, 0); if (error) { g_wither_geom(gp, ENXIO); return (NULL); } if (extrap != NULL) *vp = gsp->softc; *cpp = cp; return (gp); } Index: head/sys/geom/geom_slice.h =================================================================== --- head/sys/geom/geom_slice.h (revision 326269) +++ head/sys/geom/geom_slice.h (revision 326270) @@ -1,90 +1,92 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. 
* under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _GEOM_GEOM_SLICE_H_ #define _GEOM_GEOM_SLICE_H_ struct g_slice { off_t offset; off_t length; u_int sectorsize; struct g_provider *provider; }; struct g_slice_hot { off_t offset; off_t length; int ract; int dact; int wact; }; typedef int g_slice_start_t (struct bio *bp); struct g_slicer { u_int nslice; u_int nprovider; struct g_slice *slices; u_int nhotspot; struct g_slice_hot *hotspot; void *softc; g_slice_start_t *start; g_event_t *hot; }; g_dumpconf_t g_slice_dumpconf; int g_slice_config(struct g_geom *gp, u_int idx, int how, off_t offset, off_t length, u_int sectorsize, const char *fmt, ...) __printflike(7, 8); void g_slice_spoiled(struct g_consumer *cp); void g_slice_orphan(struct g_consumer *cp); #define G_SLICE_CONFIG_CHECK 0 #define G_SLICE_CONFIG_SET 1 #define G_SLICE_CONFIG_FORCE 2 struct g_geom * g_slice_new(struct g_class *mp, u_int slices, struct g_provider *pp, struct g_consumer **cpp, void *extrap, int extra, g_slice_start_t *start); int g_slice_conf_hot(struct g_geom *gp, u_int idx, off_t offset, off_t length, int ract, int dact, int wact); #define G_SLICE_HOT_ALLOW 1 #define G_SLICE_HOT_DENY 2 #define G_SLICE_HOT_START 4 #define G_SLICE_HOT_CALL 8 int g_slice_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); void g_slice_finish_hot(struct bio *bp); #endif /* _GEOM_GEOM_SLICE_H_ */ Index: head/sys/geom/geom_subr.c =================================================================== --- head/sys/geom/geom_subr.c (revision 326269) +++ head/sys/geom/geom_subr.c (revision 326270) @@ -1,1570 +1,1572 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KDB #include #endif struct class_list_head g_classes = LIST_HEAD_INITIALIZER(g_classes); static struct g_tailq_head geoms = TAILQ_HEAD_INITIALIZER(geoms); char *g_wait_event, *g_wait_up, *g_wait_down, *g_wait_sim; struct g_hh00 { struct g_class *mp; struct g_provider *pp; off_t size; int error; int post; }; /* * This event offers a new class a chance to taste all preexisting providers. */ static void g_load_class(void *arg, int flag) { struct g_hh00 *hh; struct g_class *mp2, *mp; struct g_geom *gp; struct g_provider *pp; g_topology_assert(); if (flag == EV_CANCEL) /* XXX: can't happen ? 
*/ return; if (g_shutdown) return; hh = arg; mp = hh->mp; hh->error = 0; if (hh->post) { g_free(hh); hh = NULL; } g_trace(G_T_TOPOLOGY, "g_load_class(%s)", mp->name); KASSERT(mp->name != NULL && *mp->name != '\0', ("GEOM class has no name")); LIST_FOREACH(mp2, &g_classes, class) { if (mp2 == mp) { printf("The GEOM class %s is already loaded.\n", mp2->name); if (hh != NULL) hh->error = EEXIST; return; } else if (strcmp(mp2->name, mp->name) == 0) { printf("A GEOM class %s is already loaded.\n", mp2->name); if (hh != NULL) hh->error = EEXIST; return; } } LIST_INIT(&mp->geom); LIST_INSERT_HEAD(&g_classes, mp, class); if (mp->init != NULL) mp->init(mp); if (mp->taste == NULL) return; LIST_FOREACH(mp2, &g_classes, class) { if (mp == mp2) continue; LIST_FOREACH(gp, &mp2->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { mp->taste(mp, pp, 0); g_topology_assert(); } } } } static int g_unload_class(struct g_class *mp) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; int error; g_topology_lock(); g_trace(G_T_TOPOLOGY, "g_unload_class(%s)", mp->name); retry: G_VALID_CLASS(mp); LIST_FOREACH(gp, &mp->geom, geom) { /* We refuse to unload if anything is open */ LIST_FOREACH(pp, &gp->provider, provider) if (pp->acr || pp->acw || pp->ace) { g_topology_unlock(); return (EBUSY); } LIST_FOREACH(cp, &gp->consumer, consumer) if (cp->acr || cp->acw || cp->ace) { g_topology_unlock(); return (EBUSY); } /* If the geom is withering, wait for it to finish. */ if (gp->flags & G_GEOM_WITHER) { g_topology_sleep(mp, 1); goto retry; } } /* * We allow unloading if we have no geoms, or a class * method we can use to get rid of them. */ if (!LIST_EMPTY(&mp->geom) && mp->destroy_geom == NULL) { g_topology_unlock(); return (EOPNOTSUPP); } /* Bar new entries */ mp->taste = NULL; mp->config = NULL; LIST_FOREACH(gp, &mp->geom, geom) { error = mp->destroy_geom(NULL, mp, gp); if (error != 0) { g_topology_unlock(); return (error); } } /* Wait for withering to finish. */ for (;;) { gp = LIST_FIRST(&mp->geom); if (gp == NULL) break; KASSERT(gp->flags & G_GEOM_WITHER, ("Non-withering geom in class %s", mp->name)); g_topology_sleep(mp, 1); } G_VALID_CLASS(mp); if (mp->fini != NULL) mp->fini(mp); LIST_REMOVE(mp, class); g_topology_unlock(); return (0); } int g_modevent(module_t mod, int type, void *data) { struct g_hh00 *hh; int error; static int g_ignition; struct g_class *mp; mp = data; if (mp->version != G_VERSION) { printf("GEOM class %s has Wrong version %x\n", mp->name, mp->version); return (EINVAL); } if (!g_ignition) { g_ignition++; g_init(); } error = EOPNOTSUPP; switch (type) { case MOD_LOAD: g_trace(G_T_TOPOLOGY, "g_modevent(%s, LOAD)", mp->name); hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->mp = mp; /* * Once the system is not cold, MOD_LOAD calls will be * from the userland and the g_event thread will be able * to acknowledge their completion. 
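 *
 * For reference, a class normally reaches g_modevent() through the
 * DECLARE_GEOM_CLASS() module glue rather than by calling it itself;
 * a minimal registration would look like this (a sketch only, the
 * EXAMPLE class and g_example_taste are hypothetical):
 *
 *	static struct g_class g_example_class = {
 *		.name = "EXAMPLE",
 *		.version = G_VERSION,
 *		.taste = g_example_taste,
 *	};
 *	DECLARE_GEOM_CLASS(g_example_class, g_example);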
*/ if (cold) { hh->post = 1; error = g_post_event(g_load_class, hh, M_WAITOK, NULL); } else { error = g_waitfor_event(g_load_class, hh, M_WAITOK, NULL); if (error == 0) error = hh->error; g_free(hh); } break; case MOD_UNLOAD: g_trace(G_T_TOPOLOGY, "g_modevent(%s, UNLOAD)", mp->name); error = g_unload_class(mp); if (error == 0) { KASSERT(LIST_EMPTY(&mp->geom), ("Unloaded class (%s) still has geom", mp->name)); } break; } return (error); } static void g_retaste_event(void *arg, int flag) { struct g_class *mp, *mp2; struct g_geom *gp; struct g_hh00 *hh; struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); if (flag == EV_CANCEL) /* XXX: can't happen ? */ return; if (g_shutdown || g_notaste) return; hh = arg; mp = hh->mp; hh->error = 0; if (hh->post) { g_free(hh); hh = NULL; } g_trace(G_T_TOPOLOGY, "g_retaste(%s)", mp->name); LIST_FOREACH(mp2, &g_classes, class) { LIST_FOREACH(gp, &mp2->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (pp->acr || pp->acw || pp->ace) continue; LIST_FOREACH(cp, &pp->consumers, consumers) { if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; } if (cp != NULL) { cp->flags |= G_CF_ORPHAN; g_wither_geom(cp->geom, ENXIO); } mp->taste(mp, pp, 0); g_topology_assert(); } } } } int g_retaste(struct g_class *mp) { struct g_hh00 *hh; int error; if (mp->taste == NULL) return (EINVAL); hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->mp = mp; if (cold) { hh->post = 1; error = g_post_event(g_retaste_event, hh, M_WAITOK, NULL); } else { error = g_waitfor_event(g_retaste_event, hh, M_WAITOK, NULL); if (error == 0) error = hh->error; g_free(hh); } return (error); } struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...) { struct g_geom *gp; va_list ap; struct sbuf *sb; g_topology_assert(); G_VALID_CLASS(mp); sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); gp = g_malloc(sizeof *gp, M_WAITOK | M_ZERO); gp->name = g_malloc(sbuf_len(sb) + 1, M_WAITOK | M_ZERO); gp->class = mp; gp->rank = 1; LIST_INIT(&gp->consumer); LIST_INIT(&gp->provider); LIST_INIT(&gp->aliases); LIST_INSERT_HEAD(&mp->geom, gp, geom); TAILQ_INSERT_HEAD(&geoms, gp, geoms); strcpy(gp->name, sbuf_data(sb)); sbuf_delete(sb); /* Fill in defaults from class */ gp->start = mp->start; gp->spoiled = mp->spoiled; gp->attrchanged = mp->attrchanged; gp->providergone = mp->providergone; gp->dumpconf = mp->dumpconf; gp->access = mp->access; gp->orphan = mp->orphan; gp->ioctl = mp->ioctl; gp->resize = mp->resize; return (gp); } void g_destroy_geom(struct g_geom *gp) { struct g_geom_alias *gap, *gaptmp; g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_destroy_geom(%p(%s))", gp, gp->name); KASSERT(LIST_EMPTY(&gp->consumer), ("g_destroy_geom(%s) with consumer(s) [%p]", gp->name, LIST_FIRST(&gp->consumer))); KASSERT(LIST_EMPTY(&gp->provider), ("g_destroy_geom(%s) with provider(s) [%p]", gp->name, LIST_FIRST(&gp->provider))); g_cancel_event(gp); LIST_REMOVE(gp, geom); TAILQ_REMOVE(&geoms, gp, geoms); LIST_FOREACH_SAFE(gap, &gp->aliases, ga_next, gaptmp) g_free(gap); g_free(gp->name); g_free(gp); } /* * This function is called (repeatedly) until the geom has withered away. 
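 *
 * A class whose geoms should simply go away when the underlying
 * provider disappears can call it from its orphan method; a minimal
 * sketch (the g_example_orphan name is hypothetical):
 *
 *	static void
 *	g_example_orphan(struct g_consumer *cp)
 *	{
 *
 *		g_topology_assert();
 *		g_wither_geom(cp->geom, ENXIO);
 *	}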
*/ void g_wither_geom(struct g_geom *gp, int error) { struct g_provider *pp; g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_wither_geom(%p(%s))", gp, gp->name); if (!(gp->flags & G_GEOM_WITHER)) { gp->flags |= G_GEOM_WITHER; LIST_FOREACH(pp, &gp->provider, provider) if (!(pp->flags & G_PF_ORPHAN)) g_orphan_provider(pp, error); } g_do_wither(); } /* * Convenience function to destroy a particular provider. */ void g_wither_provider(struct g_provider *pp, int error) { pp->flags |= G_PF_WITHER; if (!(pp->flags & G_PF_ORPHAN)) g_orphan_provider(pp, error); } /* * This function is called (repeatedly) until the geom has withered away. */ void g_wither_geom_close(struct g_geom *gp, int error) { struct g_consumer *cp; g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_wither_geom_close(%p(%s))", gp, gp->name); LIST_FOREACH(cp, &gp->consumer, consumer) if (cp->acr || cp->acw || cp->ace) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_wither_geom(gp, error); } /* * This function is called (repeatedly) until we can't wash away any more * withered bits at present. */ void g_wither_washer(void) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_provider *pp, *pp2; struct g_consumer *cp, *cp2; g_topology_assert(); LIST_FOREACH(mp, &g_classes, class) { LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) { if (!(pp->flags & G_PF_WITHER)) continue; if (LIST_EMPTY(&pp->consumers)) g_destroy_provider(pp); } if (!(gp->flags & G_GEOM_WITHER)) continue; LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) { if (LIST_EMPTY(&pp->consumers)) g_destroy_provider(pp); } LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp2) { if (cp->acr || cp->acw || cp->ace) continue; if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); } if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer)) g_destroy_geom(gp); } } } struct g_consumer * g_new_consumer(struct g_geom *gp) { struct g_consumer *cp; g_topology_assert(); G_VALID_GEOM(gp); KASSERT(!(gp->flags & G_GEOM_WITHER), ("g_new_consumer on WITHERing geom(%s) (class %s)", gp->name, gp->class->name)); KASSERT(gp->orphan != NULL, ("g_new_consumer on geom(%s) (class %s) without orphan", gp->name, gp->class->name)); cp = g_malloc(sizeof *cp, M_WAITOK | M_ZERO); cp->geom = gp; cp->stat = devstat_new_entry(cp, -1, 0, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); LIST_INSERT_HEAD(&gp->consumer, cp, consumer); return (cp); } void g_destroy_consumer(struct g_consumer *cp) { struct g_geom *gp; g_topology_assert(); G_VALID_CONSUMER(cp); g_trace(G_T_TOPOLOGY, "g_destroy_consumer(%p)", cp); KASSERT(cp->provider == NULL, ("g_destroy_consumer but attached")); KASSERT(cp->acr == 0, ("g_destroy_consumer with acr")); KASSERT(cp->acw == 0, ("g_destroy_consumer with acw")); KASSERT(cp->ace == 0, ("g_destroy_consumer with ace")); g_cancel_event(cp); gp = cp->geom; LIST_REMOVE(cp, consumer); devstat_remove_entry(cp->stat); g_free(cp); if (gp->flags & G_GEOM_WITHER) g_do_wither(); } static void g_new_provider_event(void *arg, int flag) { struct g_class *mp; struct g_provider *pp; struct g_consumer *cp, *next_cp; g_topology_assert(); if (flag == EV_CANCEL) return; if (g_shutdown) return; pp = arg; G_VALID_PROVIDER(pp); KASSERT(!(pp->flags & G_PF_WITHER), ("g_new_provider_event but withered")); LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, next_cp) { if ((cp->flags & G_CF_ORPHAN) == 0 && cp->geom->attrchanged != NULL) cp->geom->attrchanged(cp, "GEOM::media"); } if (g_notaste) return; LIST_FOREACH(mp,
&g_classes, class) { if (mp->taste == NULL) continue; LIST_FOREACH(cp, &pp->consumers, consumers) if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; if (cp != NULL) continue; mp->taste(mp, pp, 0); g_topology_assert(); } } struct g_provider * g_new_providerf(struct g_geom *gp, const char *fmt, ...) { struct g_provider *pp; struct sbuf *sb; va_list ap; g_topology_assert(); G_VALID_GEOM(gp); KASSERT(gp->access != NULL, ("new provider on geom(%s) without ->access (class %s)", gp->name, gp->class->name)); KASSERT(gp->start != NULL, ("new provider on geom(%s) without ->start (class %s)", gp->name, gp->class->name)); KASSERT(!(gp->flags & G_GEOM_WITHER), ("new provider on WITHERing geom(%s) (class %s)", gp->name, gp->class->name)); sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); pp = g_malloc(sizeof *pp + sbuf_len(sb) + 1, M_WAITOK | M_ZERO); pp->name = (char *)(pp + 1); strcpy(pp->name, sbuf_data(sb)); sbuf_delete(sb); LIST_INIT(&pp->consumers); pp->error = ENXIO; pp->geom = gp; pp->stat = devstat_new_entry(pp, -1, 0, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); LIST_INSERT_HEAD(&gp->provider, pp, provider); g_post_event(g_new_provider_event, pp, M_WAITOK, pp, gp, NULL); return (pp); } void g_error_provider(struct g_provider *pp, int error) { /* G_VALID_PROVIDER(pp); We may not have g_topology */ pp->error = error; } static void g_resize_provider_event(void *arg, int flag) { struct g_hh00 *hh; struct g_class *mp; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp, *cp2; off_t size; g_topology_assert(); if (g_shutdown) return; hh = arg; pp = hh->pp; size = hh->size; g_free(hh); G_VALID_PROVIDER(pp); KASSERT(!(pp->flags & G_PF_WITHER), ("g_resize_provider_event but withered")); g_trace(G_T_TOPOLOGY, "g_resize_provider_event(%p)", pp); LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { gp = cp->geom; if (gp->resize == NULL && size < pp->mediasize) { /* * XXX: g_dev_orphan method does deferred destroying * and it is possible, that other event could already * call the orphan method. Check consumer's flags to * do not schedule it twice. */ if (cp->flags & G_CF_ORPHAN) continue; cp->flags |= G_CF_ORPHAN; cp->geom->orphan(cp); } } pp->mediasize = size; LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { gp = cp->geom; if ((gp->flags & G_GEOM_WITHER) == 0 && gp->resize != NULL) gp->resize(cp); } /* * After resizing, the previously invalid GEOM class metadata * might become valid. This means we should retaste. 
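 *
 * A pass-through class would normally propagate the new size from its
 * resize method; a minimal sketch, assuming a single provider per geom
 * (the g_example_resize name is hypothetical):
 *
 *	static void
 *	g_example_resize(struct g_consumer *cp)
 *	{
 *		struct g_provider *pp;
 *
 *		pp = LIST_FIRST(&cp->geom->provider);
 *		if (pp != NULL)
 *			g_resize_provider(pp, cp->provider->mediasize);
 *	}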
*/ LIST_FOREACH(mp, &g_classes, class) { if (mp->taste == NULL) continue; LIST_FOREACH(cp, &pp->consumers, consumers) if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; if (cp != NULL) continue; mp->taste(mp, pp, 0); g_topology_assert(); } } void g_resize_provider(struct g_provider *pp, off_t size) { struct g_hh00 *hh; G_VALID_PROVIDER(pp); if (pp->flags & G_PF_WITHER) return; if (size == pp->mediasize) return; hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->pp = pp; hh->size = size; g_post_event(g_resize_provider_event, hh, M_WAITOK, NULL); } #ifndef _PATH_DEV #define _PATH_DEV "/dev/" #endif struct g_provider * g_provider_by_name(char const *arg) { struct g_class *cp; struct g_geom *gp; struct g_provider *pp, *wpp; if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) arg += sizeof(_PATH_DEV) - 1; wpp = NULL; LIST_FOREACH(cp, &g_classes, class) { LIST_FOREACH(gp, &cp->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (strcmp(arg, pp->name) != 0) continue; if ((gp->flags & G_GEOM_WITHER) == 0 && (pp->flags & G_PF_WITHER) == 0) return (pp); else wpp = pp; } } } return (wpp); } void g_destroy_provider(struct g_provider *pp) { struct g_geom *gp; g_topology_assert(); G_VALID_PROVIDER(pp); KASSERT(LIST_EMPTY(&pp->consumers), ("g_destroy_provider but attached")); KASSERT(pp->acr == 0, ("g_destroy_provider with acr")); KASSERT(pp->acw == 0, ("g_destroy_provider with acw")); KASSERT(pp->ace == 0, ("g_destroy_provider with ace")); g_cancel_event(pp); LIST_REMOVE(pp, provider); gp = pp->geom; devstat_remove_entry(pp->stat); /* * If a callback was provided, send notification that the provider * is now gone. */ if (gp->providergone != NULL) gp->providergone(pp); g_free(pp); if ((gp->flags & G_GEOM_WITHER)) g_do_wither(); } /* * We keep the "geoms" list sorted by topological order (== increasing * numerical rank) at all times. * When an attach is done, the attaching geom's rank is invalidated * and it is moved to the tail of the list. * All geoms later in the sequence have their ranks reevaluated in * sequence. If we cannot assign a rank to a geom because its * prerequisites do not have a rank, we move that element to the tail * of the sequence with invalid rank as well. * At some point we encounter our original geom and if we still fail * to assign it a rank, there must be a loop and we fail back to * g_attach() which detaches again and calls redo_rank again * to fix up the damage. * It would be much simpler code-wise to do it recursively, but we * can't risk that on the kernel stack.
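 *
 * Worked example: in the stack ada0 (rank 1) <- ada0a (rank 2) <-
 * ada0a.bde (rank 3), each geom's rank is one more than the highest
 * rank among the geoms whose providers it consumes.  An attach that
 * would make a geom an (indirect) prerequisite of itself can never
 * be ranked, so g_attach() fails with ELOOP.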
*/ static int redo_rank(struct g_geom *gp) { struct g_consumer *cp; struct g_geom *gp1, *gp2; int n, m; g_topology_assert(); G_VALID_GEOM(gp); /* Invalidate this geom's rank and move it to the tail */ gp1 = TAILQ_NEXT(gp, geoms); if (gp1 != NULL) { gp->rank = 0; TAILQ_REMOVE(&geoms, gp, geoms); TAILQ_INSERT_TAIL(&geoms, gp, geoms); } else { gp1 = gp; } /* re-rank the rest of the sequence */ for (; gp1 != NULL; gp1 = gp2) { gp1->rank = 0; m = 1; LIST_FOREACH(cp, &gp1->consumer, consumer) { if (cp->provider == NULL) continue; n = cp->provider->geom->rank; if (n == 0) { m = 0; break; } else if (n >= m) m = n + 1; } gp1->rank = m; gp2 = TAILQ_NEXT(gp1, geoms); /* got a rank, moving on */ if (m != 0) continue; /* no rank to original geom means loop */ if (gp == gp1) return (ELOOP); /* no rank, put it at the end and move on */ TAILQ_REMOVE(&geoms, gp1, geoms); TAILQ_INSERT_TAIL(&geoms, gp1, geoms); } return (0); } int g_attach(struct g_consumer *cp, struct g_provider *pp) { int error; g_topology_assert(); G_VALID_CONSUMER(cp); G_VALID_PROVIDER(pp); g_trace(G_T_TOPOLOGY, "g_attach(%p, %p)", cp, pp); KASSERT(cp->provider == NULL, ("attach but attached")); cp->provider = pp; cp->flags &= ~G_CF_ORPHAN; LIST_INSERT_HEAD(&pp->consumers, cp, consumers); error = redo_rank(cp->geom); if (error) { LIST_REMOVE(cp, consumers); cp->provider = NULL; redo_rank(cp->geom); } return (error); } void g_detach(struct g_consumer *cp) { struct g_provider *pp; g_topology_assert(); G_VALID_CONSUMER(cp); g_trace(G_T_TOPOLOGY, "g_detach(%p)", cp); KASSERT(cp->provider != NULL, ("detach but not attached")); KASSERT(cp->acr == 0, ("detach but nonzero acr")); KASSERT(cp->acw == 0, ("detach but nonzero acw")); KASSERT(cp->ace == 0, ("detach but nonzero ace")); KASSERT(cp->nstart == cp->nend, ("detach with active requests")); pp = cp->provider; LIST_REMOVE(cp, consumers); cp->provider = NULL; if ((cp->geom->flags & G_GEOM_WITHER) || (pp->geom->flags & G_GEOM_WITHER) || (pp->flags & G_PF_WITHER)) g_do_wither(); redo_rank(cp->geom); } /* * g_access() * * Access-check with delta values. The question asked is "can consumer * "cp" change the access counters by the relative amounts dc[rwe] ?" */ int g_access(struct g_consumer *cp, int dcr, int dcw, int dce) { struct g_provider *pp; int pr, pw, pe; int error; g_topology_assert(); G_VALID_CONSUMER(cp); pp = cp->provider; KASSERT(pp != NULL, ("access but not attached")); G_VALID_PROVIDER(pp); g_trace(G_T_ACCESS, "g_access(%p(%s), %d, %d, %d)", cp, pp->name, dcr, dcw, dce); KASSERT(cp->acr + dcr >= 0, ("access resulting in negative acr")); KASSERT(cp->acw + dcw >= 0, ("access resulting in negative acw")); KASSERT(cp->ace + dce >= 0, ("access resulting in negative ace")); KASSERT(dcr != 0 || dcw != 0 || dce != 0, ("NOP access request")); KASSERT(pp->geom->access != NULL, ("NULL geom->access")); /* * If our class cares about being spoiled, and we have been, we * are probably just ahead of the event telling us that. Fail * now rather than having to unravel this later. */ if (cp->geom->spoiled != NULL && (cp->flags & G_CF_SPOILED) && (dcr > 0 || dcw > 0 || dce > 0)) return (ENXIO); /* * Figure out what counts the provider would have had, if this * consumer had (r0w0e0) at this time.
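 *
 * Worked example: if pp currently totals r2w1e1 and cp itself holds
 * r1w0e0, the partner counts below come out as pr=1, pw=1, pe=1; a
 * request of dcr=0, dcw=0, dce=1 is then refused with EPERM because
 * another consumer already has write access (dce > 0 && pw > 0).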
*/ pr = pp->acr - cp->acr; pw = pp->acw - cp->acw; pe = pp->ace - cp->ace; g_trace(G_T_ACCESS, "open delta:[r%dw%de%d] old:[r%dw%de%d] provider:[r%dw%de%d] %p(%s)", dcr, dcw, dce, cp->acr, cp->acw, cp->ace, pp->acr, pp->acw, pp->ace, pp, pp->name); /* If foot-shooting is enabled, any open on rank#1 is OK */ if ((g_debugflags & 16) && pp->geom->rank == 1) ; /* If we try exclusive but already write: fail */ else if (dce > 0 && pw > 0) return (EPERM); /* If we try write but already exclusive: fail */ else if (dcw > 0 && pe > 0) return (EPERM); /* If we try to open more but provider is error'ed: fail */ else if ((dcr > 0 || dcw > 0 || dce > 0) && pp->error != 0) { printf("%s(%d): provider %s has error %d set\n", __func__, __LINE__, pp->name, pp->error); return (pp->error); } /* Ok then... */ error = pp->geom->access(pp, dcr, dcw, dce); KASSERT(dcr > 0 || dcw > 0 || dce > 0 || error == 0, ("Geom provider %s::%s dcr=%d dcw=%d dce=%d error=%d failed " "closing ->access()", pp->geom->class->name, pp->name, dcr, dcw, dce, error)); if (!error) { /* * If we open first write, spoil any partner consumers. * If we close last write and provider is not errored, * trigger re-taste. */ if (pp->acw == 0 && dcw != 0) g_spoil(pp, cp); else if (pp->acw != 0 && pp->acw == -dcw && pp->error == 0 && !(pp->geom->flags & G_GEOM_WITHER)) g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL); pp->acr += dcr; pp->acw += dcw; pp->ace += dce; cp->acr += dcr; cp->acw += dcw; cp->ace += dce; if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) KASSERT(pp->sectorsize > 0, ("Provider %s lacks sectorsize", pp->name)); if ((cp->geom->flags & G_GEOM_WITHER) && cp->acr == 0 && cp->acw == 0 && cp->ace == 0) g_do_wither(); } return (error); } int g_handleattr_int(struct bio *bp, const char *attribute, int val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_str(struct bio *bp, const char *attribute, const char *str) { return (g_handleattr(bp, attribute, str, 0)); } int g_handleattr(struct bio *bp, const char *attribute, const void *val, int len) { int error = 0; if (strcmp(bp->bio_attribute, attribute)) return (0); if (len == 0) { bzero(bp->bio_data, bp->bio_length); if (strlcpy(bp->bio_data, val, bp->bio_length) >= bp->bio_length) { printf("%s: %s bio_length %jd len %zu -> EFAULT\n", __func__, bp->bio_to->name, (intmax_t)bp->bio_length, strlen(val)); error = EFAULT; } } else if (bp->bio_length == len) { bcopy(val, bp->bio_data, len); } else { printf("%s: %s bio_length %jd len %d -> EFAULT\n", __func__, bp->bio_to->name, (intmax_t)bp->bio_length, len); error = EFAULT; } if (error == 0) bp->bio_completed = bp->bio_length; g_io_deliver(bp, error); return (1); } int g_std_access(struct g_provider *pp, int dr __unused, int dw __unused, int de __unused) { g_topology_assert(); G_VALID_PROVIDER(pp); return (0); } void g_std_done(struct bio *bp) { struct bio *bp2; bp2 = bp->bio_parent; if (bp2->bio_error == 0) bp2->bio_error = bp->bio_error; bp2->bio_completed += bp->bio_completed; g_destroy_bio(bp); bp2->bio_inbed++; if (bp2->bio_children == bp2->bio_inbed) g_io_deliver(bp2, bp2->bio_error); } /* XXX: maybe this is only g_slice_spoiled */ void g_std_spoiled(struct g_consumer *cp) { struct g_geom *gp; struct g_provider *pp; 
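	/*
	 * Default spoil method: drop the spoiled consumer, orphan our own
	 * providers, and destroy the geom outright if it is already empty,
	 * otherwise leave it to wither; see the XXX above suggesting this
	 * may really belong in g_slice.
	 */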
g_topology_assert(); G_VALID_CONSUMER(cp); g_trace(G_T_TOPOLOGY, "g_std_spoiled(%p)", cp); cp->flags |= G_CF_ORPHAN; g_detach(cp); gp = cp->geom; LIST_FOREACH(pp, &gp->provider, provider) g_orphan_provider(pp, ENXIO); g_destroy_consumer(cp); if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer)) g_destroy_geom(gp); else gp->flags |= G_GEOM_WITHER; } /* * Spoiling happens when a provider is opened for writing, but consumers * which are configured by in-band data are attached (slicers for instance). * Since the write might potentially change the in-band data, such consumers * need to re-evaluate their existence after the writing session closes. * We do this by (offering to) tear them down when the open for write happens * in return for a re-taste when it closes again. * Together with the fact that such consumers grab an 'e' bit whenever they * are open, regardless of mode, this ends up DTRT. */ static void g_spoil_event(void *arg, int flag) { struct g_provider *pp; struct g_consumer *cp, *cp2; g_topology_assert(); if (flag == EV_CANCEL) return; pp = arg; G_VALID_PROVIDER(pp); g_trace(G_T_TOPOLOGY, "%s %p(%s:%s:%s)", __func__, pp, pp->geom->class->name, pp->geom->name, pp->name); for (cp = LIST_FIRST(&pp->consumers); cp != NULL; cp = cp2) { cp2 = LIST_NEXT(cp, consumers); if ((cp->flags & G_CF_SPOILED) == 0) continue; cp->flags &= ~G_CF_SPOILED; if (cp->geom->spoiled == NULL) continue; cp->geom->spoiled(cp); g_topology_assert(); } } void g_spoil(struct g_provider *pp, struct g_consumer *cp) { struct g_consumer *cp2; g_topology_assert(); G_VALID_PROVIDER(pp); G_VALID_CONSUMER(cp); LIST_FOREACH(cp2, &pp->consumers, consumers) { if (cp2 == cp) continue; /* KASSERT(cp2->acr == 0, ("spoiling cp->acr = %d", cp2->acr)); KASSERT(cp2->acw == 0, ("spoiling cp->acw = %d", cp2->acw)); */ KASSERT(cp2->ace == 0, ("spoiling cp->ace = %d", cp2->ace)); cp2->flags |= G_CF_SPOILED; } g_post_event(g_spoil_event, pp, M_WAITOK, pp, NULL); } static void g_media_changed_event(void *arg, int flag) { struct g_provider *pp; int retaste; g_topology_assert(); if (flag == EV_CANCEL) return; pp = arg; G_VALID_PROVIDER(pp); /* * If provider was not open for writing, queue retaste after spoiling. * If it was, retaste will happen automatically on close. 
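 *
 * Removable-media drivers are the intended callers of
 * g_media_changed()/g_media_gone(); an illustrative (hypothetical)
 * fragment from such a driver, with sc->pp the driver's provider:
 *
 *	if (media_present)
 *		g_media_changed(sc->pp, M_WAITOK);
 *	else
 *		g_media_gone(sc->pp, M_WAITOK);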
*/ retaste = (pp->acw == 0 && pp->error == 0 && !(pp->geom->flags & G_GEOM_WITHER)); g_spoil_event(arg, flag); if (retaste) g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL); } int g_media_changed(struct g_provider *pp, int flag) { struct g_consumer *cp; LIST_FOREACH(cp, &pp->consumers, consumers) cp->flags |= G_CF_SPOILED; return (g_post_event(g_media_changed_event, pp, flag, pp, NULL)); } int g_media_gone(struct g_provider *pp, int flag) { struct g_consumer *cp; LIST_FOREACH(cp, &pp->consumers, consumers) cp->flags |= G_CF_SPOILED; return (g_post_event(g_spoil_event, pp, flag, pp, NULL)); } int g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len) { int error, i; i = len; error = g_io_getattr(attr, cp, &i, var); if (error) return (error); if (i != len) return (EINVAL); return (0); } static int g_get_device_prefix_len(const char *name) { int len; if (strncmp(name, "ada", 3) == 0) len = 3; else if (strncmp(name, "ad", 2) == 0) len = 2; else return (0); if (name[len] < '0' || name[len] > '9') return (0); do { len++; } while (name[len] >= '0' && name[len] <= '9'); return (len); } int g_compare_names(const char *namea, const char *nameb) { int deva, devb; if (strcmp(namea, nameb) == 0) return (1); deva = g_get_device_prefix_len(namea); if (deva == 0) return (0); devb = g_get_device_prefix_len(nameb); if (devb == 0) return (0); if (strcmp(namea + deva, nameb + devb) == 0) return (1); return (0); } void g_geom_add_alias(struct g_geom *gp, const char *alias) { struct g_geom_alias *gap; gap = (struct g_geom_alias *)g_malloc( sizeof(struct g_geom_alias) + strlen(alias) + 1, M_WAITOK); strcpy((char *)(gap + 1), alias); gap->ga_alias = (const char *)(gap + 1); LIST_INSERT_HEAD(&gp->aliases, gap, ga_next); } #if defined(DIAGNOSTIC) || defined(DDB) /* * This function walks the mesh and returns a non-zero integer if it * finds the argument pointer is an object. The return value indicates * which type of object it is believed to be. If topology is not locked, * this function is potentially dangerous, but we don't assert that the * topology lock is held when called from debugger. */ int g_valid_obj(void const *ptr) { struct g_class *mp; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; #ifdef KDB if (kdb_active == 0) #endif g_topology_assert(); LIST_FOREACH(mp, &g_classes, class) { if (ptr == mp) return (1); LIST_FOREACH(gp, &mp->geom, geom) { if (ptr == gp) return (2); LIST_FOREACH(cp, &gp->consumer, consumer) if (ptr == cp) return (3); LIST_FOREACH(pp, &gp->provider, provider) if (ptr == pp) return (4); } } return(0); } #endif #ifdef DDB #define gprintf(...) do { \ db_printf("%*s", indent, ""); \ db_printf(__VA_ARGS__); \ } while (0) #define gprintln(...) 
do { \ gprintf(__VA_ARGS__); \ db_printf("\n"); \ } while (0) #define ADDFLAG(obj, flag, sflag) do { \ if ((obj)->flags & (flag)) { \ if (comma) \ strlcat(str, ",", size); \ strlcat(str, (sflag), size); \ comma = 1; \ } \ } while (0) static char * provider_flags_to_string(struct g_provider *pp, char *str, size_t size) { int comma = 0; bzero(str, size); if (pp->flags == 0) { strlcpy(str, "NONE", size); return (str); } ADDFLAG(pp, G_PF_WITHER, "G_PF_WITHER"); ADDFLAG(pp, G_PF_ORPHAN, "G_PF_ORPHAN"); return (str); } static char * geom_flags_to_string(struct g_geom *gp, char *str, size_t size) { int comma = 0; bzero(str, size); if (gp->flags == 0) { strlcpy(str, "NONE", size); return (str); } ADDFLAG(gp, G_GEOM_WITHER, "G_GEOM_WITHER"); return (str); } static void db_show_geom_consumer(int indent, struct g_consumer *cp) { if (indent == 0) { gprintln("consumer: %p", cp); gprintln(" class: %s (%p)", cp->geom->class->name, cp->geom->class); gprintln(" geom: %s (%p)", cp->geom->name, cp->geom); if (cp->provider == NULL) gprintln(" provider: none"); else { gprintln(" provider: %s (%p)", cp->provider->name, cp->provider); } gprintln(" access: r%dw%de%d", cp->acr, cp->acw, cp->ace); gprintln(" flags: 0x%04x", cp->flags); gprintln(" nstart: %u", cp->nstart); gprintln(" nend: %u", cp->nend); } else { gprintf("consumer: %p (%s), access=r%dw%de%d", cp, cp->provider != NULL ? cp->provider->name : "none", cp->acr, cp->acw, cp->ace); if (cp->flags) db_printf(", flags=0x%04x", cp->flags); db_printf("\n"); } } static void db_show_geom_provider(int indent, struct g_provider *pp) { struct g_consumer *cp; char flags[64]; if (indent == 0) { gprintln("provider: %s (%p)", pp->name, pp); gprintln(" class: %s (%p)", pp->geom->class->name, pp->geom->class); gprintln(" geom: %s (%p)", pp->geom->name, pp->geom); gprintln(" mediasize: %jd", (intmax_t)pp->mediasize); gprintln(" sectorsize: %u", pp->sectorsize); gprintln(" stripesize: %u", pp->stripesize); gprintln(" stripeoffset: %u", pp->stripeoffset); gprintln(" access: r%dw%de%d", pp->acr, pp->acw, pp->ace); gprintln(" flags: %s (0x%04x)", provider_flags_to_string(pp, flags, sizeof(flags)), pp->flags); gprintln(" error: %d", pp->error); gprintln(" nstart: %u", pp->nstart); gprintln(" nend: %u", pp->nend); if (LIST_EMPTY(&pp->consumers)) gprintln(" consumers: none"); } else { gprintf("provider: %s (%p), access=r%dw%de%d", pp->name, pp, pp->acr, pp->acw, pp->ace); if (pp->flags != 0) { db_printf(", flags=%s (0x%04x)", provider_flags_to_string(pp, flags, sizeof(flags)), pp->flags); } db_printf("\n"); } if (!LIST_EMPTY(&pp->consumers)) { LIST_FOREACH(cp, &pp->consumers, consumers) { db_show_geom_consumer(indent + 2, cp); if (db_pager_quit) break; } } } static void db_show_geom_geom(int indent, struct g_geom *gp) { struct g_provider *pp; struct g_consumer *cp; char flags[64]; if (indent == 0) { gprintln("geom: %s (%p)", gp->name, gp); gprintln(" class: %s (%p)", gp->class->name, gp->class); gprintln(" flags: %s (0x%04x)", geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags); gprintln(" rank: %d", gp->rank); if (LIST_EMPTY(&gp->provider)) gprintln(" providers: none"); if (LIST_EMPTY(&gp->consumer)) gprintln(" consumers: none"); } else { gprintf("geom: %s (%p), rank=%d", gp->name, gp, gp->rank); if (gp->flags != 0) { db_printf(", flags=%s (0x%04x)", geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags); } db_printf("\n"); } if (!LIST_EMPTY(&gp->provider)) { LIST_FOREACH(pp, &gp->provider, provider) { db_show_geom_provider(indent + 2, pp); if (db_pager_quit) break; 
} } if (!LIST_EMPTY(&gp->consumer)) { LIST_FOREACH(cp, &gp->consumer, consumer) { db_show_geom_consumer(indent + 2, cp); if (db_pager_quit) break; } } } static void db_show_geom_class(struct g_class *mp) { struct g_geom *gp; db_printf("class: %s (%p)\n", mp->name, mp); LIST_FOREACH(gp, &mp->geom, geom) { db_show_geom_geom(2, gp); if (db_pager_quit) break; } } /* * Print the GEOM topology or the given object. */ DB_SHOW_COMMAND(geom, db_show_geom) { struct g_class *mp; if (!have_addr) { /* No address given, print the entire topology. */ LIST_FOREACH(mp, &g_classes, class) { db_show_geom_class(mp); db_printf("\n"); if (db_pager_quit) break; } } else { switch (g_valid_obj((void *)addr)) { case 1: db_show_geom_class((struct g_class *)addr); break; case 2: db_show_geom_geom(0, (struct g_geom *)addr); break; case 3: db_show_geom_consumer(0, (struct g_consumer *)addr); break; case 4: db_show_geom_provider(0, (struct g_provider *)addr); break; default: db_printf("Not a GEOM object.\n"); break; } } } static void db_print_bio_cmd(struct bio *bp) { db_printf(" cmd: "); switch (bp->bio_cmd) { case BIO_READ: db_printf("BIO_READ"); break; case BIO_WRITE: db_printf("BIO_WRITE"); break; case BIO_DELETE: db_printf("BIO_DELETE"); break; case BIO_GETATTR: db_printf("BIO_GETATTR"); break; case BIO_FLUSH: db_printf("BIO_FLUSH"); break; case BIO_CMD0: db_printf("BIO_CMD0"); break; case BIO_CMD1: db_printf("BIO_CMD1"); break; case BIO_CMD2: db_printf("BIO_CMD2"); break; case BIO_ZONE: db_printf("BIO_ZONE"); break; default: db_printf("UNKNOWN"); break; } db_printf("\n"); } static void db_print_bio_flags(struct bio *bp) { int comma; comma = 0; db_printf(" flags: "); if (bp->bio_flags & BIO_ERROR) { db_printf("BIO_ERROR"); comma = 1; } if (bp->bio_flags & BIO_DONE) { db_printf("%sBIO_DONE", (comma ? ", " : "")); comma = 1; } if (bp->bio_flags & BIO_ONQUEUE) db_printf("%sBIO_ONQUEUE", (comma ? ", " : "")); db_printf("\n"); } /* * Print useful information in a BIO */ DB_SHOW_COMMAND(bio, db_show_bio) { struct bio *bp; if (have_addr) { bp = (struct bio *)addr; db_printf("BIO %p\n", bp); db_print_bio_cmd(bp); db_print_bio_flags(bp); db_printf(" cflags: 0x%hx\n", bp->bio_cflags); db_printf(" pflags: 0x%hx\n", bp->bio_pflags); db_printf(" offset: %jd\n", (intmax_t)bp->bio_offset); db_printf(" length: %jd\n", (intmax_t)bp->bio_length); db_printf(" bcount: %ld\n", bp->bio_bcount); db_printf(" resid: %ld\n", bp->bio_resid); db_printf(" completed: %jd\n", (intmax_t)bp->bio_completed); db_printf(" children: %u\n", bp->bio_children); db_printf(" inbed: %u\n", bp->bio_inbed); db_printf(" error: %d\n", bp->bio_error); db_printf(" parent: %p\n", bp->bio_parent); db_printf(" driver1: %p\n", bp->bio_driver1); db_printf(" driver2: %p\n", bp->bio_driver2); db_printf(" caller1: %p\n", bp->bio_caller1); db_printf(" caller2: %p\n", bp->bio_caller2); db_printf(" bio_from: %p\n", bp->bio_from); db_printf(" bio_to: %p\n", bp->bio_to); #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) db_printf(" bio_track_bp: %p\n", bp->bio_track_bp); #endif } } #undef gprintf #undef gprintln #undef ADDFLAG #endif /* DDB */ Index: head/sys/geom/geom_sunlabel.c =================================================================== --- head/sys/geom/geom_sunlabel.c (revision 326269) +++ head/sys/geom/geom_sunlabel.c (revision 326270) @@ -1,334 +1,336 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. 
* * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_sunlabel, "GEOM Sun/Solaris partitioning support"); #define SUNLABEL_CLASS_NAME "SUN" struct g_sunlabel_softc { int sectorsize; int nheads; int nsects; int nalt; u_char labelsum[16]; }; static int g_sunlabel_once = 0; static int g_sunlabel_modify(struct g_geom *gp, struct g_sunlabel_softc *ms, u_char *sec0) { int i, error; u_int u, v, csize; struct sun_disklabel sl; MD5_CTX md5sum; error = sunlabel_dec(sec0, &sl); if (error) return (error); csize = sl.sl_ntracks * sl.sl_nsectors; for (i = 0; i < SUN_NPART; i++) { v = sl.sl_part[i].sdkp_cyloffset; u = sl.sl_part[i].sdkp_nsectors; error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK, ((off_t)v * csize) << 9ULL, ((off_t)u) << 9ULL, ms->sectorsize, "%s%c", gp->name, 'a' + i); if (error) return (error); } for (i = 0; i < SUN_NPART; i++) { v = sl.sl_part[i].sdkp_cyloffset; u = sl.sl_part[i].sdkp_nsectors; g_slice_config(gp, i, G_SLICE_CONFIG_SET, ((off_t)v * csize) << 9ULL, ((off_t)u) << 9ULL, ms->sectorsize, "%s%c", gp->name, 'a' + i); } ms->nalt = sl.sl_acylinders; ms->nheads = sl.sl_ntracks; ms->nsects = sl.sl_nsectors; /* * Calculate MD5 from the first sector and use it for avoiding * recursive labels creation. 
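 *
 * The taste method below checks this sum against the "SUN::labelsum"
 * attribute of the provider being probed, along the lines of:
 *
 *	error = g_getattr("SUN::labelsum", cp, &hash);
 *	if (!error && !bcmp(ms->labelsum, hash, sizeof(hash)))
 *		break;		/* tasting one of our own slices */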
*/ MD5Init(&md5sum); MD5Update(&md5sum, sec0, ms->sectorsize); MD5Final(ms->labelsum, &md5sum); return (0); } static void g_sunlabel_hotwrite(void *arg, int flag) { struct bio *bp; struct g_geom *gp; struct g_slicer *gsp; struct g_slice *gsl; struct g_sunlabel_softc *ms; u_char *p; int error; KASSERT(flag != EV_CANCEL, ("g_sunlabel_hotwrite cancelled")); bp = arg; gp = bp->bio_to->geom; gsp = gp->softc; ms = gsp->softc; gsl = &gsp->slices[bp->bio_to->index]; /* * XXX: For all practical purposes, this would be equivalent to * XXX: "p = (u_char *)bp->bio_data;" because the label is always * XXX: in the first sector and we refuse sectors smaller than the * XXX: label. */ p = (u_char *)bp->bio_data - (bp->bio_offset + gsl->offset); error = g_sunlabel_modify(gp, ms, p); if (error) { g_io_deliver(bp, EPERM); return; } g_slice_finish_hot(bp); } static void g_sunlabel_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct g_slicer *gsp; struct g_sunlabel_softc *ms; gsp = gp->softc; ms = gsp->softc; g_slice_dumpconf(sb, indent, gp, cp, pp); if (indent == NULL) { sbuf_printf(sb, " sc %u hd %u alt %u", ms->nsects, ms->nheads, ms->nalt); } } struct g_hh01 { struct g_geom *gp; struct g_sunlabel_softc *ms; u_char *label; int error; }; static void g_sunlabel_callconfig(void *arg, int flag) { struct g_hh01 *hp; hp = arg; hp->error = g_sunlabel_modify(hp->gp, hp->ms, hp->label); if (!hp->error) hp->error = g_write_data(LIST_FIRST(&hp->gp->consumer), 0, hp->label, SUN_SIZE); } /* * NB! curthread is the user process which issued the GCTL request. */ static void g_sunlabel_config(struct gctl_req *req, struct g_class *mp, const char *verb) { u_char *label; int error, i; struct g_hh01 h0h0; struct g_slicer *gsp; struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = gctl_get_geom(req, mp, "geom"); if (gp == NULL) return; cp = LIST_FIRST(&gp->consumer); gsp = gp->softc; if (!strcmp(verb, "write label")) { label = gctl_get_paraml(req, "label", SUN_SIZE); if (label == NULL) return; h0h0.gp = gp; h0h0.ms = gsp->softc; h0h0.label = label; h0h0.error = -1; /* XXX: Does this reference register with our selfdestruct code ? */ error = g_access(cp, 1, 1, 1); if (error) { gctl_error(req, "could not access consumer"); return; } g_sunlabel_callconfig(&h0h0, 0); g_access(cp, -1, -1, -1); } else if (!strcmp(verb, "write bootcode")) { label = gctl_get_paraml(req, "bootcode", SUN_BOOTSIZE); if (label == NULL) return; /* XXX: Does this reference register with our selfdestruct code ?
*/ error = g_access(cp, 1, 1, 1); if (error) { gctl_error(req, "could not access consumer"); return; } for (i = 0; i < SUN_NPART; i++) { if (gsp->slices[i].length <= SUN_BOOTSIZE) continue; g_write_data(cp, gsp->slices[i].offset + SUN_SIZE, label + SUN_SIZE, SUN_BOOTSIZE - SUN_SIZE); } g_access(cp, -1, -1, -1); } else { gctl_error(req, "Unknown verb parameter"); } } static int g_sunlabel_start(struct bio *bp) { struct g_sunlabel_softc *mp; struct g_slicer *gsp; gsp = bp->bio_to->geom->softc; mp = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr(bp, "SUN::labelsum", mp->labelsum, sizeof(mp->labelsum))) return (1); } return (0); } static struct g_geom * g_sunlabel_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_geom *gp; struct g_consumer *cp; struct g_sunlabel_softc *ms; struct g_slicer *gsp; u_char *buf, hash[16]; MD5_CTX md5sum; int error; g_trace(G_T_TOPOLOGY, "g_sunlabel_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); if (flags == G_TF_NORMAL && !strcmp(pp->geom->class->name, SUNLABEL_CLASS_NAME)) return (NULL); gp = g_slice_new(mp, 8, pp, &cp, &ms, sizeof *ms, g_sunlabel_start); if (gp == NULL) return (NULL); gsp = gp->softc; do { ms->sectorsize = cp->provider->sectorsize; if (ms->sectorsize < 512) break; g_topology_unlock(); buf = g_read_data(cp, 0, ms->sectorsize, NULL); g_topology_lock(); if (buf == NULL) break; /* * Calculate MD5 from the first sector and use it for avoiding * recursive labels creation. */ MD5Init(&md5sum); MD5Update(&md5sum, buf, ms->sectorsize); MD5Final(ms->labelsum, &md5sum); error = g_getattr("SUN::labelsum", cp, &hash); if (!error && !bcmp(ms->labelsum, hash, sizeof(hash))) { g_free(buf); break; } g_sunlabel_modify(gp, ms, buf); g_free(buf); break; } while (0); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } g_slice_conf_hot(gp, 0, 0, SUN_SIZE, G_SLICE_HOT_ALLOW, G_SLICE_HOT_DENY, G_SLICE_HOT_CALL); gsp->hot = g_sunlabel_hotwrite; if (!g_sunlabel_once) { g_sunlabel_once = 1; printf( "WARNING: geom_sunlabel (geom %s) is deprecated, " "use gpart instead.\n", gp->name); } return (gp); } static struct g_class g_sunlabel_class = { .name = SUNLABEL_CLASS_NAME, .version = G_VERSION, .taste = g_sunlabel_taste, .ctlreq = g_sunlabel_config, .dumpconf = g_sunlabel_dumpconf, }; DECLARE_GEOM_CLASS(g_sunlabel_class, g_sunlabel); Index: head/sys/geom/geom_sunlabel_enc.c =================================================================== --- head/sys/geom/geom_sunlabel_enc.c (revision 326269) +++ head/sys/geom/geom_sunlabel_enc.c (revision 326270) @@ -1,182 +1,184 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003 Jake Burkholder * Copyright (c) 2003 Poul-Henning Kamp * Copyright (c) 2004,2005 Joerg Wunsch * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Functions to encode or decode struct sun_disklabel into a bytestream * of correct endianness and packing. * * NB! This file must be usable both in kernel and userland. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #ifdef _KERNEL #include #else #include #endif #define SL_TEXT 0x0 #define SL_TEXT_SIZEOF 0x80 #define SL_VTOC_VERS 0x80 #define SL_VTOC_VOLNAME 0x84 #define SL_VTOC_NPART 0x8c #define SL_VTOC_MAP 0x8e #define SL_VTOC_SANITY 0xbc #define SL_RPM 0x1a4 #define SL_PCYLINDERS 0x1a6 #define SL_SPARESPERCYL 0x1a8 #define SL_INTERLEAVE 0x1ae #define SL_NCYLINDERS 0x1b0 #define SL_ACYLINDERS 0x1b2 #define SL_NTRACKS 0x1b4 #define SL_NSECTORS 0x1b6 #define SL_PART 0x1bc #define SL_MAGIC 0x1fc #define SL_CKSUM 0x1fe #define SDKP_CYLOFFSET 0 #define SDKP_NSECTORS 0x4 #define SDKP_SIZEOF 0x8 /* size of a partition entry */ #define SVTOC_TAG 0 #define SVTOC_FLAG 0x2 #define SVTOC_SIZEOF 0x4 /* size of a VTOC tag/flag entry */ /* * Decode the relevant fields of a sun disk label, and return zero if the * magic and checksum works out OK. */ int sunlabel_dec(void const *pp, struct sun_disklabel *sl) { const uint8_t *p; size_t i; u_int u; uint32_t vtocsane; uint16_t npart; p = pp; for (i = 0; i < sizeof(sl->sl_text); i++) sl->sl_text[i] = p[SL_TEXT + i]; sl->sl_rpm = be16dec(p + SL_RPM); sl->sl_pcylinders = be16dec(p + SL_PCYLINDERS); sl->sl_sparespercyl = be16dec(p + SL_SPARESPERCYL); sl->sl_interleave = be16dec(p + SL_INTERLEAVE); sl->sl_ncylinders = be16dec(p + SL_NCYLINDERS); sl->sl_acylinders = be16dec(p + SL_ACYLINDERS); sl->sl_ntracks = be16dec(p + SL_NTRACKS); sl->sl_nsectors = be16dec(p + SL_NSECTORS); for (i = 0; i < SUN_NPART; i++) { sl->sl_part[i].sdkp_cyloffset = be32dec(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_CYLOFFSET); sl->sl_part[i].sdkp_nsectors = be32dec(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_NSECTORS); } sl->sl_magic = be16dec(p + SL_MAGIC); vtocsane = be32dec(p + SL_VTOC_SANITY); npart = be16dec(p + SL_VTOC_NPART); if (vtocsane == SUN_VTOC_SANE && npart == SUN_NPART) { /* * Seems we've got SVR4-compatible VTOC information * as well, decode it. */ sl->sl_vtoc_sane = vtocsane; sl->sl_vtoc_vers = be32dec(p + SL_VTOC_VERS); memcpy(sl->sl_vtoc_volname, p + SL_VTOC_VOLNAME, SUN_VOLNAME_LEN); sl->sl_vtoc_nparts = SUN_NPART; for (i = 0; i < SUN_NPART; i++) { sl->sl_vtoc_map[i].svtoc_tag = be16dec(p + SL_VTOC_MAP + (i * SVTOC_SIZEOF) + SVTOC_TAG); sl->sl_vtoc_map[i].svtoc_flag = be16dec(p + SL_VTOC_MAP + (i * SVTOC_SIZEOF) + SVTOC_FLAG); } } for (i = u = 0; i < SUN_SIZE; i += 2) u ^= be16dec(p + i); if (u == 0 && sl->sl_magic == SUN_DKMAGIC) return (0); else return (EINVAL); } /* * Encode the relevant fields into a sun disklabel, compute new checksum. 
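 *
 * For reference, sunlabel_dec() above accepts a label only if the XOR
 * of all big-endian 16-bit words across the SUN_SIZE bytes is zero and
 * the magic matches; the SL_CKSUM word written at the end here is what
 * balances that XOR.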
*/ void sunlabel_enc(void *pp, struct sun_disklabel *sl) { uint8_t *p; size_t i; u_int u; p = pp; for (i = 0; i < SL_TEXT_SIZEOF; i++) p[SL_TEXT + i] = sl->sl_text[i]; be16enc(p + SL_RPM, sl->sl_rpm); be16enc(p + SL_PCYLINDERS, sl->sl_pcylinders); be16enc(p + SL_SPARESPERCYL, sl->sl_sparespercyl); be16enc(p + SL_INTERLEAVE, sl->sl_interleave); be16enc(p + SL_NCYLINDERS, sl->sl_ncylinders); be16enc(p + SL_ACYLINDERS, sl->sl_acylinders); be16enc(p + SL_NTRACKS, sl->sl_ntracks); be16enc(p + SL_NSECTORS, sl->sl_nsectors); for (i = 0; i < SUN_NPART; i++) { be32enc(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_CYLOFFSET, sl->sl_part[i].sdkp_cyloffset); be32enc(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_NSECTORS, sl->sl_part[i].sdkp_nsectors); } be16enc(p + SL_MAGIC, sl->sl_magic); if (sl->sl_vtoc_sane == SUN_VTOC_SANE && sl->sl_vtoc_nparts == SUN_NPART) { /* * Write SVR4-compatible VTOC elements. */ be32enc(p + SL_VTOC_VERS, sl->sl_vtoc_vers); be32enc(p + SL_VTOC_SANITY, SUN_VTOC_SANE); memcpy(p + SL_VTOC_VOLNAME, sl->sl_vtoc_volname, SUN_VOLNAME_LEN); be16enc(p + SL_VTOC_NPART, SUN_NPART); for (i = 0; i < SUN_NPART; i++) { be16enc(p + SL_VTOC_MAP + (i * SVTOC_SIZEOF) + SVTOC_TAG, sl->sl_vtoc_map[i].svtoc_tag); be16enc(p + SL_VTOC_MAP + (i * SVTOC_SIZEOF) + SVTOC_FLAG, sl->sl_vtoc_map[i].svtoc_flag); } } for (i = u = 0; i < SUN_SIZE; i += 2) u ^= be16dec(p + i); be16enc(p + SL_CKSUM, u); } Index: head/sys/geom/geom_vfs.c =================================================================== --- head/sys/geom/geom_vfs.c (revision 326269) +++ head/sys/geom/geom_vfs.c (revision 326270) @@ -1,288 +1,290 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include /* * subroutines for use by filesystems. * * XXX: should maybe live somewhere else ? 
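 *
 * Filesystems attach to their backing device through g_vfs_open() at
 * mount time; a sketch of the caller side (mirroring how FFS uses it,
 * with devvp the device vnode and ronly the read-only flag):
 *
 *	g_topology_lock();
 *	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
 *	g_topology_unlock();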
*/ #include struct g_vfs_softc { struct mtx sc_mtx; struct bufobj *sc_bo; int sc_active; int sc_orphaned; }; static struct buf_ops __g_vfs_bufops = { .bop_name = "GEOM_VFS", .bop_write = bufwrite, .bop_strategy = g_vfs_strategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush }; struct buf_ops *g_vfs_bufops = &__g_vfs_bufops; static g_orphan_t g_vfs_orphan; static struct g_class g_vfs_class = { .name = "VFS", .version = G_VERSION, .orphan = g_vfs_orphan, }; DECLARE_GEOM_CLASS(g_vfs_class, g_vfs); static void g_vfs_destroy(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); if (cp->geom->softc == NULL) g_wither_geom(cp->geom, ENXIO); } static void g_vfs_done(struct bio *bip) { struct g_consumer *cp; struct g_vfs_softc *sc; struct buf *bp; int destroy; struct mount *mp; struct vnode *vp; struct cdev *cdevp; /* * Collect statistics on synchronous and asynchronous read * and write counts for disks that have associated filesystems. */ bp = bip->bio_caller2; vp = bp->b_vp; if (vp != NULL) { /* * If not a disk vnode, use its associated mount point * otherwise use the mountpoint associated with the disk. */ VI_LOCK(vp); if (vp->v_type != VCHR || (cdevp = vp->v_rdev) == NULL || cdevp->si_devsw == NULL || (cdevp->si_devsw->d_flags & D_DISK) == 0) mp = vp->v_mount; else mp = cdevp->si_mountpt; if (mp != NULL) { if (bp->b_iocmd == BIO_READ) { if (LK_HOLDER(bp->b_lock.lk_lock) == LK_KERNPROC) mp->mnt_stat.f_asyncreads++; else mp->mnt_stat.f_syncreads++; } else if (bp->b_iocmd == BIO_WRITE) { if (LK_HOLDER(bp->b_lock.lk_lock) == LK_KERNPROC) mp->mnt_stat.f_asyncwrites++; else mp->mnt_stat.f_syncwrites++; } } VI_UNLOCK(vp); } cp = bip->bio_from; sc = cp->geom->softc; if (bip->bio_error) { printf("g_vfs_done():"); g_print_bio(bip); printf("error = %d\n", bip->bio_error); } bp->b_error = bip->bio_error; bp->b_ioflags = bip->bio_flags; if (bip->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bip->bio_completed; g_destroy_bio(bip); mtx_lock(&sc->sc_mtx); destroy = ((--sc->sc_active) == 0 && sc->sc_orphaned); mtx_unlock(&sc->sc_mtx); if (destroy) g_post_event(g_vfs_destroy, cp, M_WAITOK, NULL); bufdone(bp); } void g_vfs_strategy(struct bufobj *bo, struct buf *bp) { struct g_vfs_softc *sc; struct g_consumer *cp; struct bio *bip; cp = bo->bo_private; sc = cp->geom->softc; /* * If the provider has orphaned us, just return ENXIO. */ mtx_lock(&sc->sc_mtx); if (sc->sc_orphaned) { mtx_unlock(&sc->sc_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } sc->sc_active++; mtx_unlock(&sc->sc_mtx); bip = g_alloc_bio(); bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; bip->bio_length = bp->b_bcount; bdata2bio(bp, bip); if ((bp->b_flags & B_BARRIER) != 0) { bip->bio_flags |= BIO_ORDERED; bp->b_flags &= ~B_BARRIER; } bip->bio_done = g_vfs_done; bip->bio_caller2 = bp; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) buf_track(bp, __func__); bip->bio_track_bp = bp; #endif g_io_request(bip, cp); } static void g_vfs_orphan(struct g_consumer *cp) { struct g_geom *gp; struct g_vfs_softc *sc; int destroy; g_topology_assert(); gp = cp->geom; g_trace(G_T_TOPOLOGY, "g_vfs_orphan(%p(%s))", cp, gp->name); sc = gp->softc; if (sc == NULL) return; mtx_lock(&sc->sc_mtx); sc->sc_orphaned = 1; destroy = (sc->sc_active == 0); mtx_unlock(&sc->sc_mtx); if (destroy) g_vfs_destroy(cp, 0); /* * Do not destroy the geom. 
Filesystem will do that during unmount. */ } int g_vfs_open(struct vnode *vp, struct g_consumer **cpp, const char *fsname, int wr) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; struct g_vfs_softc *sc; struct bufobj *bo; int error; g_topology_assert(); *cpp = NULL; bo = &vp->v_bufobj; if (bo->bo_private != vp) return (EBUSY); pp = g_dev_getprovider(vp->v_rdev); if (pp == NULL) return (ENOENT); gp = g_new_geomf(&g_vfs_class, "%s.%s", fsname, pp->name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "g_vfs", NULL, MTX_DEF); sc->sc_bo = bo; gp->softc = sc; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_access(cp, 1, wr, wr); if (error) { g_wither_geom(gp, ENXIO); return (error); } vnode_create_vobject(vp, pp->mediasize, curthread); *cpp = cp; cp->private = vp; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; bo->bo_ops = g_vfs_bufops; bo->bo_private = cp; bo->bo_bsize = pp->sectorsize; return (error); } void g_vfs_close(struct g_consumer *cp) { struct g_geom *gp; struct g_vfs_softc *sc; g_topology_assert(); gp = cp->geom; sc = gp->softc; bufobj_invalbuf(sc->sc_bo, V_SAVE, 0, 0); sc->sc_bo->bo_private = cp->private; gp->softc = NULL; mtx_destroy(&sc->sc_mtx); if (!sc->sc_orphaned || cp->provider == NULL) g_wither_geom_close(gp, ENXIO); g_free(sc); } Index: head/sys/geom/geom_vfs.h =================================================================== --- head/sys/geom/geom_vfs.h (revision 326269) +++ head/sys/geom/geom_vfs.h (revision 326270) @@ -1,42 +1,44 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _GEOM_GEOM_VFS_H_ #define _GEOM_GEOM_VFS_H_ struct vnode; struct bufobj; struct buf; extern struct buf_ops *g_vfs_bufops; void g_vfs_strategy(struct bufobj *bo, struct buf *bp); int g_vfs_open(struct vnode *vp, struct g_consumer **cpp, const char *fsname, int wr); void g_vfs_close(struct g_consumer *cp); #endif /* _GEOM_GEOM_VFS_H_ */ Index: head/sys/geom/geom_vol_ffs.c =================================================================== --- head/sys/geom/geom_vol_ffs.c (revision 326269) +++ head/sys/geom/geom_vol_ffs.c (revision 326270) @@ -1,164 +1,166 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002, 2003 Gordon Tetlow * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_vol, "GEOM support for volume names from UFS superblock"); #define VOL_FFS_CLASS_NAME "VOL_FFS" static int superblocks[] = SBLOCKSEARCH; static int g_vol_ffs_once; struct g_vol_ffs_softc { char * vol; }; static int g_vol_ffs_start(struct bio *bp __unused) { return(0); } static struct g_geom * g_vol_ffs_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_geom *gp; struct g_consumer *cp; struct g_vol_ffs_softc *ms; int sb, superblock; struct fs *fs; g_trace(G_T_TOPOLOGY, "vol_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); /* * XXX This is a really weak way to make sure we don't recurse. * Probably ought to use BIO_GETATTR to check for this. */ if (flags == G_TF_NORMAL && !strcmp(pp->geom->class->name, VOL_FFS_CLASS_NAME)) return (NULL); gp = g_slice_new(mp, 1, pp, &cp, &ms, sizeof(*ms), g_vol_ffs_start); if (gp == NULL) return (NULL); g_topology_unlock(); /* * Walk through the standard places that superblocks hide and look * for UFS magic. If we find magic, then check that the size in the * superblock corresponds to the size of the underlying provider. * Finally, look for a volume label and create an appropriate * provider based on that. */ for (sb=0; (superblock = superblocks[sb]) != -1; sb++) { /* * Take care not to issue an invalid I/O request. 
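 * (Under INVARIANTS, the I/O path asserts that a read's offset and
 * length are multiples of the sector size, so a careless probe here
 * would panic rather than merely fail.)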
The * offset and size of the superblock candidate must be * multiples of the provider's sector size, otherwise an * FFS can't exist on the provider anyway. */ if (superblock % cp->provider->sectorsize != 0 || SBLOCKSIZE % cp->provider->sectorsize != 0) continue; fs = (struct fs *) g_read_data(cp, superblock, SBLOCKSIZE, NULL); if (fs == NULL) continue; /* Check for magic and make sure things are the right size */ if (fs->fs_magic == FS_UFS1_MAGIC) { if (fs->fs_old_size * fs->fs_fsize != (int32_t) pp->mediasize) { g_free(fs); continue; } } else if (fs->fs_magic == FS_UFS2_MAGIC) { if (fs->fs_size * fs->fs_fsize != (int64_t) pp->mediasize) { g_free(fs); continue; } } else { g_free(fs); continue; } /* Check for volume label */ if (fs->fs_volname[0] == '\0') { g_free(fs); continue; } /* XXX We need to check for namespace conflicts. */ /* XXX How do you handle a mirror set? */ /* XXX We don't validate the volume name. */ g_topology_lock(); /* Alright, we have a label and a volume name, reconfig. */ g_slice_config(gp, 0, G_SLICE_CONFIG_SET, (off_t) 0, pp->mediasize, pp->sectorsize, "vol/%s", fs->fs_volname); g_free(fs); g_topology_unlock(); break; } g_topology_lock(); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { g_slice_spoiled(cp); return (NULL); } if (!g_vol_ffs_once) { g_vol_ffs_once = 1; printf( "WARNING: geom_vol_ffs (geom %s) is deprecated, " "use glabel instead.\n", gp->name); } return (gp); } static struct g_class g_vol_ffs_class = { .name = VOL_FFS_CLASS_NAME, .version = G_VERSION, .taste = g_vol_ffs_taste, }; DECLARE_GEOM_CLASS(g_vol_ffs_class, g_vol_ffs); Index: head/sys/geom/journal/g_journal.c =================================================================== --- head/sys/geom/journal/g_journal.c (revision 326269) +++ head/sys/geom/journal/g_journal.c (revision 326270) @@ -1,3011 +1,3013 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef GJ_MEMDEBUG #include #include #endif #include #include #include #include FEATURE(geom_journal, "GEOM journaling support"); /* * On-disk journal format: * * JH - Journal header * RH - Record header * * %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ... * %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% * */ CTASSERT(sizeof(struct g_journal_header) <= 512); CTASSERT(sizeof(struct g_journal_record_header) <= 512); static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data"); static struct mtx g_journal_cache_mtx; MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF); const struct g_journal_desc *g_journal_filesystems[] = { &g_journal_ufs, NULL }; SYSCTL_DECL(_kern_geom); int g_journal_debug = 0; static u_int g_journal_switch_time = 10; static u_int g_journal_force_switch = 70; static u_int g_journal_parallel_flushes = 16; static u_int g_journal_parallel_copies = 16; static u_int g_journal_accept_immediately = 64; static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES; static u_int g_journal_do_optimize = 1; static SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0, "GEOM_JOURNAL stuff"); SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RWTUN, &g_journal_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW, &g_journal_switch_time, 0, "Switch journals every N seconds"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW, &g_journal_force_switch, 0, "Force switch when journal is N% full"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW, &g_journal_parallel_flushes, 0, "Number of flush I/O requests to send in parallel"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW, &g_journal_accept_immediately, 0, "Number of I/O requests accepted immediately"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW, &g_journal_parallel_copies, 0, "Number of copy I/O requests to send in parallel"); static int g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS) { u_int entries; int error; entries = g_journal_record_entries; error = sysctl_handle_int(oidp, &entries, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES) return (EINVAL); g_journal_record_entries = entries; return (0); } SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries, CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I", "Maximum number of entries in one journal record"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW, &g_journal_do_optimize, 0, "Try to combine bios on flush and copy"); static u_long g_journal_cache_used = 0; static u_long g_journal_cache_limit = 64 * 1024 * 1024; static u_int g_journal_cache_divisor = 2; static u_int g_journal_cache_switch = 90; static u_int g_journal_cache_misses = 0; static u_int g_journal_cache_alloc_failures = 0; static u_long g_journal_cache_low = 0; static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0, "GEOM_JOURNAL cache"); SYSCTL_ULONG(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD, &g_journal_cache_used, 0, "Number of allocated bytes"); static int g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS) { u_long limit; int error;
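/* For example, with the default 64 MB limit and the default 90% cache switch threshold, the recomputation below yields a low watermark of (67108864 / 100) * 90 = 60397920 bytes (about 57.6 MB); gj_malloc() wakes the switcher once cache usage crosses that value. */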
limit = g_journal_cache_limit; error = sysctl_handle_long(oidp, &limit, 0, req); if (error != 0 || req->newptr == NULL) return (error); g_journal_cache_limit = limit; g_journal_cache_low = (limit / 100) * g_journal_cache_switch; return (0); } SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit, CTLTYPE_ULONG | CTLFLAG_RWTUN, NULL, 0, g_journal_cache_limit_sysctl, "I", "Maximum number of allocated bytes"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN, &g_journal_cache_divisor, 0, "(kmem_size / kern.geom.journal.cache.divisor) == cache size"); static int g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS) { u_int cswitch; int error; cswitch = g_journal_cache_switch; error = sysctl_handle_int(oidp, &cswitch, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (cswitch > 100) return (EINVAL); g_journal_cache_switch = cswitch; g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch; return (0); } SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch, CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I", "Force switch when we hit this percent of cache use"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW, &g_journal_cache_misses, 0, "Number of cache misses"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW, &g_journal_cache_alloc_failures, 0, "Memory allocation failures"); static u_long g_journal_stats_bytes_skipped = 0; static u_long g_journal_stats_combined_ios = 0; static u_long g_journal_stats_switches = 0; static u_long g_journal_stats_wait_for_copy = 0; static u_long g_journal_stats_journal_full = 0; static u_long g_journal_stats_low_mem = 0; static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0, "GEOM_JOURNAL statistics"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW, &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW, &g_journal_stats_combined_ios, 0, "Number of combined I/O requests"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW, &g_journal_stats_switches, 0, "Number of journal switches"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW, &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW, &g_journal_stats_journal_full, 0, "Number of times journal was almost full."); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW, &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called."); static g_taste_t g_journal_taste; static g_ctl_req_t g_journal_config; static g_dumpconf_t g_journal_dumpconf; static g_init_t g_journal_init; static g_fini_t g_journal_fini; struct g_class g_journal_class = { .name = G_JOURNAL_CLASS_NAME, .version = G_VERSION, .taste = g_journal_taste, .ctlreq = g_journal_config, .dumpconf = g_journal_dumpconf, .init = g_journal_init, .fini = g_journal_fini }; static int g_journal_destroy(struct g_journal_softc *sc); static void g_journal_metadata_update(struct g_journal_softc *sc); static void g_journal_start_switcher(struct g_class *mp); static void g_journal_stop_switcher(void); static void g_journal_switch_wait(struct g_journal_softc *sc); #define GJ_SWITCHER_WORKING 0 #define GJ_SWITCHER_DIE 1 #define GJ_SWITCHER_DIED 2 static struct proc *g_journal_switcher_proc = NULL; static int g_journal_switcher_state = GJ_SWITCHER_WORKING; static int 
g_journal_switcher_wokenup = 0; static int g_journal_sync_requested = 0; #ifdef GJ_MEMDEBUG struct meminfo { size_t mi_size; struct stack mi_stack; }; #endif /* * We use our own malloc/realloc/free functions, so we can collect statistics * and force journal switch when we're running out of cache. */ static void * gj_malloc(size_t size, int flags) { void *p; #ifdef GJ_MEMDEBUG struct meminfo *mi; #endif mtx_lock(&g_journal_cache_mtx); if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup && g_journal_cache_used + size > g_journal_cache_low) { GJ_DEBUG(1, "No cache, waking up the switcher."); g_journal_switcher_wokenup = 1; wakeup(&g_journal_switcher_state); } if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 && g_journal_cache_used + size > g_journal_cache_limit) { mtx_unlock(&g_journal_cache_mtx); g_journal_cache_alloc_failures++; return (NULL); } g_journal_cache_used += size; mtx_unlock(&g_journal_cache_mtx); flags &= ~M_NOWAIT; #ifndef GJ_MEMDEBUG p = malloc(size, M_JOURNAL, flags | M_WAITOK); #else mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK); p = (u_char *)mi + sizeof(*mi); mi->mi_size = size; stack_save(&mi->mi_stack); #endif return (p); } static void gj_free(void *p, size_t size) { #ifdef GJ_MEMDEBUG struct meminfo *mi; #endif KASSERT(p != NULL, ("p=NULL")); KASSERT(size > 0, ("size=0")); mtx_lock(&g_journal_cache_mtx); KASSERT(g_journal_cache_used >= size, ("Freeing too much?")); g_journal_cache_used -= size; mtx_unlock(&g_journal_cache_mtx); #ifdef GJ_MEMDEBUG mi = p = (void *)((u_char *)p - sizeof(*mi)); if (mi->mi_size != size) { printf("GJOURNAL: Size mismatch! %zu != %zu\n", size, mi->mi_size); printf("GJOURNAL: Alloc backtrace:\n"); stack_print(&mi->mi_stack); printf("GJOURNAL: Free backtrace:\n"); kdb_backtrace(); } #endif free(p, M_JOURNAL); } static void * gj_realloc(void *p, size_t size, size_t oldsize) { void *np; #ifndef GJ_MEMDEBUG mtx_lock(&g_journal_cache_mtx); g_journal_cache_used -= oldsize; g_journal_cache_used += size; mtx_unlock(&g_journal_cache_mtx); np = realloc(p, size, M_JOURNAL, M_WAITOK); #else np = gj_malloc(size, M_WAITOK); bcopy(p, np, MIN(oldsize, size)); gj_free(p, oldsize); #endif return (np); } static void g_journal_check_overflow(struct g_journal_softc *sc) { off_t length, used; if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset && sc->sc_journal_offset >= sc->sc_inactive.jj_offset) || (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset && sc->sc_journal_offset >= sc->sc_inactive.jj_offset && sc->sc_journal_offset < sc->sc_active.jj_offset)) { panic("Journal overflow " "(id = %u joffset=%jd active=%jd inactive=%jd)", (unsigned)sc->sc_id, (intmax_t)sc->sc_journal_offset, (intmax_t)sc->sc_active.jj_offset, (intmax_t)sc->sc_inactive.jj_offset); } if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) { length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset; used = sc->sc_journal_offset - sc->sc_active.jj_offset; } else { length = sc->sc_jend - sc->sc_active.jj_offset; length += sc->sc_inactive.jj_offset - sc->sc_jstart; if (sc->sc_journal_offset >= sc->sc_active.jj_offset) used = sc->sc_journal_offset - sc->sc_active.jj_offset; else { used = sc->sc_jend - sc->sc_active.jj_offset; used += sc->sc_journal_offset - sc->sc_jstart; } } /* Already woken up? */ if (g_journal_switcher_wokenup) return; /* * If the active journal takes more than g_journal_force_switch percent * of free journal space, we force journal switch.
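For example, with the default threshold of 70, a 1000 MB journal with 750 MB in use gives (750 * 100) / 1000 = 75 > 70 and wakes the switcher; note the integer division, so usage must strictly exceed the threshold.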
*/ KASSERT(length > 0, ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd", (intmax_t)length, (intmax_t)used, (intmax_t)sc->sc_active.jj_offset, (intmax_t)sc->sc_inactive.jj_offset, (intmax_t)sc->sc_journal_offset)); if ((used * 100) / length > g_journal_force_switch) { g_journal_stats_journal_full++; GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.", sc->sc_name, (used * 100) / length); mtx_lock(&g_journal_cache_mtx); g_journal_switcher_wokenup = 1; wakeup(&g_journal_switcher_state); mtx_unlock(&g_journal_cache_mtx); } } static void g_journal_orphan(struct g_consumer *cp) { struct g_journal_softc *sc; char name[256]; int error; g_topology_assert(); sc = cp->geom->softc; strlcpy(name, cp->provider->name, sizeof(name)); GJ_DEBUG(0, "Lost provider %s.", name); if (sc == NULL) return; error = g_journal_destroy(sc); if (error == 0) GJ_DEBUG(0, "Journal %s destroyed.", name); else { GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). " "Destroy it manually after last close.", sc->sc_name, error); } } static int g_journal_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_journal_softc *sc; int dcr, dcw, dce; g_topology_assert(); GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; sc = pp->geom->softc; if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); else return (ENXIO); } if (pp->acw == 0 && dcw > 0) { GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name); sc->sc_flags &= ~GJF_DEVICE_CLEAN; g_topology_unlock(); g_journal_metadata_update(sc); g_topology_lock(); } /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) { GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); sc->sc_flags |= GJF_DEVICE_CLEAN; g_topology_unlock(); g_journal_metadata_update(sc); g_topology_lock(); } */ return (0); } static void g_journal_header_encode(struct g_journal_header *hdr, u_char *data) { bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC)); data += sizeof(GJ_HEADER_MAGIC); le32enc(data, hdr->jh_journal_id); data += 4; le32enc(data, hdr->jh_journal_next_id); } static int g_journal_header_decode(const u_char *data, struct g_journal_header *hdr) { bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic)); data += sizeof(hdr->jh_magic); if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0) return (EINVAL); hdr->jh_journal_id = le32dec(data); data += 4; hdr->jh_journal_next_id = le32dec(data); return (0); } static void g_journal_flush_cache(struct g_journal_softc *sc) { struct bintime bt; int error; if (sc->sc_bio_flush == 0) return; GJ_TIMER_START(1, &bt); if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) { error = g_io_flush(sc->sc_jconsumer); GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.", sc->sc_jconsumer->provider->name, error); } if (sc->sc_bio_flush & GJ_FLUSH_DATA) { /* * TODO: This could be called in parallel with the * previous call. */ error = g_io_flush(sc->sc_dconsumer); GJ_DEBUG(error == 0 ? 
2 : 0, "Flush cache of %s: error=%d.", sc->sc_dconsumer->provider->name, error); } GJ_TIMER_STOP(1, &bt, "Cache flush time"); } static int g_journal_write_header(struct g_journal_softc *sc) { struct g_journal_header hdr; struct g_consumer *cp; u_char *buf; int error; cp = sc->sc_jconsumer; buf = gj_malloc(cp->provider->sectorsize, M_WAITOK); strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic)); hdr.jh_journal_id = sc->sc_journal_id; hdr.jh_journal_next_id = sc->sc_journal_next_id; g_journal_header_encode(&hdr, buf); error = g_write_data(cp, sc->sc_journal_offset, buf, cp->provider->sectorsize); /* if (error == 0) */ sc->sc_journal_offset += cp->provider->sectorsize; gj_free(buf, cp->provider->sectorsize); return (error); } /* * Every journal record has a header and data following it. * Functions below are used to decode the header before storing it to * little endian and to encode it after reading to system endianness. */ static void g_journal_record_header_encode(struct g_journal_record_header *hdr, u_char *data) { struct g_journal_entry *ent; u_int i; bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC)); data += sizeof(GJ_RECORD_HEADER_MAGIC); le32enc(data, hdr->jrh_journal_id); data += 8; le16enc(data, hdr->jrh_nentries); data += 2; bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum)); data += 8; for (i = 0; i < hdr->jrh_nentries; i++) { ent = &hdr->jrh_entries[i]; le64enc(data, ent->je_joffset); data += 8; le64enc(data, ent->je_offset); data += 8; le64enc(data, ent->je_length); data += 8; } } static int g_journal_record_header_decode(const u_char *data, struct g_journal_record_header *hdr) { struct g_journal_entry *ent; u_int i; bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic)); data += sizeof(hdr->jrh_magic); if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0) return (EINVAL); hdr->jrh_journal_id = le32dec(data); data += 8; hdr->jrh_nentries = le16dec(data); data += 2; if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES) return (EINVAL); bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum)); data += 8; for (i = 0; i < hdr->jrh_nentries; i++) { ent = &hdr->jrh_entries[i]; ent->je_joffset = le64dec(data); data += 8; ent->je_offset = le64dec(data); data += 8; ent->je_length = le64dec(data); data += 8; } return (0); } /* * Function reads metadata from a provider (via the given consumer), decodes * it to system endianness and verifies its correctness. */ static int g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata is stored in last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = journal_metadata_decode(buf, md); g_free(buf); /* Is this is gjournal provider at all? */ if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0) return (EINVAL); /* * Are we able to handle this version of metadata? * We only maintain backward compatibility. */ if (md->md_version > G_JOURNAL_VERSION) { GJ_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } /* Is checksum correct? 
*/ if (error != 0) { GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } /* * Two functions below are responsible for updating metadata. * Only metadata on the data provider is updated (we need to update * information about active journal in there). */ static void g_journal_metadata_done(struct bio *bp) { /* * There is not much we can do on error except informing about it. */ if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).", bp->bio_error); } else { GJ_LOGREQ(2, bp, "Metadata updated."); } gj_free(bp->bio_data, bp->bio_length); g_destroy_bio(bp); } static void g_journal_metadata_update(struct g_journal_softc *sc) { struct g_journal_metadata md; struct g_consumer *cp; struct bio *bp; u_char *sector; cp = sc->sc_dconsumer; sector = gj_malloc(cp->provider->sectorsize, M_WAITOK); strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic)); md.md_version = G_JOURNAL_VERSION; md.md_id = sc->sc_id; md.md_type = sc->sc_orig_type; md.md_jstart = sc->sc_jstart; md.md_jend = sc->sc_jend; md.md_joffset = sc->sc_inactive.jj_offset; md.md_jid = sc->sc_journal_previous_id; md.md_flags = 0; if (sc->sc_flags & GJF_DEVICE_CLEAN) md.md_flags |= GJ_FLAG_CLEAN; if (sc->sc_flags & GJF_DEVICE_HARDCODED) strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider)); else bzero(md.md_provider, sizeof(md.md_provider)); md.md_provsize = cp->provider->mediasize; journal_metadata_encode(&md, sector); /* * Flush the cache, so we know all data are on disk. * We write here informations like "journal is consistent", so we need * to be sure it is. Without BIO_FLUSH here, we can end up in situation * where metadata is stored on disk, but not all data. */ g_journal_flush_cache(sc); bp = g_alloc_bio(); bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize; bp->bio_length = cp->provider->sectorsize; bp->bio_data = sector; bp->bio_cmd = BIO_WRITE; if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) { bp->bio_done = g_journal_metadata_done; g_io_request(bp, cp); } else { bp->bio_done = NULL; g_io_request(bp, cp); biowait(bp, "gjmdu"); g_journal_metadata_done(bp); } /* * Be sure metadata reached the disk. */ g_journal_flush_cache(sc); } /* * This is where the I/O request comes from the GEOM. 
*/ static void g_journal_start(struct bio *bp) { struct g_journal_softc *sc; sc = bp->bio_to->geom->softc; GJ_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: mtx_lock(&sc->sc_mtx); bioq_insert_tail(&sc->sc_regular_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_mtx); return; case BIO_GETATTR: if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) { strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length); bp->bio_completed = strlen(bp->bio_to->name) + 1; g_io_deliver(bp, 0); return; } /* FALLTHROUGH */ case BIO_DELETE: default: g_io_deliver(bp, EOPNOTSUPP); return; } } static void g_journal_std_done(struct bio *bp) { struct g_journal_softc *sc; sc = bp->bio_from->geom->softc; mtx_lock(&sc->sc_mtx); bioq_insert_tail(&sc->sc_back_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_mtx); } static struct bio * g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data, int flags) { struct bio *bp; bp = g_alloc_bio(); bp->bio_offset = start; bp->bio_joffset = joffset; bp->bio_length = end - start; bp->bio_cmd = BIO_WRITE; bp->bio_done = g_journal_std_done; if (data == NULL) bp->bio_data = NULL; else { bp->bio_data = gj_malloc(bp->bio_length, flags); if (bp->bio_data != NULL) bcopy(data, bp->bio_data, bp->bio_length); } return (bp); } #define g_journal_insert_bio(head, bp, flags) \ g_journal_insert((head), (bp)->bio_offset, \ (bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \ (bp)->bio_data, flags) /* * The function below does a lot more than just inserting a bio into the queue. * It keeps the queue sorted by offset and ensures that there is no duplicated * data (it combines bios where ranges overlap). * * The function returns the number of bios inserted (as a bio can be split). */ static int g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset, u_char *data, int flags) { struct bio *nbp, *cbp, *pbp; off_t cstart, cend; u_char *tmpdata; int n; GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend, joffset); n = 0; pbp = NULL; GJQ_FOREACH(*head, cbp) { cstart = cbp->bio_offset; cend = cbp->bio_offset + cbp->bio_length; if (nstart >= cend) { /* * +-------------+ * | | * | current | +-------------+ * | bio | | | * | | | new | * +-------------+ | bio | * | | * +-------------+ */ GJ_DEBUG(3, "INSERT(%p): 1", *head); } else if (nend <= cstart) { /* * +-------------+ * | | * +-------------+ | current | * | | | bio | * | new | | | * | bio | +-------------+ * | | * +-------------+ */ nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = cbp; n++; GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp, pbp); goto end; } else if (nstart <= cstart && nend >= cend) { /* * +-------------+ +-------------+ * | current bio | | current bio | * +---+-------------+---+ +-------------+---+ * | | | | | | | * | | | | | | | * | +-------------+ | +-------------+ | * | new bio | | new bio | * +---------------------+ +-----------------+ * * +-------------+ +-------------+ * | current bio | | current bio | * +---+-------------+ +-------------+ * | | | | | * | | | | | * | +-------------+ +-------------+ * | new bio | | new bio | * +-----------------+ +-------------+ */ g_journal_stats_bytes_skipped += cbp->bio_length; cbp->bio_offset = nstart; cbp->bio_joffset = joffset; cbp->bio_length = cend - nstart; if (cbp->bio_data != NULL) { gj_free(cbp->bio_data, cend - cstart); cbp->bio_data = NULL; } if (data != NULL) { cbp->bio_data = gj_malloc(cbp->bio_length, flags); if
(cbp->bio_data != NULL) { bcopy(data, cbp->bio_data, cbp->bio_length); } data += cend - nstart; } joffset += cend - nstart; nstart = cend; GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp); } else if (nstart > cstart && nend >= cend) { /* * +-----------------+ +-------------+ * | current bio | | current bio | * | +-------------+ | +---------+---+ * | | | | | | | * | | | | | | | * +---+-------------+ +---+---------+ | * | new bio | | new bio | * +-------------+ +-------------+ */ g_journal_stats_bytes_skipped += cend - nstart; nbp = g_journal_new_bio(nstart, cend, joffset, data, flags); nbp->bio_next = cbp->bio_next; cbp->bio_next = nbp; cbp->bio_length = nstart - cstart; if (cbp->bio_data != NULL) { cbp->bio_data = gj_realloc(cbp->bio_data, cbp->bio_length, cend - cstart); } if (data != NULL) data += cend - nstart; joffset += cend - nstart; nstart = cend; n++; GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp); } else if (nstart > cstart && nend < cend) { /* * +---------------------+ * | current bio | * | +-------------+ | * | | | | * | | | | * +---+-------------+---+ * | new bio | * +-------------+ */ g_journal_stats_bytes_skipped += nend - nstart; nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); nbp->bio_next = cbp->bio_next; cbp->bio_next = nbp; if (cbp->bio_data == NULL) tmpdata = NULL; else tmpdata = cbp->bio_data + nend - cstart; nbp = g_journal_new_bio(nend, cend, cbp->bio_joffset + nend - cstart, tmpdata, flags); nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next; ((struct bio *)cbp->bio_next)->bio_next = nbp; cbp->bio_length = nstart - cstart; if (cbp->bio_data != NULL) { cbp->bio_data = gj_realloc(cbp->bio_data, cbp->bio_length, cend - cstart); } n += 2; GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp); goto end; } else if (nstart <= cstart && nend < cend) { /* * +-----------------+ +-------------+ * | current bio | | current bio | * +-------------+ | +---+---------+ | * | | | | | | | * | | | | | | | * +-------------+---+ | +---------+---+ * | new bio | | new bio | * +-------------+ +-------------+ */ g_journal_stats_bytes_skipped += nend - nstart; nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = cbp; cbp->bio_offset = nend; cbp->bio_length = cend - nend; cbp->bio_joffset += nend - cstart; tmpdata = cbp->bio_data; if (tmpdata != NULL) { cbp->bio_data = gj_malloc(cbp->bio_length, flags); if (cbp->bio_data != NULL) { bcopy(tmpdata + nend - cstart, cbp->bio_data, cbp->bio_length); } gj_free(tmpdata, cend - cstart); } n++; GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp); goto end; } if (nstart == nend) goto end; pbp = cbp; } nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = NULL; n++; GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp); end: if (g_journal_debug >= 3) { GJQ_FOREACH(*head, cbp) { GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp, (intmax_t)cbp->bio_offset, (intmax_t)cbp->bio_length, (intmax_t)cbp->bio_joffset, cbp->bio_data); } GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n); } return (n); } /* * The function combines neighbour bios trying to squeeze as much data as * possible into one bio. * * The function returns the number of bios combined (negative value). */ static int g_journal_optimize(struct bio *head) { struct bio *cbp, *pbp; int n; n = 0; pbp = NULL; GJQ_FOREACH(head, cbp) { /* Skip bios which has to be read first. 
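Such bios have bio_data == NULL: their payload was freed under memory pressure and g_journal_read_first() must fetch it back from the journal before it can be combined or sent anywhere.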
*/ if (cbp->bio_data == NULL) { pbp = NULL; continue; } /* There is no previous bio yet. */ if (pbp == NULL) { pbp = cbp; continue; } /* Is this a neighbour bio? */ if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) { /* Be sure that bios queue is sorted. */ KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset, ("poffset=%jd plength=%jd coffset=%jd", (intmax_t)pbp->bio_offset, (intmax_t)pbp->bio_length, (intmax_t)cbp->bio_offset)); pbp = cbp; continue; } /* Be sure we don't end up with too big a bio. */ if (pbp->bio_length + cbp->bio_length > MAXPHYS) { pbp = cbp; continue; } /* Ok, we can join bios. */ GJ_LOGREQ(4, pbp, "Join: "); GJ_LOGREQ(4, cbp, "and: "); pbp->bio_data = gj_realloc(pbp->bio_data, pbp->bio_length + cbp->bio_length, pbp->bio_length); bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length, cbp->bio_length); gj_free(cbp->bio_data, cbp->bio_length); pbp->bio_length += cbp->bio_length; pbp->bio_next = cbp->bio_next; g_destroy_bio(cbp); cbp = pbp; g_journal_stats_combined_ios++; n--; GJ_LOGREQ(4, pbp, "Got: "); } return (n); } /* * TODO: Update comment. * These are functions responsible for copying one portion of data from journal * to the destination provider. * The order goes like this: * 1. Read the header, which contains information about the data blocks * following it. * 2. Read the data blocks from the journal. * 3. Write the data blocks on the data provider. * * g_journal_copy_start() * g_journal_copy_done() - got finished write request, logs potential errors. */ /* * When there is no data in cache, this function is used to read it. */ static void g_journal_read_first(struct g_journal_softc *sc, struct bio *bp) { struct bio *cbp; /* * We were short in memory, so data was freed. * In that case we need to read it back from journal. */ cbp = g_alloc_bio(); cbp->bio_cflags = bp->bio_cflags; cbp->bio_parent = bp; cbp->bio_offset = bp->bio_joffset; cbp->bio_length = bp->bio_length; cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK); cbp->bio_cmd = BIO_READ; cbp->bio_done = g_journal_std_done; GJ_LOGREQ(4, cbp, "READ FIRST"); g_io_request(cbp, sc->sc_jconsumer); g_journal_cache_misses++; } static void g_journal_copy_send(struct g_journal_softc *sc) { struct bio *bioq, *bp, *lbp; bioq = lbp = NULL; mtx_lock(&sc->sc_mtx); for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) { bp = GJQ_FIRST(sc->sc_inactive.jj_queue); if (bp == NULL) break; GJQ_REMOVE(sc->sc_inactive.jj_queue, bp); sc->sc_copy_in_progress++; GJQ_INSERT_AFTER(bioq, bp, lbp); lbp = bp; } mtx_unlock(&sc->sc_mtx); if (g_journal_do_optimize) sc->sc_copy_in_progress += g_journal_optimize(bioq); while ((bp = GJQ_FIRST(bioq)) != NULL) { GJQ_REMOVE(bioq, bp); GJQ_INSERT_HEAD(sc->sc_copy_queue, bp); bp->bio_cflags = GJ_BIO_COPY; if (bp->bio_data == NULL) g_journal_read_first(sc, bp); else { bp->bio_joffset = 0; GJ_LOGREQ(4, bp, "SEND"); g_io_request(bp, sc->sc_dconsumer); } } } static void g_journal_copy_start(struct g_journal_softc *sc) { /* * Remember in metadata that we're starting to copy journaled data * to the data provider. * In case of power failure, we will copy this data once again on boot. */ if (!sc->sc_journal_copying) { sc->sc_journal_copying = 1; GJ_DEBUG(1, "Starting copy of journal."); g_journal_metadata_update(sc); } g_journal_copy_send(sc); } /* * Data block has been read from the journal provider.
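On success the payload is attached to the parent bio, which is then sent on to the data provider; on error both bios are destroyed.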
*/ static int g_journal_copy_read_done(struct bio *bp) { struct g_journal_softc *sc; struct g_consumer *cp; struct bio *pbp; KASSERT(bp->bio_cflags == GJ_BIO_COPY, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); sc = bp->bio_from->geom->softc; pbp = bp->bio_parent; if (bp->bio_error != 0) { GJ_DEBUG(0, "Error while reading data from %s (error=%d).", bp->bio_to->name, bp->bio_error); /* * We will not be able to deliver the WRITE request either. */ gj_free(bp->bio_data, bp->bio_length); g_destroy_bio(pbp); g_destroy_bio(bp); sc->sc_copy_in_progress--; return (1); } pbp->bio_data = bp->bio_data; cp = sc->sc_dconsumer; g_io_request(pbp, cp); GJ_LOGREQ(4, bp, "READ DONE"); g_destroy_bio(bp); return (0); } /* * Data block has been written to the data provider. */ static void g_journal_copy_write_done(struct bio *bp) { struct g_journal_softc *sc; KASSERT(bp->bio_cflags == GJ_BIO_COPY, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); sc = bp->bio_from->geom->softc; sc->sc_copy_in_progress--; if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)", bp->bio_error); } GJQ_REMOVE(sc->sc_copy_queue, bp); gj_free(bp->bio_data, bp->bio_length); GJ_LOGREQ(4, bp, "DONE"); g_destroy_bio(bp); if (sc->sc_copy_in_progress == 0) { /* * This was the last write request for this journal. */ GJ_DEBUG(1, "Data has been copied."); sc->sc_journal_copying = 0; } } static void g_journal_flush_done(struct bio *bp); /* * Flush one record onto the active journal provider. */ static void g_journal_flush(struct g_journal_softc *sc) { struct g_journal_record_header hdr; struct g_journal_entry *ent; struct g_provider *pp; struct bio **bioq; struct bio *bp, *fbp, *pbp; off_t joffset; u_char *data, hash[16]; MD5_CTX ctx; u_int i; if (sc->sc_current_count == 0) return; pp = sc->sc_jprovider; GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc); joffset = sc->sc_journal_offset; GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.", sc->sc_current_count, pp->name, (intmax_t)joffset); /* * Store 'journal id', so we know to which journal this record belongs. */ hdr.jrh_journal_id = sc->sc_journal_id; /* Could be less than g_journal_record_entries if called due to a timeout. */ hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries); strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic)); bioq = &sc->sc_active.jj_queue; GJQ_LAST(sc->sc_flush_queue, pbp); fbp = g_alloc_bio(); fbp->bio_parent = NULL; fbp->bio_cflags = GJ_BIO_JOURNAL; fbp->bio_offset = -1; fbp->bio_joffset = joffset; fbp->bio_length = pp->sectorsize; fbp->bio_cmd = BIO_WRITE; fbp->bio_done = g_journal_std_done; GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp); pbp = fbp; fbp->bio_to = pp; GJ_LOGREQ(4, fbp, "FLUSH_OUT"); joffset += pp->sectorsize; sc->sc_flush_count++; if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Init(&ctx); for (i = 0; i < hdr.jrh_nentries; i++) { bp = sc->sc_current_queue; KASSERT(bp != NULL, ("NULL bp")); bp->bio_to = pp; GJ_LOGREQ(4, bp, "FLUSHED"); sc->sc_current_queue = bp->bio_next; bp->bio_next = NULL; sc->sc_current_count--; /* Add to the header.
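Each entry records the destination of the chunk on the data provider (je_offset), the location it is about to take in the journal (je_joffset) and its size (je_length).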
*/ ent = &hdr.jrh_entries[i]; ent->je_offset = bp->bio_offset; ent->je_joffset = joffset; ent->je_length = bp->bio_length; data = bp->bio_data; if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Update(&ctx, data, ent->je_length); g_reset_bio(bp); bp->bio_cflags = GJ_BIO_JOURNAL; bp->bio_offset = ent->je_offset; bp->bio_joffset = ent->je_joffset; bp->bio_length = ent->je_length; bp->bio_data = data; bp->bio_cmd = BIO_WRITE; bp->bio_done = g_journal_std_done; GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp); pbp = bp; bp->bio_to = pp; GJ_LOGREQ(4, bp, "FLUSH_OUT"); joffset += bp->bio_length; sc->sc_flush_count++; /* * Add request to the active sc_journal_queue queue. * This is our cache. After journal switch we don't have to * read the data from the inactive journal, because we keep * it in memory. */ g_journal_insert(bioq, ent->je_offset, ent->je_offset + ent->je_length, ent->je_joffset, data, M_NOWAIT); } /* * After all requests, store valid header. */ data = gj_malloc(pp->sectorsize, M_WAITOK); if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { MD5Final(hash, &ctx); bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum)); } g_journal_record_header_encode(&hdr, data); fbp->bio_data = data; sc->sc_journal_offset = joffset; g_journal_check_overflow(sc); } /* * Flush request finished. */ static void g_journal_flush_done(struct bio *bp) { struct g_journal_softc *sc; struct g_consumer *cp; KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL)); cp = bp->bio_from; sc = cp->geom->softc; sc->sc_flush_in_progress--; if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)", bp->bio_error); } gj_free(bp->bio_data, bp->bio_length); GJ_LOGREQ(4, bp, "DONE"); g_destroy_bio(bp); } static void g_journal_release_delayed(struct g_journal_softc *sc); static void g_journal_flush_send(struct g_journal_softc *sc) { struct g_consumer *cp; struct bio *bioq, *bp, *lbp; cp = sc->sc_jconsumer; bioq = lbp = NULL; while (sc->sc_flush_in_progress < g_journal_parallel_flushes) { /* Send one flush requests to the active journal. */ bp = GJQ_FIRST(sc->sc_flush_queue); if (bp != NULL) { GJQ_REMOVE(sc->sc_flush_queue, bp); sc->sc_flush_count--; bp->bio_offset = bp->bio_joffset; bp->bio_joffset = 0; sc->sc_flush_in_progress++; GJQ_INSERT_AFTER(bioq, bp, lbp); lbp = bp; } /* Try to release delayed requests. */ g_journal_release_delayed(sc); /* If there are no requests to flush, leave. */ if (GJQ_FIRST(sc->sc_flush_queue) == NULL) break; } if (g_journal_do_optimize) sc->sc_flush_in_progress += g_journal_optimize(bioq); while ((bp = GJQ_FIRST(bioq)) != NULL) { GJQ_REMOVE(bioq, bp); GJ_LOGREQ(3, bp, "Flush request send"); g_io_request(bp, cp); } } static void g_journal_add_current(struct g_journal_softc *sc, struct bio *bp) { int n; GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count); n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK); sc->sc_current_count += n; n = g_journal_optimize(sc->sc_current_queue); sc->sc_current_count += n; /* * For requests which are added to the current queue we deliver * response immediately. */ bp->bio_completed = bp->bio_length; g_io_deliver(bp, 0); if (sc->sc_current_count >= g_journal_record_entries) { /* * Let's flush one record onto active journal provider. */ g_journal_flush(sc); } } static void g_journal_release_delayed(struct g_journal_softc *sc) { struct bio *bp; for (;;) { /* The flush queue is full, exit. 
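More delayed requests will be released on a later pass, after some of the staged flush bios have been sent out.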
*/ if (sc->sc_flush_count >= g_journal_accept_immediately) return; bp = bioq_takefirst(&sc->sc_delayed_queue); if (bp == NULL) return; sc->sc_delayed_count--; g_journal_add_current(sc, bp); } } /* * Add I/O request to the current queue. If we have enough requests for one * journal record we flush them onto active journal provider. */ static void g_journal_add_request(struct g_journal_softc *sc, struct bio *bp) { /* * The flush queue is full, we need to delay the request. */ if (sc->sc_delayed_count > 0 || sc->sc_flush_count >= g_journal_accept_immediately) { GJ_LOGREQ(4, bp, "DELAYED"); bioq_insert_tail(&sc->sc_delayed_queue, bp); sc->sc_delayed_count++; return; } KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue), ("DELAYED queue not empty.")); g_journal_add_current(sc, bp); } static void g_journal_read_done(struct bio *bp); /* * Try to find requested data in cache. */ static struct bio * g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart, off_t oend) { off_t cstart, cend; struct bio *bp; GJQ_FOREACH(head, bp) { if (bp->bio_offset == -1) continue; cstart = MAX(ostart, bp->bio_offset); cend = MIN(oend, bp->bio_offset + bp->bio_length); if (cend <= ostart) continue; else if (cstart >= oend) { if (!sorted) continue; else { bp = NULL; break; } } if (bp->bio_data == NULL) break; GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend, bp); bcopy(bp->bio_data + cstart - bp->bio_offset, pbp->bio_data + cstart - pbp->bio_offset, cend - cstart); pbp->bio_completed += cend - cstart; if (pbp->bio_completed == pbp->bio_length) { /* * Cool, the whole request was in cache, deliver happy * message. */ g_io_deliver(pbp, 0); return (pbp); } break; } return (bp); } /* * This function is used for collecting data on read. * The complexity is because parts of the data can be stored in four different * places: * - in memory - the data not yet send to the active journal provider * - in the active journal * - in the inactive journal * - in the data provider */ static void g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart, off_t oend) { struct bio *bp, *nbp, *head; off_t cstart, cend; u_int i, sorted = 0; GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend); cstart = cend = -1; bp = NULL; head = NULL; for (i = 1; i <= 5; i++) { switch (i) { case 1: /* Not-yet-send data. */ head = sc->sc_current_queue; sorted = 1; break; case 2: /* Skip flush queue as they are also in active queue */ continue; case 3: /* Active journal. */ head = sc->sc_active.jj_queue; sorted = 1; break; case 4: /* Inactive journal. */ /* * XXX: Here could be a race with g_journal_lowmem(). */ head = sc->sc_inactive.jj_queue; sorted = 1; break; case 5: /* In-flight to the data provider. */ head = sc->sc_copy_queue; sorted = 0; break; default: panic("gjournal %s: i=%d", __func__, i); } bp = g_journal_read_find(head, sorted, pbp, ostart, oend); if (bp == pbp) { /* Got the whole request. 
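g_journal_read_find() satisfied the entire range from this queue and has already delivered the parent bio, so there is nothing left to do.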
*/ GJ_DEBUG(2, "Got the whole request from %u.", i); return; } else if (bp != NULL) { cstart = MAX(ostart, bp->bio_offset); cend = MIN(oend, bp->bio_offset + bp->bio_length); GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).", i, (intmax_t)cstart, (intmax_t)cend); break; } } if (bp != NULL) { if (bp->bio_data == NULL) { nbp = g_duplicate_bio(pbp); nbp->bio_cflags = GJ_BIO_READ; nbp->bio_data = pbp->bio_data + cstart - pbp->bio_offset; nbp->bio_offset = bp->bio_joffset + cstart - bp->bio_offset; nbp->bio_length = cend - cstart; nbp->bio_done = g_journal_read_done; g_io_request(nbp, sc->sc_jconsumer); } /* * If we don't have the whole request yet, call g_journal_read() * recursively. */ if (ostart < cstart) g_journal_read(sc, pbp, ostart, cstart); if (oend > cend) g_journal_read(sc, pbp, cend, oend); } else { /* * No data in memory, no data in journal. * Its time for asking data provider. */ GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend); nbp = g_duplicate_bio(pbp); nbp->bio_cflags = GJ_BIO_READ; nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset; nbp->bio_offset = ostart; nbp->bio_length = oend - ostart; nbp->bio_done = g_journal_read_done; g_io_request(nbp, sc->sc_dconsumer); /* We have the whole request, return here. */ return; } } /* * Function responsible for handling finished READ requests. * Actually, g_std_done() could be used here, the only difference is that we * log error. */ static void g_journal_read_done(struct bio *bp) { struct bio *pbp; KASSERT(bp->bio_cflags == GJ_BIO_READ, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ)); pbp = bp->bio_parent; pbp->bio_inbed++; pbp->bio_completed += bp->bio_length; if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; GJ_DEBUG(0, "Error while reading data from %s (error=%d).", bp->bio_to->name, bp->bio_error); } g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed && pbp->bio_completed == pbp->bio_length) { /* We're done. */ g_io_deliver(pbp, 0); } } /* * Deactive current journal and active next one. */ static void g_journal_switch(struct g_journal_softc *sc) { struct g_provider *pp; if (JEMPTY(sc)) { GJ_DEBUG(3, "No need for %s switch.", sc->sc_name); pp = LIST_FIRST(&sc->sc_geom->provider); if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) { sc->sc_flags |= GJF_DEVICE_CLEAN; GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); g_journal_metadata_update(sc); } } else { GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name); pp = sc->sc_jprovider; sc->sc_journal_previous_id = sc->sc_journal_id; sc->sc_journal_id = sc->sc_journal_next_id; sc->sc_journal_next_id = arc4random(); GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc); g_journal_write_header(sc); sc->sc_inactive.jj_offset = sc->sc_active.jj_offset; sc->sc_inactive.jj_queue = sc->sc_active.jj_queue; sc->sc_active.jj_offset = sc->sc_journal_offset - pp->sectorsize; sc->sc_active.jj_queue = NULL; /* * Switch is done, start copying data from the (now) inactive * journal to the data provider. 
*/ g_journal_copy_start(sc); } mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~GJF_DEVICE_SWITCH; mtx_unlock(&sc->sc_mtx); } static void g_journal_initialize(struct g_journal_softc *sc) { sc->sc_journal_id = arc4random(); sc->sc_journal_next_id = arc4random(); sc->sc_journal_previous_id = sc->sc_journal_id; sc->sc_journal_offset = sc->sc_jstart; sc->sc_inactive.jj_offset = sc->sc_jstart; g_journal_write_header(sc); sc->sc_active.jj_offset = sc->sc_jstart; } static void g_journal_mark_as_dirty(struct g_journal_softc *sc) { const struct g_journal_desc *desc; int i; GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name); for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++) desc->jd_dirty(sc->sc_dconsumer); } /* * Function reads a record header from the given journal. * It is very similar to g_read_data(9), but it doesn't allocate memory for bio * and data on every call. */ static int g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset, void *data) { int error; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = cp->provider->sectorsize; bp->bio_data = data; g_io_request(bp, cp); error = biowait(bp, "gjs_read"); return (error); } #if 0 /* * Function is called when we start the journal device and we detect that * one of the journals was not fully copied. * The purpose of this function is to read all record headers from the journal * and place them in the inactive queue, so we can start the journal * synchronization process and the journal provider itself. * Design decision was taken to not synchronize the whole journal here as it * can take too much time. Reading headers only and delaying synchronization * process until after journal provider is started should be the best choice. */ #endif static void g_journal_sync(struct g_journal_softc *sc) { struct g_journal_record_header rhdr; struct g_journal_entry *ent; struct g_journal_header jhdr; struct g_consumer *cp; struct bio *bp, *fbp, *tbp; off_t joffset, offset; u_char *buf, sum[16]; uint64_t id; MD5_CTX ctx; int error, found, i; found = 0; fbp = NULL; cp = sc->sc_jconsumer; bp = g_alloc_bio(); buf = gj_malloc(cp->provider->sectorsize, M_WAITOK); offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset; GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset); /* * Read and decode first journal header. */ error = g_journal_sync_read(cp, bp, offset, buf); if (error != 0) { GJ_DEBUG(0, "Error while reading journal header from %s.", cp->provider->name); goto end; } error = g_journal_header_decode(buf, &jhdr); if (error != 0) { GJ_DEBUG(0, "Cannot decode journal header from %s.", cp->provider->name); goto end; } id = sc->sc_journal_id; if (jhdr.jh_journal_id != sc->sc_journal_id) { GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).", (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id); goto end; } offset += cp->provider->sectorsize; id = sc->sc_journal_next_id = jhdr.jh_journal_next_id; for (;;) { /* * If the biggest record won't fit, look for a record header or * journal header from the beginning. */ GJ_VALIDATE_OFFSET(offset, sc); error = g_journal_sync_read(cp, bp, offset, buf); if (error != 0) { /* * Not good. Having an error while reading a header * means that we cannot read subsequent headers and in * consequence we cannot find the termination.
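The journal is then treated as unterminated; if no termination is found at all, it is reinitialized below and the file system is marked dirty.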
*/ GJ_DEBUG(0, "Error while reading record header from %s.", cp->provider->name); break; } error = g_journal_record_header_decode(buf, &rhdr); if (error != 0) { GJ_DEBUG(2, "Not a record header at %jd (error=%d).", (intmax_t)offset, error); /* * This is not a record header. * If we are lucky, this is next journal header. */ error = g_journal_header_decode(buf, &jhdr); if (error != 0) { GJ_DEBUG(1, "Not a journal header at %jd (error=%d).", (intmax_t)offset, error); /* * Nope, this is not journal header, which * bascially means that journal is not * terminated properly. */ error = ENOENT; break; } /* * Ok. This is header of _some_ journal. Now we need to * verify if this is header of the _next_ journal. */ if (jhdr.jh_journal_id != id) { GJ_DEBUG(1, "Journal ID mismatch at %jd " "(0x%08x != 0x%08x).", (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id); error = ENOENT; break; } /* Found termination. */ found++; GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).", (intmax_t)offset, (u_int)id); sc->sc_active.jj_offset = offset; sc->sc_journal_offset = offset + cp->provider->sectorsize; sc->sc_journal_id = id; id = sc->sc_journal_next_id = jhdr.jh_journal_next_id; while ((tbp = fbp) != NULL) { fbp = tbp->bio_next; GJ_LOGREQ(3, tbp, "Adding request."); g_journal_insert_bio(&sc->sc_inactive.jj_queue, tbp, M_WAITOK); } /* Skip journal's header. */ offset += cp->provider->sectorsize; continue; } /* Skip record's header. */ offset += cp->provider->sectorsize; /* * Add information about every record entry to the inactive * queue. */ if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Init(&ctx); for (i = 0; i < rhdr.jrh_nentries; i++) { ent = &rhdr.jrh_entries[i]; GJ_DEBUG(3, "Insert entry: %jd %jd.", (intmax_t)ent->je_offset, (intmax_t)ent->je_length); g_journal_insert(&fbp, ent->je_offset, ent->je_offset + ent->je_length, ent->je_joffset, NULL, M_WAITOK); if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { u_char *buf2; /* * TODO: Should use faster function (like * g_journal_sync_read()). */ buf2 = g_read_data(cp, offset, ent->je_length, NULL); if (buf2 == NULL) GJ_DEBUG(0, "Cannot read data at %jd.", (intmax_t)offset); else { MD5Update(&ctx, buf2, ent->je_length); g_free(buf2); } } /* Skip entry's data. */ offset += ent->je_length; } if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { MD5Final(sum, &ctx); if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) { GJ_DEBUG(0, "MD5 hash mismatch at %jd!", (intmax_t)offset); } } } end: gj_free(bp->bio_data, cp->provider->sectorsize); g_destroy_bio(bp); /* Remove bios from unterminated journal. */ while ((tbp = fbp) != NULL) { fbp = tbp->bio_next; g_destroy_bio(tbp); } if (found < 1 && joffset > 0) { GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.", sc->sc_name); while ((tbp = sc->sc_inactive.jj_queue) != NULL) { sc->sc_inactive.jj_queue = tbp->bio_next; g_destroy_bio(tbp); } g_journal_initialize(sc); g_journal_mark_as_dirty(sc); } else { GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name); g_journal_copy_start(sc); } } /* * Wait for requests. * If we have requests in the current queue, flush them after 3 seconds from the * last flush. In this way we don't wait forever (or for journal switch) with * storing not full records on journal. */ static void g_journal_wait(struct g_journal_softc *sc, time_t last_write) { int error, timeout; GJ_DEBUG(3, "%s: enter", __func__); if (sc->sc_current_count == 0) { if (g_journal_debug < 2) msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0); else { /* * If we have debug turned on, show number of elements * in various queues. 
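The report is repeated on every 3 second timeout until a request arrives and wakes us up.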
*/ for (;;) { error = msleep(sc, &sc->sc_mtx, PRIBIO, "gj:work", hz * 3); if (error == 0) { mtx_unlock(&sc->sc_mtx); break; } GJ_DEBUG(3, "Report: current count=%d", sc->sc_current_count); GJ_DEBUG(3, "Report: flush count=%d", sc->sc_flush_count); GJ_DEBUG(3, "Report: flush in progress=%d", sc->sc_flush_in_progress); GJ_DEBUG(3, "Report: copy in progress=%d", sc->sc_copy_in_progress); GJ_DEBUG(3, "Report: delayed=%d", sc->sc_delayed_count); } } GJ_DEBUG(3, "%s: exit 1", __func__); return; } /* * Flush even not full records every 3 seconds. */ timeout = (last_write + 3 - time_second) * hz; if (timeout <= 0) { mtx_unlock(&sc->sc_mtx); g_journal_flush(sc); g_journal_flush_send(sc); GJ_DEBUG(3, "%s: exit 2", __func__); return; } error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout); if (error == EWOULDBLOCK) g_journal_flush_send(sc); GJ_DEBUG(3, "%s: exit 3", __func__); } /* * Worker thread. */ static void g_journal_worker(void *arg) { struct g_journal_softc *sc; struct g_geom *gp; struct g_provider *pp; struct bio *bp; time_t last_write; int type; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sc = arg; type = 0; /* gcc */ if (sc->sc_flags & GJF_DEVICE_CLEAN) { GJ_DEBUG(0, "Journal %s clean.", sc->sc_name); g_journal_initialize(sc); } else { g_journal_sync(sc); } /* * Check if we can use BIO_FLUSH. */ sc->sc_bio_flush = 0; if (g_io_flush(sc->sc_jconsumer) == 0) { sc->sc_bio_flush |= GJ_FLUSH_JOURNAL; GJ_DEBUG(1, "BIO_FLUSH supported by %s.", sc->sc_jconsumer->provider->name); } else { GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", sc->sc_jconsumer->provider->name); } if (sc->sc_jconsumer != sc->sc_dconsumer) { if (g_io_flush(sc->sc_dconsumer) == 0) { sc->sc_bio_flush |= GJ_FLUSH_DATA; GJ_DEBUG(1, "BIO_FLUSH supported by %s.", sc->sc_dconsumer->provider->name); } else { GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", sc->sc_dconsumer->provider->name); } } gp = sc->sc_geom; g_topology_lock(); pp = g_new_providerf(gp, "%s.journal", sc->sc_name); pp->mediasize = sc->sc_mediasize; /* * There could be a problem when data provider and journal providers * have different sectorsize, but such scenario is prevented on journal * creation. */ pp->sectorsize = sc->sc_sectorsize; g_error_provider(pp, 0); g_topology_unlock(); last_write = time_second; if (sc->sc_rootmount != NULL) { GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } for (;;) { /* Get first request from the queue. */ mtx_lock(&sc->sc_mtx); bp = bioq_first(&sc->sc_back_queue); if (bp != NULL) type = (bp->bio_cflags & GJ_BIO_MASK); if (bp == NULL) { bp = bioq_first(&sc->sc_regular_queue); if (bp != NULL) type = GJ_BIO_REGULAR; } if (bp == NULL) { try_switch: if ((sc->sc_flags & GJF_DEVICE_SWITCH) || (sc->sc_flags & GJF_DEVICE_DESTROY)) { if (sc->sc_current_count > 0) { mtx_unlock(&sc->sc_mtx); g_journal_flush(sc); g_journal_flush_send(sc); continue; } if (sc->sc_flush_in_progress > 0) goto sleep; if (sc->sc_copy_in_progress > 0) goto sleep; } if (sc->sc_flags & GJF_DEVICE_SWITCH) { mtx_unlock(&sc->sc_mtx); g_journal_switch(sc); wakeup(&sc->sc_journal_copying); continue; } if (sc->sc_flags & GJF_DEVICE_DESTROY) { GJ_DEBUG(1, "Shutting down worker " "thread for %s.", gp->name); sc->sc_worker = NULL; wakeup(&sc->sc_worker); mtx_unlock(&sc->sc_mtx); kproc_exit(0); } sleep: g_journal_wait(sc, last_write); continue; } /* * If we're in switch process, we need to delay all new * write requests until its done. 
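The bio stays on sc_regular_queue (it was only peeked at with bioq_first()) and we jump back to try_switch; it is picked up again once the switch completes.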
*/ if ((sc->sc_flags & GJF_DEVICE_SWITCH) && type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) { GJ_LOGREQ(2, bp, "WRITE on SWITCH"); goto try_switch; } if (type == GJ_BIO_REGULAR) bioq_remove(&sc->sc_regular_queue, bp); else bioq_remove(&sc->sc_back_queue, bp); mtx_unlock(&sc->sc_mtx); switch (type) { case GJ_BIO_REGULAR: /* Regular request. */ switch (bp->bio_cmd) { case BIO_READ: g_journal_read(sc, bp, bp->bio_offset, bp->bio_offset + bp->bio_length); break; case BIO_WRITE: last_write = time_second; g_journal_add_request(sc, bp); g_journal_flush_send(sc); break; default: panic("Invalid bio_cmd (%d).", bp->bio_cmd); } break; case GJ_BIO_COPY: switch (bp->bio_cmd) { case BIO_READ: if (g_journal_copy_read_done(bp)) g_journal_copy_send(sc); break; case BIO_WRITE: g_journal_copy_write_done(bp); g_journal_copy_send(sc); break; default: panic("Invalid bio_cmd (%d).", bp->bio_cmd); } break; case GJ_BIO_JOURNAL: g_journal_flush_done(bp); g_journal_flush_send(sc); break; case GJ_BIO_READ: default: panic("Invalid bio (%d).", type); } } } static void g_journal_destroy_event(void *arg, int flags __unused) { struct g_journal_softc *sc; g_topology_assert(); sc = arg; g_journal_destroy(sc); } static void g_journal_timeout(void *arg) { struct g_journal_softc *sc; sc = arg; GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.", sc->sc_geom->name); g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL); } static struct g_geom * g_journal_create(struct g_class *mp, struct g_provider *pp, const struct g_journal_metadata *md) { struct g_journal_softc *sc; struct g_geom *gp; struct g_consumer *cp; int error; sc = NULL; /* gcc */ g_topology_assert(); /* * There are two possibilities: * 1. Data and both journals are on the same provider. * 2. Data and journals are all on separated providers. */ /* Look for journal device with the same ID. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_id == md->md_id) break; } if (gp == NULL) sc = NULL; else if (sc != NULL && (sc->sc_type & md->md_type) != 0) { GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id); return (NULL); } if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) { GJ_DEBUG(0, "Invalid type on %s.", pp->name); return (NULL); } if (md->md_type & GJ_TYPE_DATA) { GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id, pp->name); } if (md->md_type & GJ_TYPE_JOURNAL) { GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id, pp->name); } if (sc == NULL) { /* Action geom. */ sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO); sc->sc_id = md->md_id; sc->sc_type = 0; sc->sc_flags = 0; sc->sc_worker = NULL; gp = g_new_geomf(mp, "gjournal %u", sc->sc_id); gp->start = g_journal_start; gp->orphan = g_journal_orphan; gp->access = g_journal_access; gp->softc = sc; gp->flags |= G_GEOM_VOLATILE_BIO; sc->sc_geom = gp; mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF); bioq_init(&sc->sc_back_queue); bioq_init(&sc->sc_regular_queue); bioq_init(&sc->sc_delayed_queue); sc->sc_delayed_count = 0; sc->sc_current_queue = NULL; sc->sc_current_count = 0; sc->sc_flush_queue = NULL; sc->sc_flush_count = 0; sc->sc_flush_in_progress = 0; sc->sc_copy_queue = NULL; sc->sc_copy_in_progress = 0; sc->sc_inactive.jj_queue = NULL; sc->sc_active.jj_queue = NULL; sc->sc_rootmount = root_mount_hold("GJOURNAL"); GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); callout_init(&sc->sc_callout, 1); if (md->md_type != GJ_TYPE_COMPLETE) { /* * Journal and data are on separate providers. 
* At this point we have only one of them. * We set up a timeout in case the other part does not * appear, so we won't wait forever. */ callout_reset(&sc->sc_callout, 5 * hz, g_journal_timeout, sc); } } /* Remember the type of the data provider. */ if (md->md_type & GJ_TYPE_DATA) sc->sc_orig_type = md->md_type; sc->sc_type |= md->md_type; cp = NULL; if (md->md_type & GJ_TYPE_DATA) { if (md->md_flags & GJ_FLAG_CLEAN) sc->sc_flags |= GJF_DEVICE_CLEAN; if (md->md_flags & GJ_FLAG_CHECKSUM) sc->sc_flags |= GJF_DEVICE_CHECKSUM; cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", pp->name, error)); error = g_access(cp, 1, 1, 1); if (error != 0) { GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name, error); g_journal_destroy(sc); return (NULL); } sc->sc_dconsumer = cp; sc->sc_mediasize = pp->mediasize - pp->sectorsize; sc->sc_sectorsize = pp->sectorsize; sc->sc_jstart = md->md_jstart; sc->sc_jend = md->md_jend; if (md->md_provider[0] != '\0') sc->sc_flags |= GJF_DEVICE_HARDCODED; sc->sc_journal_offset = md->md_joffset; sc->sc_journal_id = md->md_jid; sc->sc_journal_previous_id = md->md_jid; } if (md->md_type & GJ_TYPE_JOURNAL) { if (cp == NULL) { cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", pp->name, error)); error = g_access(cp, 1, 1, 1); if (error != 0) { GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name, error); g_journal_destroy(sc); return (NULL); } } else { /* * Journal is on the same provider as data, which means * that the data provider ends where the journal starts. */ sc->sc_mediasize = md->md_jstart; } sc->sc_jconsumer = cp; } /* Start switcher kproc if needed. */ if (g_journal_switcher_proc == NULL) g_journal_start_switcher(mp); if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) { /* Journal is not complete yet. */ return (gp); } else { /* Journal complete, cancel timeout. */ callout_drain(&sc->sc_callout); } error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0, "g_journal %s", sc->sc_name); if (error != 0) { GJ_DEBUG(0, "Cannot create worker thread for %s.journal.", sc->sc_name); g_journal_destroy(sc); return (NULL); } return (gp); } static void g_journal_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; g_detach(cp); g_destroy_consumer(cp); } static int g_journal_destroy(struct g_journal_softc *sc) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL) { if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } g_error_provider(pp, ENXIO); g_journal_flush(sc); g_journal_flush_send(sc); g_journal_switch(sc); } sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN); g_topology_unlock(); if (sc->sc_rootmount != NULL) { GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } callout_drain(&sc->sc_callout); mtx_lock(&sc->sc_mtx); wakeup(sc); while (sc->sc_worker != NULL) msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0); mtx_unlock(&sc->sc_mtx); if (pp != NULL) { GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); g_journal_metadata_update(sc); g_topology_lock(); g_wither_provider(pp, ENXIO); } else { g_topology_lock(); } mtx_destroy(&sc->sc_mtx); if (sc->sc_current_count != 0) { GJ_DEBUG(0, "Warning!
Number of current requests %d.", sc->sc_current_count); } LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->acr + cp->acw + cp->ace > 0) g_access(cp, -1, -1, -1); /* * We keep all consumers open for writing, so if we detached and * destroyed the consumer here, the provider would be offered for * tasting again and the journal would be restarted. Posting an * event here prevents that from happening. */ g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL); } gp->softc = NULL; g_wither_geom(gp, ENXIO); free(sc, M_JOURNAL); return (0); } static void g_journal_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_journal_metadata md; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); GJ_DEBUG(2, "Tasting %s.", pp->name); if (pp->geom->class == mp) return (NULL); gp = g_new_geomf(mp, "journal:taste"); /* This orphan function should never be called. */ gp->orphan = g_journal_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_journal_metadata_read(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_journal_debug >= 2) journal_metadata_dump(&md); gp = g_journal_create(mp, pp, &md); return (gp); } static struct g_journal_softc * g_journal_find_device(struct g_class *mp, const char *name) { struct g_journal_softc *sc; struct g_geom *gp; struct g_provider *pp; if (strncmp(name, "/dev/", 5) == 0) name += 5; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_flags & GJF_DEVICE_DESTROY) continue; if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) continue; pp = LIST_FIRST(&gp->provider); if (strcmp(sc->sc_name, name) == 0) return (sc); if (pp != NULL && strcmp(pp->name, name) == 0) return (sc); } return (NULL); } static void g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_journal_softc *sc; const char *name; char param[16]; int *nargs; int error, i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument.", i); return; } sc = g_journal_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_journal_destroy(sc); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", LIST_FIRST(&sc->sc_geom->provider)->name, error); return; } } } static void g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused) { g_topology_assert(); g_topology_unlock(); g_journal_sync_requested++; wakeup(&g_journal_switcher_state); while (g_journal_sync_requested > 0) tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2); g_topology_lock(); } static void g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version ==
NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_JOURNAL_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_journal_ctl_destroy(req, mp); return; } else if (strcmp(verb, "sync") == 0) { g_journal_ctl_sync(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_journal_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { int first = 1; sbuf_printf(sb, "%s<Role>", indent); if (cp == sc->sc_dconsumer) { sbuf_printf(sb, "Data"); first = 0; } if (cp == sc->sc_jconsumer) { if (!first) sbuf_printf(sb, ","); sbuf_printf(sb, "Journal"); } sbuf_printf(sb, "</Role>\n"); if (cp == sc->sc_jconsumer) { sbuf_printf(sb, "<Jstart>%jd</Jstart>\n", (intmax_t)sc->sc_jstart); sbuf_printf(sb, "<Jend>%jd</Jend>\n", (intmax_t)sc->sc_jend); } } else { sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id); } } static eventhandler_tag g_journal_event_shutdown = NULL; static eventhandler_tag g_journal_event_lowmem = NULL; static void g_journal_shutdown(void *arg, int howto __unused) { struct g_class *mp; struct g_geom *gp, *gp2; if (panicstr != NULL) return; mp = arg; g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if (gp->softc == NULL) continue; GJ_DEBUG(0, "Shutting down geom %s.", gp->name); g_journal_destroy(gp->softc); } g_topology_unlock(); } /* * Free cached requests from the inactive queue in case of low memory. * We free GJ_FREE_AT_ONCE elements at once. */ #define GJ_FREE_AT_ONCE 4 static void g_journal_lowmem(void *arg, int howto __unused) { struct g_journal_softc *sc; struct g_class *mp; struct g_geom *gp; struct bio *bp; u_int nfree = GJ_FREE_AT_ONCE; g_journal_stats_low_mem++; mp = arg; g_topology_lock(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) continue; mtx_lock(&sc->sc_mtx); for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL; nfree--, bp = bp->bio_next) { /* * It is safe to free the bio_data here, because: * 1. If bio_data is NULL it will be read from the * inactive journal. * 2. If bp is sent down, it is first removed from the * inactive queue, so it is impossible to free the * data from under an in-flight bio. * On the other hand, freeing elements from the active * queue is not safe. */ if (bp->bio_data != NULL) { GJ_DEBUG(2, "Freeing data from %s.", sc->sc_name); gj_free(bp->bio_data, bp->bio_length); bp->bio_data = NULL; } } mtx_unlock(&sc->sc_mtx); if (nfree == 0) break; } g_topology_unlock(); } static void g_journal_switcher(void *arg); static void g_journal_init(struct g_class *mp) { /* Pick a conservative value if provided value sucks. */ if (g_journal_cache_divisor <= 0 || (vm_kmem_size / g_journal_cache_divisor == 0)) { g_journal_cache_divisor = 5; } if (g_journal_cache_limit > 0) { g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor; g_journal_cache_low = (g_journal_cache_limit / 100) * g_journal_cache_switch; } g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync, g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST); if (g_journal_event_shutdown == NULL) GJ_DEBUG(0, "Warning! Cannot register shutdown event."); g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST); if (g_journal_event_lowmem == NULL) GJ_DEBUG(0, "Warning!
Cannot register lowmem event."); } static void g_journal_fini(struct g_class *mp) { if (g_journal_event_shutdown != NULL) { EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_journal_event_shutdown); } if (g_journal_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem); g_journal_stop_switcher(); } DECLARE_GEOM_CLASS(g_journal_class, g_journal); static const struct g_journal_desc * g_journal_find_desc(const char *fstype) { const struct g_journal_desc *desc; int i; for (desc = g_journal_filesystems[i = 0]; desc != NULL; desc = g_journal_filesystems[++i]) { if (strcmp(desc->jd_fstype, fstype) == 0) break; } return (desc); } static void g_journal_switch_wait(struct g_journal_softc *sc) { struct bintime bt; mtx_assert(&sc->sc_mtx, MA_OWNED); if (g_journal_debug >= 2) { if (sc->sc_flush_in_progress > 0) { GJ_DEBUG(2, "%d requests flushing.", sc->sc_flush_in_progress); } if (sc->sc_copy_in_progress > 0) { GJ_DEBUG(2, "%d requests copying.", sc->sc_copy_in_progress); } if (sc->sc_flush_count > 0) { GJ_DEBUG(2, "%d requests to flush.", sc->sc_flush_count); } if (sc->sc_delayed_count > 0) { GJ_DEBUG(2, "%d requests delayed.", sc->sc_delayed_count); } } g_journal_stats_switches++; if (sc->sc_copy_in_progress > 0) g_journal_stats_wait_for_copy++; GJ_TIMER_START(1, &bt); sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH; sc->sc_flags |= GJF_DEVICE_SWITCH; wakeup(sc); while (sc->sc_flags & GJF_DEVICE_SWITCH) { msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO, "gj:switch", 0); } GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name); } static void g_journal_do_switch(struct g_class *classp) { struct g_journal_softc *sc; const struct g_journal_desc *desc; struct g_geom *gp; struct mount *mp; struct bintime bt; char *mountpoint; int error, save; g_topology_lock(); LIST_FOREACH(gp, &classp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_flags & GJF_DEVICE_DESTROY) continue; if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) continue; mtx_lock(&sc->sc_mtx); sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH; mtx_unlock(&sc->sc_mtx); } g_topology_unlock(); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_gjprovider == NULL) continue; if (mp->mnt_flag & MNT_RDONLY) continue; desc = g_journal_find_desc(mp->mnt_stat.f_fstypename); if (desc == NULL) continue; if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; /* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */ g_topology_lock(); sc = g_journal_find_device(classp, mp->mnt_gjprovider); g_topology_unlock(); if (sc == NULL) { GJ_DEBUG(0, "Cannot find journal geom for %s.", mp->mnt_gjprovider); goto next; } else if (JEMPTY(sc)) { mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH; mtx_unlock(&sc->sc_mtx); GJ_DEBUG(3, "No need for %s switch.", sc->sc_name); goto next; } mountpoint = mp->mnt_stat.f_mntonname; error = vn_start_write(NULL, &mp, V_WAIT); if (error != 0) { GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).", mountpoint, error); goto next; } save = curthread_pflags_set(TDP_SYNCIO); GJ_TIMER_START(1, &bt); vfs_msync(mp, MNT_NOWAIT); GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint); GJ_TIMER_START(1, &bt); error = VFS_SYNC(mp, MNT_NOWAIT); if (error == 0) GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint); else { GJ_DEBUG(0, "Cannot sync file system %s (error=%d).", mountpoint, error); } curthread_pflags_restore(save); vn_finished_write(mp); if (error != 0) goto next; /* * Send BIO_FLUSH before freezing the file system, so it can be * faster after the freeze. 
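* (As a summary of the code around this comment, the per-mount
* sequence is: vfs_msync() and VFS_SYNC() while writes are still
* allowed, then BIO_FLUSH via g_journal_flush_cache(), then
* vfs_write_suspend(), jd_clean() to mark the file system clean,
* g_journal_switch_wait() to rotate the journals, and finally
* vfs_write_resume().)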
*/ GJ_TIMER_START(1, &bt); g_journal_flush_cache(sc); GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name); GJ_TIMER_START(1, &bt); error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT); GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint); if (error != 0) { GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).", mountpoint, error); goto next; } error = desc->jd_clean(mp); if (error != 0) goto next; mtx_lock(&sc->sc_mtx); g_journal_switch_wait(sc); mtx_unlock(&sc->sc_mtx); vfs_write_resume(mp, 0); next: mtx_lock(&mountlist_mtx); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); sc = NULL; for (;;) { g_topology_lock(); LIST_FOREACH(gp, &g_journal_class.geom, geom) { sc = gp->softc; if (sc == NULL) continue; mtx_lock(&sc->sc_mtx); if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE && !(sc->sc_flags & GJF_DEVICE_DESTROY) && (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) { break; } mtx_unlock(&sc->sc_mtx); sc = NULL; } g_topology_unlock(); if (sc == NULL) break; mtx_assert(&sc->sc_mtx, MA_OWNED); g_journal_switch_wait(sc); mtx_unlock(&sc->sc_mtx); } } static void g_journal_start_switcher(struct g_class *mp) { int error; g_topology_assert(); MPASS(g_journal_switcher_proc == NULL); g_journal_switcher_state = GJ_SWITCHER_WORKING; error = kproc_create(g_journal_switcher, mp, &g_journal_switcher_proc, 0, 0, "g_journal switcher"); KASSERT(error == 0, ("Cannot create switcher thread.")); } static void g_journal_stop_switcher(void) { g_topology_assert(); MPASS(g_journal_switcher_proc != NULL); g_journal_switcher_state = GJ_SWITCHER_DIE; wakeup(&g_journal_switcher_state); while (g_journal_switcher_state != GJ_SWITCHER_DIED) tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5); GJ_DEBUG(1, "Switcher died."); g_journal_switcher_proc = NULL; } /* * TODO: Kill switcher thread on last geom destruction? */ static void g_journal_switcher(void *arg) { struct g_class *mp; struct bintime bt; int error; mp = arg; curthread->td_pflags |= TDP_NORUNNINGBUF; for (;;) { g_journal_switcher_wokenup = 0; error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait", g_journal_switch_time * hz); if (g_journal_switcher_state == GJ_SWITCHER_DIE) { g_journal_switcher_state = GJ_SWITCHER_DIED; GJ_DEBUG(1, "Switcher exiting."); wakeup(&g_journal_switcher_state); kproc_exit(0); } if (error == 0 && g_journal_sync_requested == 0) { GJ_DEBUG(1, "Out of cache, force switch (used=%jd " "limit=%jd).", (intmax_t)g_journal_cache_used, (intmax_t)g_journal_cache_limit); } GJ_TIMER_START(1, &bt); g_journal_do_switch(mp); GJ_TIMER_STOP(1, &bt, "Entire switch time"); if (g_journal_sync_requested > 0) { g_journal_sync_requested = 0; wakeup(&g_journal_sync_requested); } } } Index: head/sys/geom/journal/g_journal.h =================================================================== --- head/sys/geom/journal/g_journal.h (revision 326269) +++ head/sys/geom/journal/g_journal.h (revision 326270) @@ -1,392 +1,394 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_JOURNAL_H_ #define _G_JOURNAL_H_ #include #include #ifdef _KERNEL #include #endif #define G_JOURNAL_CLASS_NAME "JOURNAL" #define G_JOURNAL_MAGIC "GEOM::JOURNAL" /* * Version history: * 0 - Initial version number. */ #define G_JOURNAL_VERSION 0 #ifdef _KERNEL extern int g_journal_debug; #define GJ_DEBUG(lvl, ...) do { \ if (g_journal_debug >= (lvl)) { \ printf("GEOM_JOURNAL"); \ if (g_journal_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define GJ_LOGREQ(lvl, bp, ...) do { \ if (g_journal_debug >= (lvl)) { \ printf("GEOM_JOURNAL"); \ if (g_journal_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) #define JEMPTY(sc) ((sc)->sc_journal_offset - \ (sc)->sc_jprovider->sectorsize == \ (sc)->sc_active.jj_offset && \ (sc)->sc_current_count == 0) #define GJ_BIO_REGULAR 0x00 #define GJ_BIO_READ 0x01 #define GJ_BIO_JOURNAL 0x02 #define GJ_BIO_COPY 0x03 #define GJ_BIO_MASK 0x0f #if 0 #define GJF_BIO_DONT_FREE 0x10 #define GJF_BIO_MASK 0xf0 #endif #define GJF_DEVICE_HARDCODED 0x0001 #define GJF_DEVICE_DESTROY 0x0010 #define GJF_DEVICE_SWITCH 0x0020 #define GJF_DEVICE_BEFORE_SWITCH 0x0040 #define GJF_DEVICE_CLEAN 0x0080 #define GJF_DEVICE_CHECKSUM 0x0100 #define GJ_HARD_LIMIT 64 /* * We keep pointers to journaled data in the bio structure and, because we * need to store two off_t values (offset in the data provider and offset * in the journal), we have to borrow the bio_completed field for this. */ #define bio_joffset bio_completed /* * Use the bio_caller1 field as a pointer in the queue. */ #define bio_next bio_caller1 /* * There are two such structures maintained inside each journaled device. * One describes the active part of the journal, where recent requests are * stored. The second describes the last consistent part of the journal * with requests that are being copied to the destination provider. */ struct g_journal_journal { struct bio *jj_queue; /* Cached journal entries. */ off_t jj_offset; /* Journal's start offset.
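* (The sc_inactive and sc_active members of struct g_journal_softc
* below are instances of this structure; on a journal switch the
* active journal becomes the inactive one, whose entries are then
* copied to the data provider, and a fresh active journal is started.)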
*/ }; struct g_journal_softc { uint32_t sc_id; uint8_t sc_type; uint8_t sc_orig_type; struct g_geom *sc_geom; u_int sc_flags; struct mtx sc_mtx; off_t sc_mediasize; u_int sc_sectorsize; #define GJ_FLUSH_DATA 0x01 #define GJ_FLUSH_JOURNAL 0x02 u_int sc_bio_flush; uint32_t sc_journal_id; uint32_t sc_journal_next_id; int sc_journal_copying; off_t sc_journal_offset; off_t sc_journal_previous_id; struct bio_queue_head sc_back_queue; struct bio_queue_head sc_regular_queue; struct bio_queue_head sc_delayed_queue; int sc_delayed_count; struct bio *sc_current_queue; int sc_current_count; struct bio *sc_flush_queue; int sc_flush_count; int sc_flush_in_progress; struct bio *sc_copy_queue; int sc_copy_in_progress; struct g_consumer *sc_dconsumer; struct g_consumer *sc_jconsumer; struct g_journal_journal sc_inactive; struct g_journal_journal sc_active; off_t sc_jstart; /* Journal space start offset. */ off_t sc_jend; /* Journal space end offset. */ struct callout sc_callout; struct proc *sc_worker; struct root_hold_token *sc_rootmount; }; #define sc_dprovider sc_dconsumer->provider #define sc_jprovider sc_jconsumer->provider #define sc_name sc_dprovider->name #define GJQ_INSERT_HEAD(head, bp) do { \ (bp)->bio_next = (head); \ (head) = (bp); \ } while (0) #define GJQ_INSERT_AFTER(head, bp, pbp) do { \ if ((pbp) == NULL) \ GJQ_INSERT_HEAD(head, bp); \ else { \ (bp)->bio_next = (pbp)->bio_next; \ (pbp)->bio_next = (bp); \ } \ } while (0) #define GJQ_LAST(head, bp) do { \ struct bio *_bp; \ \ if ((head) == NULL) { \ (bp) = (head); \ break; \ } \ for (_bp = (head); _bp->bio_next != NULL; _bp = _bp->bio_next) \ continue; \ (bp) = (_bp); \ } while (0) #define GJQ_FIRST(head) (head) #define GJQ_REMOVE(head, bp) do { \ struct bio *_bp; \ \ if ((head) == (bp)) { \ (head) = (bp)->bio_next; \ (bp)->bio_next = NULL; \ break; \ } \ for (_bp = (head); _bp->bio_next != NULL; _bp = _bp->bio_next) {\ if (_bp->bio_next == (bp)) \ break; \ } \ KASSERT(_bp->bio_next != NULL, ("NULL bio_next")); \ KASSERT(_bp->bio_next == (bp), ("bio_next != bp")); \ _bp->bio_next = (bp)->bio_next; \ (bp)->bio_next = NULL; \ } while (0) #define GJQ_FOREACH(head, bp) \ for ((bp) = (head); (bp) != NULL; (bp) = (bp)->bio_next) #define GJ_HEADER_MAGIC "GJHDR" struct g_journal_header { char jh_magic[sizeof(GJ_HEADER_MAGIC)]; uint32_t jh_journal_id; uint32_t jh_journal_next_id; } __packed; struct g_journal_entry { uint64_t je_joffset; uint64_t je_offset; uint64_t je_length; } __packed; #define GJ_RECORD_HEADER_MAGIC "GJRHDR" #define GJ_RECORD_HEADER_NENTRIES (20) #define GJ_RECORD_MAX_SIZE(sc) \ ((sc)->sc_jprovider->sectorsize + GJ_RECORD_HEADER_NENTRIES * MAXPHYS) #define GJ_VALIDATE_OFFSET(offset, sc) do { \ if ((offset) + GJ_RECORD_MAX_SIZE(sc) >= (sc)->sc_jend) { \ (offset) = (sc)->sc_jstart; \ GJ_DEBUG(2, "Starting from the beginning (%s).", \ (sc)->sc_name); \ } \ } while (0) struct g_journal_record_header { char jrh_magic[sizeof(GJ_RECORD_HEADER_MAGIC)]; uint32_t jrh_journal_id; uint16_t jrh_nentries; u_char jrh_sum[8]; struct g_journal_entry jrh_entries[GJ_RECORD_HEADER_NENTRIES]; } __packed; typedef int (g_journal_clean_t)(struct mount *mp); typedef void (g_journal_dirty_t)(struct g_consumer *cp); struct g_journal_desc { const char *jd_fstype; g_journal_clean_t *jd_clean; g_journal_dirty_t *jd_dirty; }; /* Supported file systems. */ extern const struct g_journal_desc g_journal_ufs; #define GJ_TIMER_START(lvl, bt) do { \ if (g_journal_debug >= (lvl)) \ binuptime(bt); \ } while (0) #define GJ_TIMER_STOP(lvl, bt, ...) 
do { \ if (g_journal_debug >= (lvl)) { \ struct bintime _bt2; \ struct timeval _tv; \ \ binuptime(&_bt2); \ bintime_sub(&_bt2, bt); \ bintime2timeval(&_bt2, &_tv); \ printf("GEOM_JOURNAL"); \ if (g_journal_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf(": %jd.%06jds\n", (intmax_t)_tv.tv_sec, \ (intmax_t)_tv.tv_usec); \ } \ } while (0) #endif /* _KERNEL */ #define GJ_TYPE_DATA 0x01 #define GJ_TYPE_JOURNAL 0x02 #define GJ_TYPE_COMPLETE (GJ_TYPE_DATA|GJ_TYPE_JOURNAL) #define GJ_FLAG_CLEAN 0x01 #define GJ_FLAG_CHECKSUM 0x02 struct g_journal_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ uint32_t md_id; /* Journal unique ID. */ uint8_t md_type; /* Provider type. */ uint64_t md_jstart; /* Journal space start offset. */ uint64_t md_jend; /* Journal space end offset. */ uint64_t md_joffset; /* Last known consistent journal offset. */ uint32_t md_jid; /* Last known consistent journal ID. */ uint64_t md_flags; /* Journal flags. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ u_char md_hash[16]; /* MD5 hash. */ }; static __inline void journal_metadata_encode(struct g_journal_metadata *md, u_char *data) { MD5_CTX ctx; bcopy(md->md_magic, data, 16); le32enc(data + 16, md->md_version); le32enc(data + 20, md->md_id); *(data + 24) = md->md_type; le64enc(data + 25, md->md_jstart); le64enc(data + 33, md->md_jend); le64enc(data + 41, md->md_joffset); le32enc(data + 49, md->md_jid); le64enc(data + 53, md->md_flags); bcopy(md->md_provider, data + 61, 16); le64enc(data + 77, md->md_provsize); MD5Init(&ctx); MD5Update(&ctx, data, 85); MD5Final(md->md_hash, &ctx); bcopy(md->md_hash, data + 85, 16); } static __inline int journal_metadata_decode_v0(const u_char *data, struct g_journal_metadata *md) { MD5_CTX ctx; md->md_id = le32dec(data + 20); md->md_type = *(data + 24); md->md_jstart = le64dec(data + 25); md->md_jend = le64dec(data + 33); md->md_joffset = le64dec(data + 41); md->md_jid = le32dec(data + 49); md->md_flags = le64dec(data + 53); bcopy(data + 61, md->md_provider, 16); md->md_provsize = le64dec(data + 77); MD5Init(&ctx); MD5Update(&ctx, data, 85); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 85, 16) != 0) return (EINVAL); return (0); } static __inline int journal_metadata_decode(const u_char *data, struct g_journal_metadata *md) { int error; bcopy(data, md->md_magic, 16); md->md_version = le32dec(data + 16); switch (md->md_version) { case 0: error = journal_metadata_decode_v0(data, md); break; default: error = EINVAL; break; } return (error); } static __inline void journal_metadata_dump(const struct g_journal_metadata *md) { static const char hex[] = "0123456789abcdef"; char hash[16 * 2 + 1]; u_int i; printf(" magic: %s\n", md->md_magic); printf(" version: %u\n", (u_int)md->md_version); printf(" id: %u\n", (u_int)md->md_id); printf(" type: %u\n", (u_int)md->md_type); printf(" start: %ju\n", (uintmax_t)md->md_jstart); printf(" end: %ju\n", (uintmax_t)md->md_jend); printf(" joffset: %ju\n", (uintmax_t)md->md_joffset); printf(" jid: %u\n", (u_int)md->md_jid); printf(" flags: %u\n", (u_int)md->md_flags); printf("hcprovider: %s\n", md->md_provider); printf(" provsize: %ju\n", (uintmax_t)md->md_provsize); bzero(hash, sizeof(hash)); for (i = 0; i < 16; i++) { hash[i * 2] = hex[md->md_hash[i] >> 4]; hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; } printf(" MD5 hash: %s\n", hash); } #endif /* !_G_JOURNAL_H_ */ Index: head/sys/geom/journal/g_journal_ufs.c 
=================================================================== --- head/sys/geom/journal/g_journal_ufs.c (revision 326269) +++ head/sys/geom/journal/g_journal_ufs.c (revision 326270) @@ -1,107 +1,109 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include static const int superblocks[] = SBLOCKSEARCH; static int g_journal_ufs_clean(struct mount *mp) { struct ufsmount *ump; struct fs *fs; int flags; ump = VFSTOUFS(mp); fs = ump->um_fs; flags = fs->fs_flags; fs->fs_flags &= ~(FS_UNCLEAN | FS_NEEDSFSCK); ffs_sbupdate(ump, MNT_WAIT, 1); fs->fs_flags = flags; return (0); } static void g_journal_ufs_dirty(struct g_consumer *cp) { struct fs *fs; int error, i, sb; if (SBLOCKSIZE % cp->provider->sectorsize != 0) return; for (i = 0; (sb = superblocks[i]) != -1; i++) { if (sb % cp->provider->sectorsize != 0) continue; fs = g_read_data(cp, sb, SBLOCKSIZE, NULL); if (fs == NULL) continue; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) { g_free(fs); continue; } GJ_DEBUG(0, "clean=%d flags=0x%x", fs->fs_clean, fs->fs_flags); fs->fs_clean = 0; fs->fs_flags |= FS_NEEDSFSCK | FS_UNCLEAN; error = g_write_data(cp, sb, fs, SBLOCKSIZE); g_free(fs); if (error != 0) { GJ_DEBUG(0, "Cannot mark file system %s as dirty " "(error=%d).", cp->provider->name, error); } else { GJ_DEBUG(0, "File system %s marked as dirty.", cp->provider->name); } } } const struct g_journal_desc g_journal_ufs = { .jd_fstype = "ufs", .jd_clean = g_journal_ufs_clean, .jd_dirty = g_journal_ufs_dirty }; MODULE_DEPEND(g_journal, ufs, 1, 1, 1); Index: head/sys/geom/label/g_label.c =================================================================== --- head/sys/geom/label/g_label.c (revision 326269) +++ head/sys/geom/label/g_label.c (revision 326270) @@ -1,552 +1,554 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_geom.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_label, "GEOM labeling support"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, label, CTLFLAG_RW, 0, "GEOM_LABEL stuff"); u_int g_label_debug = 0; SYSCTL_UINT(_kern_geom_label, OID_AUTO, debug, CTLFLAG_RWTUN, &g_label_debug, 0, "Debug level"); static int g_label_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static int g_label_destroy(struct g_geom *gp, boolean_t force); static struct g_geom *g_label_taste(struct g_class *mp, struct g_provider *pp, int flags __unused); static void g_label_config(struct gctl_req *req, struct g_class *mp, const char *verb); struct g_class g_label_class = { .name = G_LABEL_CLASS_NAME, .version = G_VERSION, .ctlreq = g_label_config, .taste = g_label_taste, .destroy_geom = g_label_destroy_geom }; /* * To add a new file system where you want to look for volume labels, * you have to: * 1. Add a file g_label_<file system>.c which implements label recognition. * 2. Add an 'extern const struct g_label_desc g_label_<file system>;' into * the g_label.h file. * 3. Add an element to the table below '&g_label_<file system>,'. * 4. Add your file to sys/conf/files. * 5. Add your file to sys/modules/geom/geom_label/Makefile. * 6. Add your file system to the manual page sbin/geom/class/label/glabel.8. */ const struct g_label_desc *g_labels[] = { &g_label_gpt, &g_label_gpt_uuid, #ifdef GEOM_LABEL &g_label_ufs_id, &g_label_ufs_volume, &g_label_iso9660, &g_label_msdosfs, &g_label_ext2fs, &g_label_reiserfs, &g_label_ntfs, &g_label_disk_ident, #endif NULL }; void g_label_rtrim(char *label, size_t size) { ptrdiff_t i; for (i = size - 1; i >= 0; i--) { if (label[i] == '\0') continue; else if (label[i] == ' ') label[i] = '\0'; else break; } } static int g_label_destroy_geom(struct gctl_req *req __unused, struct g_class *mp, struct g_geom *gp __unused) { /* * XXX: Unloading a class which is using geom_slice:1.56 is currently * XXX: broken, so we deny unloading when we have geoms.
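* (In practice this means that "kldunload geom_label" fails while any
* label geom exists; destroying the labels first, e.g. with "glabel
* destroy", makes the unload possible.)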
*/ return (EOPNOTSUPP); } static void g_label_orphan(struct g_consumer *cp) { G_LABEL_DEBUG(1, "Label %s removed.", LIST_FIRST(&cp->geom->provider)->name); g_slice_orphan(cp); } static void g_label_spoiled(struct g_consumer *cp) { G_LABEL_DEBUG(1, "Label %s removed.", LIST_FIRST(&cp->geom->provider)->name); g_slice_spoiled(cp); } static void g_label_resize(struct g_consumer *cp) { G_LABEL_DEBUG(1, "Label %s resized.", LIST_FIRST(&cp->geom->provider)->name); g_slice_config(cp->geom, 0, G_SLICE_CONFIG_FORCE, (off_t)0, cp->provider->mediasize, cp->provider->sectorsize, "notused"); } static int g_label_is_name_ok(const char *label) { const char *s; /* Check if the label starts from ../ */ if (strncmp(label, "../", 3) == 0) return (0); /* Check if the label contains /../ */ if (strstr(label, "/../") != NULL) return (0); /* Check if the label ends at ../ */ if ((s = strstr(label, "/..")) != NULL && s[3] == '\0') return (0); return (1); } static void g_label_mangle_name(char *label, size_t size) { struct sbuf *sb; const u_char *c; sb = sbuf_new(NULL, NULL, size, SBUF_FIXEDLEN); for (c = label; *c != '\0'; c++) { if (!isprint(*c) || isspace(*c) || *c =='"' || *c == '%') sbuf_printf(sb, "%%%02X", *c); else sbuf_putc(sb, *c); } if (sbuf_finish(sb) != 0) label[0] = '\0'; else strlcpy(label, sbuf_data(sb), size); sbuf_delete(sb); } static struct g_geom * g_label_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, const char *label, const char *dir, off_t mediasize) { struct g_geom *gp; struct g_provider *pp2; struct g_consumer *cp; char name[64]; g_topology_assert(); if (!g_label_is_name_ok(label)) { G_LABEL_DEBUG(0, "%s contains suspicious label, skipping.", pp->name); G_LABEL_DEBUG(1, "%s suspicious label is: %s", pp->name, label); if (req != NULL) gctl_error(req, "Label name %s is invalid.", label); return (NULL); } gp = NULL; cp = NULL; snprintf(name, sizeof(name), "%s/%s", dir, label); LIST_FOREACH(gp, &mp->geom, geom) { pp2 = LIST_FIRST(&gp->provider); if (pp2 == NULL) continue; if ((pp2->flags & G_PF_ORPHAN) != 0) continue; if (strcmp(pp2->name, name) == 0) { G_LABEL_DEBUG(1, "Label %s(%s) already exists (%s).", label, name, pp->name); if (req != NULL) { gctl_error(req, "Provider %s already exists.", name); } return (NULL); } } gp = g_slice_new(mp, 1, pp, &cp, NULL, 0, NULL); if (gp == NULL) { G_LABEL_DEBUG(0, "Cannot create slice %s.", label); if (req != NULL) gctl_error(req, "Cannot create slice %s.", label); return (NULL); } gp->orphan = g_label_orphan; gp->spoiled = g_label_spoiled; gp->resize = g_label_resize; g_access(cp, -1, 0, 0); g_slice_config(gp, 0, G_SLICE_CONFIG_SET, (off_t)0, mediasize, pp->sectorsize, "%s", name); G_LABEL_DEBUG(1, "Label for provider %s is %s.", pp->name, name); return (gp); } static int g_label_destroy(struct g_geom *gp, boolean_t force) { struct g_provider *pp; g_topology_assert(); pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_LABEL_DEBUG(0, "Provider %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_LABEL_DEBUG(1, "Provider %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } else if (pp != NULL) G_LABEL_DEBUG(1, "Label %s removed.", pp->name); g_slice_spoiled(LIST_FIRST(&gp->consumer)); return (0); } static int g_label_read_metadata(struct g_consumer *cp, struct g_label_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); pp = cp->provider; g_topology_unlock(); buf = 
g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); if (buf == NULL) return (error); /* Decode metadata. */ label_metadata_decode(buf, md); g_free(buf); return (0); } static void g_label_orphan_taste(struct g_consumer *cp __unused) { KASSERT(1 == 0, ("%s called?", __func__)); } static void g_label_start_taste(struct bio *bp __unused) { KASSERT(1 == 0, ("%s called?", __func__)); } static int g_label_access_taste(struct g_provider *pp __unused, int dr __unused, int dw __unused, int de __unused) { KASSERT(1 == 0, ("%s called", __func__)); return (EOPNOTSUPP); } static struct g_geom * g_label_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_label_metadata md; struct g_consumer *cp; struct g_geom *gp; int i; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); G_LABEL_DEBUG(2, "Tasting %s.", pp->name); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); if (strcmp(pp->geom->class->name, mp->name) == 0) return (NULL); gp = g_new_geomf(mp, "label:taste"); gp->start = g_label_start_taste; gp->access = g_label_access_taste; gp->orphan = g_label_orphan_taste; cp = g_new_consumer(gp); g_attach(cp, pp); if (g_access(cp, 1, 0, 0) != 0) goto end; do { if (g_label_read_metadata(cp, &md) != 0) break; if (strcmp(md.md_magic, G_LABEL_MAGIC) != 0) break; if (md.md_version > G_LABEL_VERSION) { printf("geom_label.ko module is too old to handle %s.\n", pp->name); break; } /* * Backward compatibility: */ /* * There was no md_provsize field in earlier versions of * metadata. */ if (md.md_version < 2) md.md_provsize = pp->mediasize; if (md.md_provsize != pp->mediasize) break; g_label_create(NULL, mp, pp, md.md_label, G_LABEL_DIR, pp->mediasize - pp->sectorsize); } while (0); for (i = 0; g_labels[i] != NULL; i++) { char label[128]; if (g_labels[i]->ld_enabled == 0) continue; g_topology_unlock(); g_labels[i]->ld_taste(cp, label, sizeof(label)); g_label_mangle_name(label, sizeof(label)); g_topology_lock(); if (label[0] == '\0') continue; g_label_create(NULL, mp, pp, label, g_labels[i]->ld_dir, pp->mediasize); } g_access(cp, -1, 0, 0); end: g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } static void g_label_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; const char *name; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } /* * arg1 is the name of provider. */ name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", 1); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_LABEL_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } /* * arg0 is the label. 
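* (For example, a "glabel create backup da0" request arrives here with
* arg0 = "backup" and arg1 = "da0"; the label and device names are
* hypothetical.)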
*/ name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", 0); return; } g_label_create(req, mp, pp, name, G_LABEL_DIR, pp->mediasize); } static const char * g_label_skip_dir(const char *name) { char path[64]; u_int i; if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); if (strncmp(name, G_LABEL_DIR "/", strlen(G_LABEL_DIR "/")) == 0) name += strlen(G_LABEL_DIR "/"); for (i = 0; g_labels[i] != NULL; i++) { snprintf(path, sizeof(path), "%s/", g_labels[i]->ld_dir); if (strncmp(name, path, strlen(path)) == 0) { name += strlen(path); break; } } return (name); } static struct g_geom * g_label_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; struct g_provider *pp; const char *pname; name = g_label_skip_dir(name); LIST_FOREACH(gp, &mp->geom, geom) { pp = LIST_FIRST(&gp->provider); pname = g_label_skip_dir(pp->name); if (strcmp(pname, name) == 0) return (gp); } return (NULL); } static void g_label_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_geom *gp; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } gp = g_label_find_geom(mp, name); if (gp == NULL) { G_LABEL_DEBUG(1, "Label %s is invalid.", name); gctl_error(req, "Label %s is invalid.", name); return; } error = g_label_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy label %s (error=%d).", LIST_FIRST(&gp->provider)->name, error); return; } } } static void g_label_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_LABEL_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_label_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_label_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } DECLARE_GEOM_CLASS(g_label_class, g_label); Index: head/sys/geom/label/g_label.h =================================================================== --- head/sys/geom/label/g_label.h (revision 326269) +++ head/sys/geom/label/g_label.h (revision 326270) @@ -1,117 +1,119 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_LABEL_H_ #define _G_LABEL_H_ #include #ifdef _KERNEL #include #endif #define G_LABEL_CLASS_NAME "LABEL" #define G_LABEL_MAGIC "GEOM::LABEL" /* * Version history: * 1 - Initial version number. * 2 - Added md_provsize field to metadata. */ #define G_LABEL_VERSION 2 #define G_LABEL_DIR "label" #ifdef _KERNEL extern u_int g_label_debug; #define G_LABEL_DEBUG(lvl, ...) do { \ if (g_label_debug >= (lvl)) { \ printf("GEOM_LABEL"); \ if (g_label_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) SYSCTL_DECL(_kern_geom_label); #define G_LABEL_INIT(kind, label, descr) \ SYSCTL_NODE(_kern_geom_label, OID_AUTO, kind, CTLFLAG_RD, \ NULL, ""); \ SYSCTL_INT(_kern_geom_label_##kind, OID_AUTO, enable, \ CTLFLAG_RWTUN, &label.ld_enabled, 1, descr) typedef void g_label_taste_t (struct g_consumer *cp, char *label, size_t size); struct g_label_desc { g_label_taste_t *ld_taste; char *ld_dir; int ld_enabled; }; /* Supported labels. */ extern struct g_label_desc g_label_ufs_id; extern struct g_label_desc g_label_ufs_volume; extern struct g_label_desc g_label_iso9660; extern struct g_label_desc g_label_msdosfs; extern struct g_label_desc g_label_ext2fs; extern struct g_label_desc g_label_reiserfs; extern struct g_label_desc g_label_ntfs; extern struct g_label_desc g_label_gpt; extern struct g_label_desc g_label_gpt_uuid; extern struct g_label_desc g_label_disk_ident; extern void g_label_rtrim(char *label, size_t size); #endif /* _KERNEL */ struct g_label_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_label[16]; /* Label. */ uint64_t md_provsize; /* Provider's size. */ }; static __inline void label_metadata_encode(const struct g_label_metadata *md, u_char *data) { bcopy(md->md_magic, data, sizeof(md->md_magic)); le32enc(data + 16, md->md_version); bcopy(md->md_label, data + 20, sizeof(md->md_label)); le64enc(data + 36, md->md_provsize); } static __inline void label_metadata_decode(const u_char *data, struct g_label_metadata *md) { bcopy(data, md->md_magic, sizeof(md->md_magic)); md->md_version = le32dec(data + 16); bcopy(data + 20, md->md_label, sizeof(md->md_label)); md->md_provsize = le64dec(data + 36); } #endif /* _G_LABEL_H_ */ Index: head/sys/geom/label/g_label_disk_ident.c =================================================================== --- head/sys/geom/label/g_label_disk_ident.c (revision 326269) +++ head/sys/geom/label/g_label_disk_ident.c (revision 326270) @@ -1,88 +1,90 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Ivan Voras * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #define G_LABEL_DISK_IDENT_DIR "diskid" static char* classes_pass[] = { G_DISK_CLASS_NAME, G_MULTIPATH_CLASS_NAME, NULL }; static void g_label_disk_ident_taste(struct g_consumer *cp, char *label, size_t size) { struct g_class *cls; char ident[100]; int ident_len, found, i; g_topology_assert_not(); label[0] = '\0'; cls = cp->provider->geom->class; /* * Get the GEOM::ident string, and construct a label in the format * "CLASS_NAME-ident" */ ident_len = sizeof(ident); if (g_io_getattr("GEOM::ident", cp, &ident_len, ident) == 0) { if (ident_len == 0 || ident[0] == '\0') return; for (i = 0, found = 0; classes_pass[i] != NULL; i++) if (strcmp(classes_pass[i], cls->name) == 0) { found = 1; break; } if (!found) return; /* * We can safely ignore the result of snprintf(): the label * will simply be truncated, which at most is only annoying. */ (void)snprintf(label, size, "%s-%s", cls->name, ident); } } struct g_label_desc g_label_disk_ident = { .ld_taste = g_label_disk_ident_taste, .ld_dir = G_LABEL_DISK_IDENT_DIR, .ld_enabled = 1 }; G_LABEL_INIT(disk_ident, g_label_disk_ident, "Create device nodes for drives " "which export a disk identification string"); Index: head/sys/geom/label/g_label_ext2fs.c =================================================================== --- head/sys/geom/label/g_label_ext2fs.c (revision 326269) +++ head/sys/geom/label/g_label_ext2fs.c (revision 326270) @@ -1,101 +1,103 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Stanislav Sedov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #define EXT2FS_SB_OFFSET 1024 #define EXT2_SUPER_MAGIC 0xef53 #define EXT2_DYNAMIC_REV 1 typedef struct e2sb { uint8_t fake1[56]; uint16_t s_magic; uint8_t fake2[18]; uint32_t s_rev_level; uint8_t fake3[40]; char s_volume_name[16]; } e2sb_t; static void g_label_ext2fs_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; e2sb_t *fs; char *s_volume_name; g_topology_assert_not(); pp = cp->provider; label[0] = '\0'; if ((EXT2FS_SB_OFFSET % pp->sectorsize) != 0) return; fs = (e2sb_t *)g_read_data(cp, EXT2FS_SB_OFFSET, pp->sectorsize, NULL); if (fs == NULL) return; /* Check for magic and version */ if (fs->s_magic == EXT2_SUPER_MAGIC && fs->s_rev_level == EXT2_DYNAMIC_REV) { G_LABEL_DEBUG(1, "ext2fs file system detected on %s.", pp->name); } else { goto exit_free; } s_volume_name = fs->s_volume_name; /* Terminate label */ s_volume_name[sizeof(fs->s_volume_name) - 1] = '\0'; if (s_volume_name[0] == '/') s_volume_name += 1; /* Check for volume label */ if (s_volume_name[0] == '\0') goto exit_free; strlcpy(label, s_volume_name, size); exit_free: g_free(fs); } struct g_label_desc g_label_ext2fs = { .ld_taste = g_label_ext2fs_taste, .ld_dir = "ext2fs", .ld_enabled = 1 }; G_LABEL_INIT(ext2fs, g_label_ext2fs, "Create device nodes for EXT2FS volumes"); Index: head/sys/geom/label/g_label_gpt.c =================================================================== --- head/sys/geom/label/g_label_gpt.c (revision 326269) +++ head/sys/geom/label/g_label_gpt.c (revision 326270) @@ -1,170 +1,172 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Marius Nuennerich * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #define PART_CLASS_NAME "PART" #define SCHEME_NAME "GPT" #define G_LABEL_GPT_VOLUME_DIR "gpt" #define G_LABEL_GPT_ID_DIR "gptid" /* XXX: Also defined in geom/part/g_part_gpt.c */ struct g_part_gpt_entry { struct g_part_entry base; struct gpt_ent ent; }; /* XXX: Shamelessly stolen from g_part_gpt.c */ static void sbuf_nprintf_utf16(struct sbuf *sb, uint16_t *str, size_t len) { u_int bo; uint32_t ch; uint16_t c; bo = LITTLE_ENDIAN; /* GPT is little-endian */ while (len > 0 && *str != 0) { ch = (bo == BIG_ENDIAN) ? be16toh(*str) : le16toh(*str); str++, len--; if ((ch & 0xf800) == 0xd800) { if (len > 0) { c = (bo == BIG_ENDIAN) ? be16toh(*str) : le16toh(*str); str++, len--; } else c = 0xfffd; if ((ch & 0x400) == 0 && (c & 0xfc00) == 0xdc00) { ch = ((ch & 0x3ff) << 10) + (c & 0x3ff); ch += 0x10000; } else ch = 0xfffd; } else if (ch == 0xfffe) { /* BOM (U+FEFF) swapped. */ bo = (bo == BIG_ENDIAN) ? LITTLE_ENDIAN : BIG_ENDIAN; continue; } else if (ch == 0xfeff) /* BOM (U+FEFF) unswapped. */ continue; /* Write the Unicode character in UTF-8 */ if (ch < 0x80) sbuf_printf(sb, "%c", ch); else if (ch < 0x800) sbuf_printf(sb, "%c%c", 0xc0 | (ch >> 6), 0x80 | (ch & 0x3f)); else if (ch < 0x10000) sbuf_printf(sb, "%c%c%c", 0xe0 | (ch >> 12), 0x80 | ((ch >> 6) & 0x3f), 0x80 | (ch & 0x3f)); else if (ch < 0x200000) sbuf_printf(sb, "%c%c%c%c", 0xf0 | (ch >> 18), 0x80 | ((ch >> 12) & 0x3f), 0x80 | ((ch >> 6) & 0x3f), 0x80 | (ch & 0x3f)); } } static void g_label_gpt_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; struct g_part_table *tp; struct g_part_gpt_entry *part_gpt_entry; struct sbuf *lbl; g_topology_assert_not(); pp = cp->provider; tp = (struct g_part_table *)pp->geom->softc; label[0] = '\0'; /* We taste only partitions handled by GPART */ if (strncmp(pp->geom->class->name, PART_CLASS_NAME, sizeof(PART_CLASS_NAME))) return; /* and only GPT */ if (strncmp(tp->gpt_scheme->name, SCHEME_NAME, sizeof(SCHEME_NAME))) return; part_gpt_entry = (struct g_part_gpt_entry *)pp->private; /* * Create sbuf with biggest possible size. * We need max. 4 bytes for every 2-byte utf16 char. 
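* (A BMP code point occupies one 2-byte UTF-16 unit and expands to at
* most 3 UTF-8 bytes, while a supplementary code point occupies a
* surrogate pair, i.e. 4 bytes, and expands to exactly 4 UTF-8 bytes,
* so doubling the byte size of ent_name is a safe upper bound.)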
*/ lbl = sbuf_new(NULL, NULL, sizeof(part_gpt_entry->ent.ent_name) << 1, SBUF_FIXEDLEN); /* Size is the number of characters, not bytes */ sbuf_nprintf_utf16(lbl, part_gpt_entry->ent.ent_name, sizeof(part_gpt_entry->ent.ent_name) >> 1); sbuf_finish(lbl); strlcpy(label, sbuf_data(lbl), size); sbuf_delete(lbl); } static void g_label_gpt_uuid_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; struct g_part_table *tp; struct g_part_gpt_entry *part_gpt_entry; g_topology_assert_not(); pp = cp->provider; tp = (struct g_part_table *)pp->geom->softc; label[0] = '\0'; /* We taste only partitions handled by GPART */ if (strncmp(pp->geom->class->name, PART_CLASS_NAME, sizeof(PART_CLASS_NAME))) return; /* and only GPT */ if (strncmp(tp->gpt_scheme->name, SCHEME_NAME, sizeof(SCHEME_NAME))) return; part_gpt_entry = (struct g_part_gpt_entry *)pp->private; snprintf_uuid(label, size, &part_gpt_entry->ent.ent_uuid); } struct g_label_desc g_label_gpt = { .ld_taste = g_label_gpt_taste, .ld_dir = G_LABEL_GPT_VOLUME_DIR, .ld_enabled = 1 }; struct g_label_desc g_label_gpt_uuid = { .ld_taste = g_label_gpt_uuid_taste, .ld_dir = G_LABEL_GPT_ID_DIR, .ld_enabled = 1 }; G_LABEL_INIT(gpt, g_label_gpt, "Create device nodes for GPT labels"); G_LABEL_INIT(gptid, g_label_gpt_uuid, "Create device nodes for GPT UUIDs"); Index: head/sys/geom/label/g_label_iso9660.c =================================================================== --- head/sys/geom/label/g_label_iso9660.c (revision 326269) +++ head/sys/geom/label/g_label_iso9660.c (revision 326270) @@ -1,79 +1,81 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #define G_LABEL_ISO9660_DIR "iso9660" #define ISO9660_MAGIC "\x01" "CD001" "\x01\x00" #define ISO9660_OFFSET 0x8000 #define VOLUME_LEN 32 static void g_label_iso9660_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; char *sector, *volume; g_topology_assert_not(); pp = cp->provider; label[0] = '\0'; if ((ISO9660_OFFSET % pp->sectorsize) != 0) return; sector = (char *)g_read_data(cp, ISO9660_OFFSET, pp->sectorsize, NULL); if (sector == NULL) return; if (bcmp(sector, ISO9660_MAGIC, sizeof(ISO9660_MAGIC) - 1) != 0) { g_free(sector); return; } G_LABEL_DEBUG(1, "ISO9660 file system detected on %s.", pp->name); volume = sector + 0x28; bzero(label, size); strlcpy(label, volume, MIN(size, VOLUME_LEN)); g_free(sector); g_label_rtrim(label, size); } struct g_label_desc g_label_iso9660 = { .ld_taste = g_label_iso9660_taste, .ld_dir = G_LABEL_ISO9660_DIR, .ld_enabled = 1 }; G_LABEL_INIT(iso9660, g_label_iso9660, "Create device nodes for ISO9660 volume names"); Index: head/sys/geom/label/g_label_msdosfs.c =================================================================== --- head/sys/geom/label/g_label_msdosfs.c (revision 326269) +++ head/sys/geom/label/g_label_msdosfs.c (revision 326270) @@ -1,217 +1,219 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 Pawel Jakub Dawidek * Copyright (c) 2006 Tobias Reifenberger * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #define G_LABEL_MSDOSFS_DIR "msdosfs" #define LABEL_NO_NAME "NO NAME " static void g_label_msdosfs_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; FAT_BSBPB *pfat_bsbpb; FAT32_BSBPB *pfat32_bsbpb; FAT_DES *pfat_entry; uint8_t *sector0, *sector; g_topology_assert_not(); pp = cp->provider; sector0 = NULL; sector = NULL; bzero(label, size); /* Check if the sector size of the medium is a valid FAT sector size. */ switch(pp->sectorsize) { case 512: case 1024: case 2048: case 4096: break; default: G_LABEL_DEBUG(1, "MSDOSFS: %s: sector size %d not compatible.", pp->name, pp->sectorsize); return; } /* Load 1st sector with boot sector and boot parameter block. 
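The constants in the ISO9660 taster above decode as follows: 0x8000 is sector 16 of a 2048-byte CD sector, where the Primary Volume Descriptor lives; the 8-byte magic is descriptor type 0x01, the string "CD001", version 0x01, and a zero byte; and the volume identifier is the 32 space-padded bytes at offset 0x28 of that descriptor. A userspace sketch of the same check; "/dev/cd0" is a placeholder path and error handling is trimmed:

/* iso9660label.c -- read an ISO9660 volume id like the taster above. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char pvd[2048], label[33];
	int fd, i;

	fd = open("/dev/cd0", O_RDONLY);	/* hypothetical device */
	if (fd == -1 || pread(fd, pvd, sizeof(pvd), 0x8000) != sizeof(pvd))
		return (1);
	if (memcmp(pvd, "\001CD001\001", 8) != 0)	/* type, magic, version */
		return (1);
	memcpy(label, pvd + 0x28, 32);	/* space-padded volume id */
	label[32] = '\0';
	for (i = 31; i >= 0 && label[i] == ' '; i--)
		label[i] = '\0';	/* right-trim, as g_label_rtrim() does */
	printf("label: %s\n", label);
	close(fd);
	return (0);
}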
*/ sector0 = (uint8_t *)g_read_data(cp, 0, pp->sectorsize, NULL); if (sector0 == NULL) return; /* Check for the FAT boot sector signature. */ if (sector0[510] != 0x55 || sector0[511] != 0xaa) { G_LABEL_DEBUG(1, "MSDOSFS: %s: no FAT signature found.", pp->name); goto error; } /* * Test if this is really a FAT volume and determine the FAT type. */ pfat_bsbpb = (FAT_BSBPB *)sector0; pfat32_bsbpb = (FAT32_BSBPB *)sector0; if (UINT16BYTES(pfat_bsbpb->BPB_FATSz16) != 0) { /* * If the BPB_FATSz16 field is not zero and the string "FAT" is * at the right place, this should be a FAT12 or FAT16 volume. */ if (strncmp(pfat_bsbpb->BS_FilSysType, "FAT", 3) != 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT12/16 volume not valid.", pp->name); goto error; } G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT12/FAT16 volume detected.", pp->name); /* A volume with no name should have "NO NAME " as label. */ if (strncmp(pfat_bsbpb->BS_VolLab, LABEL_NO_NAME, sizeof(pfat_bsbpb->BS_VolLab)) == 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT12/16 volume has no name.", pp->name); goto error; } strlcpy(label, pfat_bsbpb->BS_VolLab, MIN(size, sizeof(pfat_bsbpb->BS_VolLab) + 1)); } else if (UINT32BYTES(pfat32_bsbpb->BPB_FATSz32) != 0) { uint32_t fat_FirstDataSector, fat_BytesPerSector, offset; /* * If the BPB_FATSz32 field is not zero and the string "FAT" is * at the right place, this should be a FAT32 volume. */ if (strncmp(pfat32_bsbpb->BS_FilSysType, "FAT", 3) != 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT32 volume not valid.", pp->name); goto error; } G_LABEL_DEBUG(1, "MSDOSFS: %s: FAT32 volume detected.", pp->name); /* * If the volume label is not "NO NAME " we're done. */ if (strncmp(pfat32_bsbpb->BS_VolLab, LABEL_NO_NAME, sizeof(pfat32_bsbpb->BS_VolLab)) != 0) { strlcpy(label, pfat32_bsbpb->BS_VolLab, MIN(size, sizeof(pfat32_bsbpb->BS_VolLab) + 1)); goto endofchecks; } /* * If the volume label "NO NAME " is in the boot sector, the * label of FAT32 volumes may be stored as a special entry in * the root directory. */ fat_FirstDataSector = UINT16BYTES(pfat32_bsbpb->BPB_RsvdSecCnt) + (pfat32_bsbpb->BPB_NumFATs * UINT32BYTES(pfat32_bsbpb->BPB_FATSz32)); fat_BytesPerSector = UINT16BYTES(pfat32_bsbpb->BPB_BytsPerSec); G_LABEL_DEBUG(2, "MSDOSFS: FAT_FirstDataSector=0x%x, FAT_BytesPerSector=%d", fat_FirstDataSector, fat_BytesPerSector); for (offset = fat_BytesPerSector * fat_FirstDataSector;; offset += fat_BytesPerSector) { sector = (uint8_t *)g_read_data(cp, offset, fat_BytesPerSector, NULL); if (sector == NULL) goto error; pfat_entry = (FAT_DES *)sector; do { /* No more entries available. */ if (pfat_entry->DIR_Name[0] == 0) { G_LABEL_DEBUG(1, "MSDOSFS: %s: " "FAT32 volume has no name.", pp->name); goto error; } /* Skip empty or long name entries. */ if (pfat_entry->DIR_Name[0] == 0xe5 || (pfat_entry->DIR_Attr & FAT_DES_ATTR_LONG_NAME) == FAT_DES_ATTR_LONG_NAME) { continue; } /* * The name of the entry is the volume label if * ATTR_VOLUME_ID is set. 
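Summarizing the label sources above: BS_VolLab sits at boot-sector byte 43 on FAT12/16 and byte 71 on FAT32 (11 bytes, space-padded), and only when a FAT32 label reads "NO NAME    " does the taster fall back to the root-directory scan for an ATTR_VOLUME_ID entry shown here. A hedged sketch of just the boot-sector part of the lookup; the directory-scan fallback is omitted:

/* fatlabel.c -- boot-sector half of the msdosfs label lookup.
 * sector0 is the first sector of a hypothetical FAT volume;
 * label must hold at least 12 bytes.  Returns 1 on success. */
#include <stdint.h>
#include <string.h>

int
fat_bootsector_label(const uint8_t *sector0, char *label)
{
	uint16_t fatsz16;
	uint32_t fatsz32;
	const uint8_t *lab;

	if (sector0[510] != 0x55 || sector0[511] != 0xaa)
		return (0);		/* no boot-sector signature */
	fatsz16 = sector0[22] | (sector0[23] << 8);	/* BPB_FATSz16 */
	fatsz32 = sector0[36] | (sector0[37] << 8) |
	    ((uint32_t)sector0[38] << 16) | ((uint32_t)sector0[39] << 24);
	if (fatsz16 != 0)
		lab = sector0 + 43;	/* FAT12/16 BS_VolLab */
	else if (fatsz32 != 0)
		lab = sector0 + 71;	/* FAT32 BS_VolLab */
	else
		return (0);
	if (memcmp(lab, "NO NAME    ", 11) == 0)
		return (0);		/* label unset */
	memcpy(label, lab, 11);
	label[11] = '\0';
	return (1);
}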
*/ if (pfat_entry->DIR_Attr & FAT_DES_ATTR_VOLUME_ID) { strlcpy(label, pfat_entry->DIR_Name, MIN(size, sizeof(pfat_entry->DIR_Name) + 1)); goto endofchecks; } } while((uint8_t *)(++pfat_entry) < (uint8_t *)(sector + fat_BytesPerSector)); g_free(sector); } } else { G_LABEL_DEBUG(1, "MSDOSFS: %s: no FAT volume detected.", pp->name); goto error; } endofchecks: g_label_rtrim(label, size); error: if (sector0 != NULL) g_free(sector0); if (sector != NULL) g_free(sector); } struct g_label_desc g_label_msdosfs = { .ld_taste = g_label_msdosfs_taste, .ld_dir = G_LABEL_MSDOSFS_DIR, .ld_enabled = 1 }; G_LABEL_INIT(msdosfs, g_label_msdosfs, "Create device nodes for MSDOSFS volumes"); Index: head/sys/geom/label/g_label_msdosfs.h =================================================================== --- head/sys/geom/label/g_label_msdosfs.h (revision 326269) +++ head/sys/geom/label/g_label_msdosfs.h (revision 326270) @@ -1,140 +1,142 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006 Tobias Reifenberger * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include /* * Conversion macros for little endian encoded unsigned integers * in byte streams to the local unsigned integer format. 
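The conversion macros introduced below expand multiplications by 256, 65536 and 16777216, i.e. left shifts by 8, 16 and 24: plain little-endian decoders in the style of le16dec()/le32dec(). Shift-based equivalents as a sketch:

/* ledec.c -- shift-based equivalents of UINT16BYTES()/UINT32BYTES(). */
#include <stdint.h>

static inline uint32_t
uint16bytes(const uint8_t *p)
{
	return ((uint32_t)p[0] | ((uint32_t)p[1] << 8));
}

static inline uint32_t
uint32bytes(const uint8_t *p)
{
	return ((uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	    ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24));
}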
*/ #define UINT16BYTES(p) ((uint32_t)((p)[0] + (256*(p)[1]))) #define UINT32BYTES(p) ((uint32_t)((p)[0] + (256*(p)[1]) + \ (65536*(p)[2]) + (16777216*(p)[3]))) /* * All following structures are according to: * * Microsoft Extensible Firmware Initiative FAT32 File System Specification * FAT: General Overview of On-Disk Format * Version 1.03, December 6, 2000 * Microsoft Corporation */ /* * FAT boot sector and boot parameter block for * FAT12 and FAT16 volumes */ typedef struct fat_bsbpb { /* common fields */ uint8_t BS_jmpBoot[3]; uint8_t BS_OEMName[8]; uint8_t BPB_BytsPerSec[2]; uint8_t BPB_SecPerClus; uint8_t BPB_RsvdSecCnt[2]; uint8_t BPB_NumFATs; uint8_t BPB_RootEntCnt[2]; uint8_t BPB_TotSec16[2]; uint8_t BPB_Media; uint8_t BPB_FATSz16[2]; uint8_t BPB_SecPerTrack[2]; uint8_t BPB_NumHeads[2]; uint8_t BPB_HiddSec[4]; uint8_t BPB_TotSec32[4]; /* FAT12/FAT16 only fields */ uint8_t BS_DrvNum; uint8_t BS_Reserved1; uint8_t BS_BootSig; uint8_t BS_VolID[4]; uint8_t BS_VolLab[11]; uint8_t BS_FilSysType[8]; } FAT_BSBPB; /* 62 bytes */ /* * FAT boot sector and boot parameter block for * FAT32 volumes */ typedef struct fat32_bsbpb { /* common fields */ uint8_t BS_jmpBoot[3]; uint8_t BS_OEMName[8]; uint8_t BPB_BytsPerSec[2]; uint8_t BPB_SecPerClus; uint8_t BPB_RsvdSecCnt[2]; uint8_t BPB_NumFATs; uint8_t BPB_RootEntCnt[2]; uint8_t BPB_TotSec16[2]; uint8_t BPB_Media; uint8_t BPB_FATSz16[2]; uint8_t BPB_SecPerTrack[2]; uint8_t BPB_NumHeads[2]; uint8_t BPB_HiddSec[4]; uint8_t BPB_TotSec32[4]; /* FAT32 only fields */ uint8_t BPB_FATSz32[4]; uint8_t BPB_ExtFlags[2]; uint8_t BPB_FSVer[2]; uint8_t BPB_RootClus[4]; uint8_t BPB_FSInfo[2]; uint8_t BPB_BkBootSec[2]; uint8_t BPB_Reserved[12]; uint8_t BS_DrvNum; uint8_t BS_Reserved1; uint8_t BS_BootSig; uint8_t BS_VolID[4]; uint8_t BS_VolLab[11]; uint8_t BS_FilSysType[8]; } FAT32_BSBPB; /* 90 bytes */ /* * FAT directory entry structure */ #define FAT_DES_ATTR_READ_ONLY 0x01 #define FAT_DES_ATTR_HIDDEN 0x02 #define FAT_DES_ATTR_SYSTEM 0x04 #define FAT_DES_ATTR_VOLUME_ID 0x08 #define FAT_DES_ATTR_DIRECTORY 0x10 #define FAT_DES_ATTR_ARCHIVE 0x20 #define FAT_DES_ATTR_LONG_NAME (FAT_DES_ATTR_READ_ONLY | \ FAT_DES_ATTR_HIDDEN | \ FAT_DES_ATTR_SYSTEM | \ FAT_DES_ATTR_VOLUME_ID) typedef struct fat_des { uint8_t DIR_Name[11]; uint8_t DIR_Attr; uint8_t DIR_NTRes; uint8_t DIR_CrtTimeTenth; uint8_t DIR_CrtTime[2]; uint8_t DIR_CrtDate[2]; uint8_t DIR_LstAccDate[2]; uint8_t DIR_FstClusHI[2]; uint8_t DIR_WrtTime[2]; uint8_t DIR_WrtDate[2]; uint8_t DIR_FstClusLO[2]; uint8_t DIR_FileSize[4]; } FAT_DES; Index: head/sys/geom/label/g_label_ntfs.c =================================================================== --- head/sys/geom/label/g_label_ntfs.c (revision 326269) +++ head/sys/geom/label/g_label_ntfs.c (revision 326270) @@ -1,175 +1,177 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Takanori Watanabe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #define NTFS_A_VOLUMENAME 0x60 #define NTFS_FILEMAGIC ((uint32_t)(0x454C4946)) #define NTFS_VOLUMEINO 3 #define G_LABEL_NTFS_DIR "ntfs" struct ntfs_attr { uint32_t a_type; uint32_t reclen; uint8_t a_flag; uint8_t a_namelen; uint8_t a_nameoff; uint8_t reserved1; uint8_t a_compression; uint8_t reserved2; uint16_t a_index; uint16_t a_datalen; uint16_t reserved3; uint16_t a_dataoff; uint16_t a_indexed; } __packed; struct ntfs_filerec { uint32_t fr_hdrmagic; uint16_t fr_hdrfoff; uint16_t fr_hdrfnum; uint8_t reserved[8]; uint16_t fr_seqnum; uint16_t fr_nlink; uint16_t fr_attroff; uint16_t fr_flags; uint32_t fr_size; uint32_t fr_allocated; uint64_t fr_mainrec; uint16_t fr_attrnum; } __packed; struct ntfs_bootfile { uint8_t reserved1[3]; uint8_t bf_sysid[8]; uint16_t bf_bps; uint8_t bf_spc; uint8_t reserved2[7]; uint8_t bf_media; uint8_t reserved3[2]; uint16_t bf_spt; uint16_t bf_heads; uint8_t reserver4[12]; uint64_t bf_spv; uint64_t bf_mftcn; uint64_t bf_mftmirrcn; int8_t bf_mftrecsz; uint32_t bf_ibsz; uint32_t bf_volsn; } __packed; static void g_label_ntfs_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; struct ntfs_bootfile *bf; struct ntfs_filerec *fr; struct ntfs_attr *atr; off_t voloff; char *filerecp, *ap; int8_t mftrecsz; char vnchar; int recsize, j; g_topology_assert_not(); label[0] = '\0'; pp = cp->provider; filerecp = NULL; bf = (struct ntfs_bootfile *)g_read_data(cp, 0, pp->sectorsize, NULL); if (bf == NULL || strncmp(bf->bf_sysid, "NTFS ", 8) != 0) goto done; mftrecsz = bf->bf_mftrecsz; recsize = (mftrecsz > 0) ? (mftrecsz * bf->bf_bps * bf->bf_spc) : (1 << -mftrecsz); if (recsize == 0 || recsize % pp->sectorsize != 0) goto done; voloff = bf->bf_mftcn * bf->bf_spc * bf->bf_bps + recsize * NTFS_VOLUMEINO; if (voloff % pp->sectorsize != 0) goto done; filerecp = g_read_data(cp, voloff, recsize, NULL); if (filerecp == NULL) goto done; fr = (struct ntfs_filerec *)filerecp; if (fr->fr_hdrmagic != NTFS_FILEMAGIC) goto done; for (ap = filerecp + fr->fr_attroff; atr = (struct ntfs_attr *)ap, atr->a_type != -1; ap += atr->reclen) { if (atr->a_type == NTFS_A_VOLUMENAME) { if(atr->a_datalen >= size *2){ label[0] = 0; goto done; } /* *UNICODE to ASCII. * Should we need to use iconv(9)? 
*/ for (j = 0; j < atr->a_datalen; j++) { vnchar = *(ap + atr->a_dataoff + j); if (j & 1) { if (vnchar) { label[0] = 0; goto done; } } else { label[j / 2] = vnchar; } } label[j / 2] = 0; break; } } done: if (bf != NULL) g_free(bf); if (filerecp != NULL) g_free(filerecp); } struct g_label_desc g_label_ntfs = { .ld_taste = g_label_ntfs_taste, .ld_dir = G_LABEL_NTFS_DIR, .ld_enabled = 1 }; G_LABEL_INIT(ntfs, g_label_ntfs, "Create device nodes for NTFS volumes"); Index: head/sys/geom/label/g_label_reiserfs.c =================================================================== --- head/sys/geom/label/g_label_reiserfs.c (revision 326269) +++ head/sys/geom/label/g_label_reiserfs.c (revision 326270) @@ -1,120 +1,122 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Stanislav Sedov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
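The NTFS name loop above is a deliberately naive UTF-16LE-to-ASCII conversion: even-indexed (low) bytes become the label, and any nonzero odd-indexed (high) byte aborts the whole attempt, so a volume name containing any character above U+00FF yields no label at all. The same logic as a plain function:

/* ntfsname.c -- the taster's naive UTF-16LE-to-ASCII conversion.
 * name holds datalen bytes of UTF-16LE; label must hold at least
 * datalen / 2 + 1 bytes.  Returns 0 on success, -1 on non-ASCII. */
#include <stddef.h>

int
ntfs_name_to_ascii(const unsigned char *name, size_t datalen, char *label)
{
	size_t j;

	for (j = 0; j < datalen; j++) {
		if (j & 1) {
			if (name[j] != 0)
				return (-1);	/* high byte set: give up */
		} else
			label[j / 2] = (char)name[j];
	}
	label[j / 2] = '\0';
	return (0);
}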
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #define REISERFS_NEW_DISK_OFFSET 64 * 1024 #define REISERFS_OLD_DISK_OFFSET 8 * 1024 #define REISERFS_SUPER_MAGIC "ReIsEr" typedef struct reiserfs_sb { uint8_t fake1[52]; char s_magic[10]; uint8_t fake2[10]; uint16_t s_version; uint8_t fake3[26]; char s_volume_name[16]; } reiserfs_sb_t; static reiserfs_sb_t * g_label_reiserfs_read_super(struct g_consumer *cp, off_t offset) { reiserfs_sb_t *fs; u_int secsize; secsize = cp->provider->sectorsize; if ((offset % secsize) != 0) return (NULL); fs = (reiserfs_sb_t *)g_read_data(cp, offset, secsize, NULL); if (fs == NULL) return (NULL); if (strncmp(fs->s_magic, REISERFS_SUPER_MAGIC, strlen(REISERFS_SUPER_MAGIC)) != 0) { g_free(fs); return (NULL); } return (fs); } static void g_label_reiserfs_taste(struct g_consumer *cp, char *label, size_t size) { struct g_provider *pp; reiserfs_sb_t *fs; g_topology_assert_not(); pp = cp->provider; label[0] = '\0'; /* Try old format */ fs = g_label_reiserfs_read_super(cp, REISERFS_OLD_DISK_OFFSET); if (fs == NULL) { /* Try new format */ fs = g_label_reiserfs_read_super(cp, REISERFS_NEW_DISK_OFFSET); } if (fs == NULL) return; /* Check version */ if (fs->s_version == 2) { G_LABEL_DEBUG(1, "reiserfs file system detected on %s.", pp->name); } else { goto exit_free; } /* Check for volume label */ if (fs->s_volume_name[0] == '\0') goto exit_free; /* Terminate label */ fs->s_volume_name[sizeof(fs->s_volume_name) - 1] = '\0'; strlcpy(label, fs->s_volume_name, size); exit_free: g_free(fs); } struct g_label_desc g_label_reiserfs = { .ld_taste = g_label_reiserfs_taste, .ld_dir = "reiserfs", .ld_enabled = 1 }; G_LABEL_INIT(reiserfs, g_label_reiserfs, "Create device nodes for REISERFS volumes"); Index: head/sys/geom/label/g_label_ufs.c =================================================================== --- head/sys/geom/label/g_label_ufs.c (revision 326269) +++ head/sys/geom/label/g_label_ufs.c (revision 326270) @@ -1,186 +1,188 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002, 2003 Gordon Tetlow * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
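The reiserfs taster above probes the superblock at two offsets (8 KiB into the volume for the old on-disk layout, then 64 KiB for the new one) and accepts only s_version 2. The padding in reiserfs_sb_t puts s_magic at byte 52, s_version at byte 72 and the 16-byte s_volume_name at byte 100. A sketch of the per-superblock check, given one sector read at either offset; a little-endian host is assumed:

/* reiserlabel.c -- validate a reiserfs superblock and copy its label.
 * sb points at the raw superblock; label must hold 17 bytes.
 * Returns 1 when a non-empty label was found. */
#include <stdint.h>
#include <string.h>

int
reiser_sb_label(const uint8_t *sb, char *label)
{
	uint16_t version;

	if (memcmp(sb + 52, "ReIsEr", 6) != 0)	/* s_magic */
		return (0);
	memcpy(&version, sb + 72, 2);	/* s_version, little-endian on disk */
	if (version != 2)
		return (0);
	memcpy(label, sb + 100, 16);	/* s_volume_name */
	label[16] = '\0';
	return (label[0] != '\0');
}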
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #define G_LABEL_UFS_VOLUME_DIR "ufs" #define G_LABEL_UFS_ID_DIR "ufsid" #define G_LABEL_UFS_VOLUME 0 #define G_LABEL_UFS_ID 1 /* * G_LABEL_UFS_CMP returns true if difference between provider mediasize * and filesystem size is less than G_LABEL_UFS_MAXDIFF sectors */ #define G_LABEL_UFS_CMP(prov, fsys, size) \ ( abs( ((fsys)->size) - ( (prov)->mediasize / (fsys)->fs_fsize )) \ < G_LABEL_UFS_MAXDIFF ) #define G_LABEL_UFS_MAXDIFF 0x100 static const int superblocks[] = SBLOCKSEARCH; static void g_label_ufs_taste_common(struct g_consumer *cp, char *label, size_t size, int what) { struct g_provider *pp; int sb, superblock; struct fs *fs; g_topology_assert_not(); pp = cp->provider; label[0] = '\0'; if (SBLOCKSIZE % cp->provider->sectorsize != 0) return; /* * Walk through the standard places that superblocks hide and look * for UFS magic. If we find magic, then check that the size in the * superblock corresponds to the size of the underlying provider. * Finally, look for a volume label and create an appropriate * provider based on that. */ for (sb = 0; (superblock = superblocks[sb]) != -1; sb++) { /* * Take care not to issue an invalid I/O request. The offset of * the superblock candidate must be multiples of the provider's * sector size, otherwise an FFS can't exist on the provider * anyway. */ if (superblock % cp->provider->sectorsize != 0) continue; fs = (struct fs *)g_read_data(cp, superblock, SBLOCKSIZE, NULL); if (fs == NULL) continue; /* * Check for magic. We also need to check if file system size * is almost equal to providers size, because sysinstall(8) * used to bogusly put first partition at offset 0 * instead of 16, and glabel/ufs would find file system on slice * instead of partition. * * In addition, media size can be a bit bigger than file system * size. For instance, mkuzip can append bytes to align data * to large sector size (it improves compression rates). */ switch (fs->fs_magic){ case FS_UFS1_MAGIC: case FS_UFS2_MAGIC: G_LABEL_DEBUG(1, "%s %s params: %jd, %d, %d, %jd\n", fs->fs_magic == FS_UFS1_MAGIC ? "UFS1" : "UFS2", pp->name, pp->mediasize, fs->fs_fsize, fs->fs_old_size, fs->fs_providersize); break; default: break; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_fsize > 0 && ( G_LABEL_UFS_CMP(pp, fs, fs_old_size) || G_LABEL_UFS_CMP(pp, fs, fs_providersize))) { /* Valid UFS1. */ } else if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_fsize > 0 && ( G_LABEL_UFS_CMP(pp, fs, fs_size) || G_LABEL_UFS_CMP(pp, fs, fs_providersize))) { /* Valid UFS2. */ } else { g_free(fs); continue; } if (fs->fs_sblockloc != superblock || fs->fs_ncg < 1 || fs->fs_bsize < MINBSIZE || fs->fs_bsize < sizeof(struct fs)) { g_free(fs); continue; } G_LABEL_DEBUG(1, "%s file system detected on %s.", fs->fs_magic == FS_UFS1_MAGIC ? 
"UFS1" : "UFS2", pp->name); switch (what) { case G_LABEL_UFS_VOLUME: /* Check for volume label */ if (fs->fs_volname[0] == '\0') { g_free(fs); continue; } strlcpy(label, fs->fs_volname, size); break; case G_LABEL_UFS_ID: if (fs->fs_id[0] == 0 && fs->fs_id[1] == 0) { g_free(fs); continue; } snprintf(label, size, "%08x%08x", fs->fs_id[0], fs->fs_id[1]); break; } g_free(fs); break; } } static void g_label_ufs_volume_taste(struct g_consumer *cp, char *label, size_t size) { g_label_ufs_taste_common(cp, label, size, G_LABEL_UFS_VOLUME); } static void g_label_ufs_id_taste(struct g_consumer *cp, char *label, size_t size) { g_label_ufs_taste_common(cp, label, size, G_LABEL_UFS_ID); } struct g_label_desc g_label_ufs_volume = { .ld_taste = g_label_ufs_volume_taste, .ld_dir = G_LABEL_UFS_VOLUME_DIR, .ld_enabled = 1 }; struct g_label_desc g_label_ufs_id = { .ld_taste = g_label_ufs_id_taste, .ld_dir = G_LABEL_UFS_ID_DIR, .ld_enabled = 1 }; G_LABEL_INIT(ufsid, g_label_ufs_id, "Create device nodes for UFS file system IDs"); G_LABEL_INIT(ufs, g_label_ufs_volume, "Create device nodes for UFS volume names"); Index: head/sys/geom/linux_lvm/g_linux_lvm.c =================================================================== --- head/sys/geom/linux_lvm/g_linux_lvm.c (revision 326269) +++ head/sys/geom/linux_lvm/g_linux_lvm.c (revision 326270) @@ -1,1190 +1,1192 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Andrew Thompson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_linux_lvm, "GEOM Linux LVM partitioning support"); /* Declare malloc(9) label */ static MALLOC_DEFINE(M_GLLVM, "gllvm", "GEOM_LINUX_LVM Data"); /* GEOM class methods */ static g_access_t g_llvm_access; static g_init_t g_llvm_init; static g_orphan_t g_llvm_orphan; static g_orphan_t g_llvm_taste_orphan; static g_start_t g_llvm_start; static g_taste_t g_llvm_taste; static g_ctl_destroy_geom_t g_llvm_destroy_geom; static void g_llvm_done(struct bio *); static void g_llvm_remove_disk(struct g_llvm_vg *, struct g_consumer *); static int g_llvm_activate_lv(struct g_llvm_vg *, struct g_llvm_lv *); static int g_llvm_add_disk(struct g_llvm_vg *, struct g_provider *, char *); static void g_llvm_free_vg(struct g_llvm_vg *); static int g_llvm_destroy(struct g_llvm_vg *, int); static int g_llvm_read_label(struct g_consumer *, struct g_llvm_label *); static int g_llvm_read_md(struct g_consumer *, struct g_llvm_metadata *, struct g_llvm_label *); static int llvm_label_decode(const u_char *, struct g_llvm_label *, int); static int llvm_md_decode(const u_char *, struct g_llvm_metadata *, struct g_llvm_label *); static int llvm_textconf_decode(u_char *, int, struct g_llvm_metadata *); static int llvm_textconf_decode_pv(char **, char *, struct g_llvm_vg *); static int llvm_textconf_decode_lv(char **, char *, struct g_llvm_vg *); static int llvm_textconf_decode_sg(char **, char *, struct g_llvm_lv *); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, linux_lvm, CTLFLAG_RW, 0, "GEOM_LINUX_LVM stuff"); static u_int g_llvm_debug = 0; SYSCTL_UINT(_kern_geom_linux_lvm, OID_AUTO, debug, CTLFLAG_RWTUN, &g_llvm_debug, 0, "Debug level"); LIST_HEAD(, g_llvm_vg) vg_list; /* * Called to notify geom when it's been opened, and for what intent */ static int g_llvm_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *c; struct g_llvm_vg *vg; struct g_geom *gp; int error; KASSERT(pp != NULL, ("%s: NULL provider", __func__)); gp = pp->geom; KASSERT(gp != NULL, ("%s: NULL geom", __func__)); vg = gp->softc; if (vg == NULL) { /* It seems that .access can be called with negative dr,dw,dx * in this case but I want to check for myself */ G_LLVM_DEBUG(0, "access(%d, %d, %d) for %s", dr, dw, de, pp->name); /* This should only happen when geom is withered so * allow only negative requests */ KASSERT(dr <= 0 && dw <= 0 && de <= 0, ("%s: Positive access for %s", __func__, pp->name)); if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) G_LLVM_DEBUG(0, "Device %s definitely destroyed", pp->name); return (0); } /* Grab an exclusive bit to propagate on our consumers on first open */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... 
drop it on close */ if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) de--; error = ENXIO; LIST_FOREACH(c, &gp->consumer, consumer) { KASSERT(c != NULL, ("%s: consumer is NULL", __func__)); error = g_access(c, dr, dw, de); if (error != 0) { struct g_consumer *c2; /* Back out earlier changes */ LIST_FOREACH(c2, &gp->consumer, consumer) { if (c2 == c) /* all earlier components fixed */ return (error); g_access(c2, -dr, -dw, -de); } } } return (error); } /* * Dismantle bio_queue and destroy its components */ static void bioq_dismantle(struct bio_queue_head *bq) { struct bio *b; for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) { bioq_remove(bq, b); g_destroy_bio(b); } } /* * GEOM .done handler * Can't use standard handler because one requested IO may * fork into additional data IOs */ static void g_llvm_done(struct bio *b) { struct bio *parent_b; parent_b = b->bio_parent; if (b->bio_error != 0) { G_LLVM_DEBUG(0, "Error %d for offset=%ju, length=%ju on %s", b->bio_error, b->bio_offset, b->bio_length, b->bio_to->name); if (parent_b->bio_error == 0) parent_b->bio_error = b->bio_error; } parent_b->bio_inbed++; parent_b->bio_completed += b->bio_completed; if (parent_b->bio_children == parent_b->bio_inbed) { parent_b->bio_completed = parent_b->bio_length; g_io_deliver(parent_b, parent_b->bio_error); } g_destroy_bio(b); } static void g_llvm_start(struct bio *bp) { struct g_provider *pp; struct g_llvm_vg *vg; struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; struct bio *cb; struct bio_queue_head bq; size_t chunk_size; off_t offset, length; char *addr; u_int count; pp = bp->bio_to; lv = pp->private; vg = pp->geom->softc; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: /* XXX BIO_GETATTR allowed? */ break; default: g_io_deliver(bp, EOPNOTSUPP); return; } bioq_init(&bq); chunk_size = vg->vg_extentsize; addr = bp->bio_data; offset = bp->bio_offset; /* virtual offset and length */ length = bp->bio_length; while (length > 0) { size_t chunk_index, in_chunk_offset, in_chunk_length; pv = NULL; cb = g_clone_bio(bp); if (cb == NULL) { bioq_dismantle(&bq); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* get the segment and the pv */ if (lv->lv_sgcount == 1) { /* skip much of the calculations for a single sg */ chunk_index = 0; in_chunk_offset = 0; in_chunk_length = length; sg = lv->lv_firstsg; pv = sg->sg_pv; cb->bio_offset = offset + sg->sg_pvoffset; } else { chunk_index = offset / chunk_size; /* round downwards */ in_chunk_offset = offset % chunk_size; in_chunk_length = min(length, chunk_size - in_chunk_offset); /* XXX could be faster */ LIST_FOREACH(sg, &lv->lv_segs, sg_next) { if (chunk_index >= sg->sg_start && chunk_index <= sg->sg_end) { /* adjust chunk index for sg start */ chunk_index -= sg->sg_start; pv = sg->sg_pv; break; } } cb->bio_offset = (off_t)chunk_index * (off_t)chunk_size + in_chunk_offset + sg->sg_pvoffset; } KASSERT(pv != NULL, ("Can't find PV for chunk %zu", chunk_index)); cb->bio_to = pv->pv_gprov; cb->bio_done = g_llvm_done; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = pv; bioq_disksort(&bq, cb); G_LLVM_DEBUG(5, "Mapped %s(%ju, %ju) on %s to %zu(%zu,%zu) @ %s:%ju", bp->bio_cmd == BIO_READ ?
"R" : "W", offset, length, lv->lv_name, chunk_index, in_chunk_offset, in_chunk_length, pv->pv_name, cb->bio_offset); addr += in_chunk_length; length -= in_chunk_length; offset += in_chunk_length; } /* Fire off bio's here */ count = 0; for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) { bioq_remove(&bq, cb); pv = cb->bio_caller1; cb->bio_caller1 = NULL; G_LLVM_DEBUG(6, "firing bio to %s, offset=%ju, length=%ju", cb->bio_to->name, cb->bio_offset, cb->bio_length); g_io_request(cb, pv->pv_gcons); count++; } if (count == 0) { /* We handled everything locally */ bp->bio_completed = bp->bio_length; g_io_deliver(bp, 0); } } static void g_llvm_remove_disk(struct g_llvm_vg *vg, struct g_consumer *cp) { struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; int found; KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__)); pv = (struct g_llvm_pv *)cp->private; G_LLVM_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, pv->pv_name); LIST_FOREACH(lv, &vg->vg_lvs, lv_next) { /* Find segments that map to this disk */ found = 0; LIST_FOREACH(sg, &lv->lv_segs, sg_next) { if (sg->sg_pv == pv) { sg->sg_pv = NULL; lv->lv_sgactive--; found = 1; break; } } if (found) { G_LLVM_DEBUG(0, "Device %s removed.", lv->lv_gprov->name); g_wither_provider(lv->lv_gprov, ENXIO); lv->lv_gprov = NULL; } } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); } static void g_llvm_orphan(struct g_consumer *cp) { struct g_llvm_vg *vg; struct g_geom *gp; g_topology_assert(); gp = cp->geom; vg = gp->softc; if (vg == NULL) return; g_llvm_remove_disk(vg, cp); g_llvm_destroy(vg, 1); } static int g_llvm_activate_lv(struct g_llvm_vg *vg, struct g_llvm_lv *lv) { struct g_geom *gp; struct g_provider *pp; g_topology_assert(); KASSERT(lv->lv_sgactive == lv->lv_sgcount, ("segment missing")); gp = vg->vg_geom; pp = g_new_providerf(gp, "linux_lvm/%s-%s", vg->vg_name, lv->lv_name); pp->mediasize = vg->vg_extentsize * (off_t)lv->lv_extentcount; pp->sectorsize = vg->vg_sectorsize; g_error_provider(pp, 0); lv->lv_gprov = pp; pp->private = lv; G_LLVM_DEBUG(1, "Created %s, %juM", pp->name, pp->mediasize / (1024*1024)); return (0); } static int g_llvm_add_disk(struct g_llvm_vg *vg, struct g_provider *pp, char *uuid) { struct g_geom *gp; struct g_consumer *cp, *fcp; struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; int error; g_topology_assert(); LIST_FOREACH(pv, &vg->vg_pvs, pv_next) { if (strcmp(pv->pv_uuid, uuid) == 0) break; /* found it */ } if (pv == NULL) { G_LLVM_DEBUG(3, "uuid %s not found in pv list", uuid); return (ENOENT); } if (pv->pv_gprov != NULL) { G_LLVM_DEBUG(0, "disk %s already initialised in %s", pv->pv_name, vg->vg_name); return (EEXIST); } pv->pv_start *= vg->vg_sectorsize; gp = vg->vg_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); G_LLVM_DEBUG(1, "Attached %s to %s at offset %ju", pp->name, pv->pv_name, pv->pv_start); if (error != 0) { G_LLVM_DEBUG(0, "cannot attach %s to %s", pp->name, vg->vg_name); g_destroy_consumer(cp); return (error); } if (fcp != NULL) { if (fcp->provider->sectorsize != pp->sectorsize) { G_LLVM_DEBUG(0, "Provider %s of %s has invalid " "sector size (%d)", pp->name, vg->vg_name, pp->sectorsize); return (EINVAL); } if (fcp->acr > 0 || fcp->acw || fcp->ace > 0) { /* Replicate access permissions from first "live" * consumer to the new one */ error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); 
g_destroy_consumer(cp); return (error); } } } cp->private = pv; pv->pv_gcons = cp; pv->pv_gprov = pp; LIST_FOREACH(lv, &vg->vg_lvs, lv_next) { /* Find segments that map to this disk */ LIST_FOREACH(sg, &lv->lv_segs, sg_next) { if (strcmp(sg->sg_pvname, pv->pv_name) == 0) { /* activate the segment */ KASSERT(sg->sg_pv == NULL, ("segment already mapped")); sg->sg_pvoffset = (off_t)sg->sg_pvstart * vg->vg_extentsize + pv->pv_start; sg->sg_pv = pv; lv->lv_sgactive++; G_LLVM_DEBUG(2, "%s: %d to %d @ %s:%d" " offset %ju sector %ju", lv->lv_name, sg->sg_start, sg->sg_end, sg->sg_pvname, sg->sg_pvstart, sg->sg_pvoffset, sg->sg_pvoffset / vg->vg_sectorsize); } } /* Activate any lvs waiting on this disk */ if (lv->lv_gprov == NULL && lv->lv_sgactive == lv->lv_sgcount) { error = g_llvm_activate_lv(vg, lv); if (error) break; } } return (error); } static void g_llvm_init(struct g_class *mp) { LIST_INIT(&vg_list); } static void g_llvm_free_vg(struct g_llvm_vg *vg) { struct g_llvm_pv *pv; struct g_llvm_lv *lv; struct g_llvm_segment *sg; /* Free all the structures */ while ((pv = LIST_FIRST(&vg->vg_pvs)) != NULL) { LIST_REMOVE(pv, pv_next); free(pv, M_GLLVM); } while ((lv = LIST_FIRST(&vg->vg_lvs)) != NULL) { while ((sg = LIST_FIRST(&lv->lv_segs)) != NULL) { LIST_REMOVE(sg, sg_next); free(sg, M_GLLVM); } LIST_REMOVE(lv, lv_next); free(lv, M_GLLVM); } LIST_REMOVE(vg, vg_next); free(vg, M_GLLVM); } static void g_llvm_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_llvm_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_llvm_label ll; struct g_llvm_metadata md; struct g_llvm_vg *vg; int error; bzero(&md, sizeof(md)); g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); gp = g_new_geomf(mp, "linux_lvm:taste"); /* This orphan function should never be called.
*/ gp->orphan = g_llvm_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_llvm_read_label(cp, &ll); if (!error) error = g_llvm_read_md(cp, &md, &ll); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); vg = md.md_vg; if (vg->vg_geom == NULL) { /* new volume group */ gp = g_new_geomf(mp, "%s", vg->vg_name); gp->start = g_llvm_start; gp->spoiled = g_llvm_orphan; gp->orphan = g_llvm_orphan; gp->access = g_llvm_access; vg->vg_sectorsize = pp->sectorsize; vg->vg_extentsize *= vg->vg_sectorsize; vg->vg_geom = gp; gp->softc = vg; G_LLVM_DEBUG(1, "Created volume %s, extent size %zuK", vg->vg_name, vg->vg_extentsize / 1024); } /* initialise this disk in the volume group */ g_llvm_add_disk(vg, pp, ll.ll_uuid); return (vg->vg_geom); } static int g_llvm_destroy(struct g_llvm_vg *vg, int force) { struct g_provider *pp; struct g_geom *gp; g_topology_assert(); if (vg == NULL) return (ENXIO); gp = vg->vg_geom; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { G_LLVM_DEBUG(1, "Device %s is still open (r%dw%de%d)", pp->name, pp->acr, pp->acw, pp->ace); if (!force) return (EBUSY); } } g_llvm_free_vg(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static int g_llvm_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_llvm_vg *vg; vg = gp->softc; return (g_llvm_destroy(vg, 0)); } int g_llvm_read_label(struct g_consumer *cp, struct g_llvm_label *ll) { struct g_provider *pp; u_char *buf; int i, error = 0; g_topology_assert(); /* The LVM label is stored on the first four sectors */ error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, 0, pp->sectorsize * 4, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_LLVM_DEBUG(1, "Cannot read metadata from %s (error=%d)", pp->name, error); return (error); } /* Search the four sectors for the LVM label. 
*/ for (i = 0; i < 4; i++) { error = llvm_label_decode(&buf[i * pp->sectorsize], ll, i); if (error == 0) break; /* found it */ } g_free(buf); return (error); } int g_llvm_read_md(struct g_consumer *cp, struct g_llvm_metadata *md, struct g_llvm_label *ll) { struct g_provider *pp; u_char *buf; int error; int size; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, ll->ll_md_offset, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_LLVM_DEBUG(0, "Cannot read metadata from %s (error=%d)", cp->provider->name, error); return (error); } error = llvm_md_decode(buf, md, ll); g_free(buf); if (error != 0) { return (error); } G_LLVM_DEBUG(1, "reading LVM2 config @ %s:%ju", pp->name, ll->ll_md_offset + md->md_reloffset); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* round up to the nearest sector */ size = md->md_relsize + (pp->sectorsize - md->md_relsize % pp->sectorsize); buf = g_read_data(cp, ll->ll_md_offset + md->md_reloffset, size, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_LLVM_DEBUG(0, "Cannot read LVM2 config from %s (error=%d)", pp->name, error); return (error); } buf[md->md_relsize] = '\0'; G_LLVM_DEBUG(10, "LVM config:\n%s\n", buf); error = llvm_textconf_decode(buf, md->md_relsize, md); g_free(buf); return (error); } static int llvm_label_decode(const u_char *data, struct g_llvm_label *ll, int sector) { uint64_t off; char *uuid; /* Magic string */ if (bcmp("LABELONE", data , 8) != 0) return (EINVAL); /* We only support LVM2 text format */ if (bcmp("LVM2 001", data + 24, 8) != 0) { G_LLVM_DEBUG(0, "Unsupported LVM format"); return (EINVAL); } ll->ll_sector = le64dec(data + 8); ll->ll_crc = le32dec(data + 16); ll->ll_offset = le32dec(data + 20); if (ll->ll_sector != sector) { G_LLVM_DEBUG(0, "Expected sector %ju, found at %d", ll->ll_sector, sector); return (EINVAL); } off = ll->ll_offset; /* * convert the binary uuid to string format, the format is * xxxxxx-xxxx-xxxx-xxxx-xxxx-xxxx-xxxxxx (6-4-4-4-4-4-6) */ uuid = ll->ll_uuid; bcopy(data + off, uuid, 6); off += 6; uuid += 6; *uuid++ = '-'; for (int i = 0; i < 5; i++) { bcopy(data + off, uuid, 4); off += 4; uuid += 4; *uuid++ = '-'; } bcopy(data + off, uuid, 6); off += 6; uuid += 6; *uuid++ = '\0'; ll->ll_size = le64dec(data + off); off += 8; ll->ll_pestart = le64dec(data + off); off += 16; /* Only one data section is supported */ if (le64dec(data + off) != 0) { G_LLVM_DEBUG(0, "Only one data section supported"); return (EINVAL); } off += 16; ll->ll_md_offset = le64dec(data + off); off += 8; ll->ll_md_size = le64dec(data + off); off += 8; G_LLVM_DEBUG(1, "LVM metadata: offset=%ju, size=%ju", ll->ll_md_offset, ll->ll_md_size); /* Only one data section is supported */ if (le64dec(data + off) != 0) { G_LLVM_DEBUG(0, "Only one metadata section supported"); return (EINVAL); } G_LLVM_DEBUG(2, "label uuid=%s", ll->ll_uuid); G_LLVM_DEBUG(2, "sector=%ju, crc=%u, offset=%u, size=%ju, pestart=%ju", ll->ll_sector, ll->ll_crc, ll->ll_offset, ll->ll_size, ll->ll_pestart); return (0); } static int llvm_md_decode(const u_char *data, struct g_llvm_metadata *md, struct g_llvm_label *ll) { uint64_t off; char magic[16]; off = 0; md->md_csum = le32dec(data + off); off += 4; bcopy(data + off, magic, 16); off += 16; md->md_version = le32dec(data + off); off += 4; md->md_start = le64dec(data + off); off += 8; md->md_size = le64dec(data + off); 
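The uuid handling in llvm_label_decode() above expands 32 raw bytes into the dashed 6-4-4-4-4-4-6 form, 38 characters plus the terminating NUL, which is why G_LLVM_UUIDLEN is 40. The same expansion as a table-driven sketch:

/* lvm_uuid.c -- expand an on-disk LVM uuid into its dashed form.
 * raw holds 32 bytes; out must hold at least 39. */
#include <string.h>

void
lvm_uuid_str(const char *raw, char *out)
{
	static const int group[7] = { 6, 4, 4, 4, 4, 4, 6 };
	int g, off = 0;

	for (g = 0; g < 7; g++) {
		memcpy(out, raw + off, group[g]);
		out += group[g];
		off += group[g];
		if (g < 6)
			*out++ = '-';
	}
	*out = '\0';
}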
off += 8; if (bcmp(G_LLVM_MAGIC, magic, 16) != 0) { G_LLVM_DEBUG(0, "Incorrect md magic number"); return (EINVAL); } if (md->md_version != 1) { G_LLVM_DEBUG(0, "Incorrect md version number (%u)", md->md_version); return (EINVAL); } if (md->md_start != ll->ll_md_offset) { G_LLVM_DEBUG(0, "Incorrect md offset (%ju)", md->md_start); return (EINVAL); } /* Apparently only one is ever returned */ md->md_reloffset = le64dec(data + off); off += 8; md->md_relsize = le64dec(data + off); off += 16; /* XXX skipped checksum */ if (le64dec(data + off) != 0) { G_LLVM_DEBUG(0, "Only one reloc supported"); return (EINVAL); } G_LLVM_DEBUG(3, "reloc: offset=%ju, size=%ju", md->md_reloffset, md->md_relsize); G_LLVM_DEBUG(3, "md: version=%u, start=%ju, size=%ju", md->md_version, md->md_start, md->md_size); return (0); } #define GRAB_INT(key, tok1, tok2, v) \ if (tok1 && tok2 && strncmp(tok1, key, sizeof(key)) == 0) { \ v = strtol(tok2, &tok1, 10); \ if (tok1 == tok2) \ /* strtol did not eat any of the buffer */ \ goto bad; \ continue; \ } #define GRAB_STR(key, tok1, tok2, v, len) \ if (tok1 && tok2 && strncmp(tok1, key, sizeof(key)) == 0) { \ strsep(&tok2, "\""); \ if (tok2 == NULL) \ continue; \ tok1 = strsep(&tok2, "\""); \ if (tok2 == NULL) \ continue; \ strncpy(v, tok1, len); \ continue; \ } #define SPLIT(key, value, str) \ key = strsep(&value, str); \ /* strip trailing whitespace on the key */ \ for (char *t = key; *t != '\0'; t++) \ if (isspace(*t)) { \ *t = '\0'; \ break; \ } static size_t llvm_grab_name(char *name, const char *tok) { size_t len; len = 0; if (tok == NULL) return (0); if (tok[0] == '-') return (0); if (strcmp(tok, ".") == 0 || strcmp(tok, "..") == 0) return (0); while (tok[len] && (isalpha(tok[len]) || isdigit(tok[len]) || tok[len] == '.' || tok[len] == '_' || tok[len] == '-' || tok[len] == '+') && len < G_LLVM_NAMELEN - 1) len++; bcopy(tok, name, len); name[len] = '\0'; return (len); } static int llvm_textconf_decode(u_char *data, int buflen, struct g_llvm_metadata *md) { struct g_llvm_vg *vg; char *buf = data; char *tok, *v; char name[G_LLVM_NAMELEN]; char uuid[G_LLVM_UUIDLEN]; size_t len; if (buf == NULL || *buf == '\0') return (EINVAL); tok = strsep(&buf, "\n"); if (tok == NULL) return (EINVAL); len = llvm_grab_name(name, tok); if (len == 0) return (EINVAL); /* check to see if the vg has already been loaded off another disk */ LIST_FOREACH(vg, &vg_list, vg_next) { if (strcmp(vg->vg_name, name) == 0) { uuid[0] = '\0'; /* grab the volume group uuid */ while ((tok = strsep(&buf, "\n")) != NULL) { if (strstr(tok, "{")) break; if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, uuid, sizeof(uuid)); } } if (strcmp(vg->vg_uuid, uuid) == 0) { /* existing vg */ md->md_vg = vg; return (0); } /* XXX different volume group with name clash!
*/ G_LLVM_DEBUG(0, "%s already exists, volume group not loaded", name); return (EINVAL); } } vg = malloc(sizeof(*vg), M_GLLVM, M_NOWAIT|M_ZERO); if (vg == NULL) return (ENOMEM); strncpy(vg->vg_name, name, sizeof(vg->vg_name)); LIST_INIT(&vg->vg_pvs); LIST_INIT(&vg->vg_lvs); #define VOL_FOREACH(func, tok, buf, p) \ while ((tok = strsep(buf, "\n")) != NULL) { \ if (strstr(tok, "{")) { \ func(buf, tok, p); \ continue; \ } \ if (strstr(tok, "}")) \ break; \ } while ((tok = strsep(&buf, "\n")) != NULL) { if (strcmp(tok, "physical_volumes {") == 0) { VOL_FOREACH(llvm_textconf_decode_pv, tok, &buf, vg); continue; } if (strcmp(tok, "logical_volumes {") == 0) { VOL_FOREACH(llvm_textconf_decode_lv, tok, &buf, vg); continue; } if (strstr(tok, "{")) { G_LLVM_DEBUG(2, "unknown section %s", tok); continue; } /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, vg->vg_uuid, sizeof(vg->vg_uuid)); GRAB_INT("extent_size", v, tok, vg->vg_extentsize); continue; } } /* basic checking */ if (vg->vg_extentsize == 0) goto bad; md->md_vg = vg; LIST_INSERT_HEAD(&vg_list, vg, vg_next); G_LLVM_DEBUG(3, "vg: name=%s uuid=%s", vg->vg_name, vg->vg_uuid); return(0); bad: g_llvm_free_vg(vg); return (-1); } #undef VOL_FOREACH static int llvm_textconf_decode_pv(char **buf, char *tok, struct g_llvm_vg *vg) { struct g_llvm_pv *pv; char *v; size_t len; if (*buf == NULL || **buf == '\0') return (EINVAL); pv = malloc(sizeof(*pv), M_GLLVM, M_NOWAIT|M_ZERO); if (pv == NULL) return (ENOMEM); pv->pv_vg = vg; len = 0; if (tok == NULL) goto bad; len = llvm_grab_name(pv->pv_name, tok); if (len == 0) goto bad; while ((tok = strsep(buf, "\n")) != NULL) { if (strstr(tok, "{")) goto bad; if (strstr(tok, "}")) break; /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, pv->pv_uuid, sizeof(pv->pv_uuid)); GRAB_INT("pe_start", v, tok, pv->pv_start); GRAB_INT("pe_count", v, tok, pv->pv_count); continue; } } if (tok == NULL) goto bad; /* basic checking */ if (pv->pv_count == 0) goto bad; LIST_INSERT_HEAD(&vg->vg_pvs, pv, pv_next); G_LLVM_DEBUG(3, "pv: name=%s uuid=%s", pv->pv_name, pv->pv_uuid); return (0); bad: free(pv, M_GLLVM); return (-1); } static int llvm_textconf_decode_lv(char **buf, char *tok, struct g_llvm_vg *vg) { struct g_llvm_lv *lv; struct g_llvm_segment *sg; char *v; size_t len; if (*buf == NULL || **buf == '\0') return (EINVAL); lv = malloc(sizeof(*lv), M_GLLVM, M_NOWAIT|M_ZERO); if (lv == NULL) return (ENOMEM); lv->lv_vg = vg; LIST_INIT(&lv->lv_segs); if (tok == NULL) goto bad; len = llvm_grab_name(lv->lv_name, tok); if (len == 0) goto bad; while ((tok = strsep(buf, "\n")) != NULL) { if (strstr(tok, "{")) { if (strstr(tok, "segment")) { llvm_textconf_decode_sg(buf, tok, lv); continue; } else /* unexpected section */ goto bad; } if (strstr(tok, "}")) break; /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_STR("id", v, tok, lv->lv_uuid, sizeof(lv->lv_uuid)); GRAB_INT("segment_count", v, tok, lv->lv_sgcount); continue; } } if (tok == NULL) goto bad; if (lv->lv_sgcount == 0 || lv->lv_sgcount != lv->lv_numsegs) /* zero or incomplete segment list */ goto bad; /* Optimize for only one segment on the pv */ lv->lv_firstsg = LIST_FIRST(&lv->lv_segs); LIST_INSERT_HEAD(&vg->vg_lvs, lv, lv_next); G_LLVM_DEBUG(3, "lv: name=%s uuid=%s", lv->lv_name, lv->lv_uuid); return (0); bad: while ((sg = LIST_FIRST(&lv->lv_segs)) != NULL) { LIST_REMOVE(sg, sg_next); free(sg, M_GLLVM); } free(lv, M_GLLVM); return (-1); } static int 
llvm_textconf_decode_sg(char **buf, char *tok, struct g_llvm_lv *lv) { struct g_llvm_segment *sg; char *v; int count = 0; if (*buf == NULL || **buf == '\0') return (EINVAL); sg = malloc(sizeof(*sg), M_GLLVM, M_NOWAIT|M_ZERO); if (sg == NULL) return (ENOMEM); while ((tok = strsep(buf, "\n")) != NULL) { /* only a single linear stripe is supported */ if (strstr(tok, "stripe_count")) { SPLIT(v, tok, "="); GRAB_INT("stripe_count", v, tok, count); if (count != 1) goto bad; } if (strstr(tok, "{")) goto bad; if (strstr(tok, "}")) break; if (strcmp(tok, "stripes = [") == 0) { tok = strsep(buf, "\n"); if (tok == NULL) goto bad; strsep(&tok, "\""); if (tok == NULL) goto bad; /* missing open quotes */ v = strsep(&tok, "\""); if (tok == NULL) goto bad; /* missing close quotes */ strncpy(sg->sg_pvname, v, sizeof(sg->sg_pvname)); if (*tok != ',') goto bad; /* missing comma for stripe */ tok++; sg->sg_pvstart = strtol(tok, &v, 10); if (v == tok) /* strtol did not eat any of the buffer */ goto bad; continue; } /* parse 'key = value' lines */ if (strstr(tok, "=")) { SPLIT(v, tok, "="); GRAB_INT("start_extent", v, tok, sg->sg_start); GRAB_INT("extent_count", v, tok, sg->sg_count); continue; } } if (tok == NULL) goto bad; /* basic checking */ if (count != 1 || sg->sg_count == 0) goto bad; sg->sg_end = sg->sg_start + sg->sg_count - 1; lv->lv_numsegs++; lv->lv_extentcount += sg->sg_count; LIST_INSERT_HEAD(&lv->lv_segs, sg, sg_next); return (0); bad: free(sg, M_GLLVM); return (-1); } #undef GRAB_INT #undef GRAB_STR #undef SPLIT static struct g_class g_llvm_class = { .name = G_LLVM_CLASS_NAME, .version = G_VERSION, .init = g_llvm_init, .taste = g_llvm_taste, .destroy_geom = g_llvm_destroy_geom }; DECLARE_GEOM_CLASS(g_llvm_class, g_linux_lvm); Index: head/sys/geom/linux_lvm/g_linux_lvm.h =================================================================== --- head/sys/geom/linux_lvm/g_linux_lvm.h (revision 326269) +++ head/sys/geom/linux_lvm/g_linux_lvm.h (revision 326270) @@ -1,113 +1,115 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Andrew Thompson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #define G_LLVM_DEBUG(lvl, ...) 
do { \ if (g_llvm_debug >= (lvl)) { \ printf("GEOM_LINUX_LVM"); \ if (g_llvm_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_LLVM_CLASS_NAME "LINUX_LVM" #define G_LLVM_NAMELEN 128 #define G_LLVM_UUIDLEN 40 #define G_LLVM_MAGIC "\040\114\126\115\062\040\170\133" \ "\065\101\045\162\060\116\052\076" struct g_llvm_label { uint64_t ll_sector; uint32_t ll_crc; uint32_t ll_offset; char ll_uuid[G_LLVM_UUIDLEN]; uint64_t ll_size; uint64_t ll_pestart; uint64_t ll_md_offset; uint64_t ll_md_size; }; struct g_llvm_metadata { uint32_t md_csum; uint32_t md_version; uint64_t md_start; uint64_t md_size; uint64_t md_reloffset; uint64_t md_relsize; struct g_llvm_vg *md_vg; }; struct g_llvm_lv { LIST_ENTRY(g_llvm_lv) lv_next; struct g_llvm_vg *lv_vg; char lv_name[G_LLVM_NAMELEN]; char lv_uuid[G_LLVM_UUIDLEN]; int lv_sgcount; int lv_sgactive; struct g_provider *lv_gprov; int lv_extentcount; LIST_HEAD(, g_llvm_segment) lv_segs; int lv_numsegs; struct g_llvm_segment *lv_firstsg; }; struct g_llvm_pv { LIST_ENTRY(g_llvm_pv) pv_next; struct g_llvm_vg *pv_vg; char pv_name[G_LLVM_NAMELEN]; char pv_uuid[G_LLVM_UUIDLEN]; size_t pv_size; off_t pv_start; int pv_count; struct g_provider *pv_gprov; struct g_consumer *pv_gcons; }; struct g_llvm_segment { LIST_ENTRY(g_llvm_segment) sg_next; int sg_start; int sg_end; int sg_count; char sg_pvname[G_LLVM_NAMELEN]; struct g_llvm_pv *sg_pv; int sg_pvstart; off_t sg_pvoffset; }; struct g_llvm_vg { LIST_ENTRY(g_llvm_vg) vg_next; char vg_name[G_LLVM_NAMELEN]; char vg_uuid[G_LLVM_UUIDLEN]; size_t vg_extentsize; int vg_sectorsize; struct g_geom *vg_geom; LIST_HEAD(, g_llvm_pv) vg_pvs; LIST_HEAD(, g_llvm_lv) vg_lvs; }; Index: head/sys/geom/mirror/g_mirror.c =================================================================== --- head/sys/geom/mirror/g_mirror.c (revision 326269) +++ head/sys/geom/mirror/g_mirror.c (revision 326270) @@ -1,3437 +1,3439 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
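The G_LLVM_DEBUG() macro above gates every message on the kern.geom.linux_lvm.debug sysctl and prefixes it with the class name and, when the debug knob is nonzero, the message's level. A userspace analog showing the expansion; debug_level stands in for the sysctl:

/* debugmac.c -- standalone analog of G_LLVM_DEBUG(). */
#include <stdint.h>
#include <stdio.h>

static unsigned int debug_level = 2;	/* stand-in for the sysctl knob */

#define DEBUG(lvl, ...) do {						\
	if (debug_level >= (lvl)) {					\
		printf("GEOM_LINUX_LVM");				\
		if (debug_level > 0)					\
			printf("[%u]", (lvl));				\
		printf(": ");						\
		printf(__VA_ARGS__);					\
		printf("\n");						\
	}								\
} while (0)

int
main(void)
{
	DEBUG(1, "Created %s, %juM", "linux_lvm/vg0-root", (uintmax_t)1024);
	return (0);
}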
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_mirror, "GEOM mirroring support"); static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0, "GEOM_MIRROR stuff"); int g_mirror_debug = 0; SYSCTL_INT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0, "Debug level"); static u_int g_mirror_timeout = 4; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout, 0, "Time to wait on all mirror components"); static u_int g_mirror_idletime = 5; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_mirror_idletime, 0, "Mark components as clean when idling"); static u_int g_mirror_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_mirror_syncreqs = 2; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests."); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_mirror_post_sync = NULL; static int g_mirror_shutdown = 0; static g_ctl_destroy_geom_t g_mirror_destroy_geom; static g_taste_t g_mirror_taste; static g_init_t g_mirror_init; static g_fini_t g_mirror_fini; static g_provgone_t g_mirror_providergone; static g_resize_t g_mirror_resize; struct g_class g_mirror_class = { .name = G_MIRROR_CLASS_NAME, .version = G_VERSION, .ctlreq = g_mirror_config, .taste = g_mirror_taste, .destroy_geom = g_mirror_destroy_geom, .init = g_mirror_init, .fini = g_mirror_fini, .providergone = g_mirror_providergone, .resize = g_mirror_resize }; static void g_mirror_destroy_provider(struct g_mirror_softc *sc); static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state); static void g_mirror_update_device(struct g_mirror_softc *sc, bool force); static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type); static void g_mirror_register_request(struct bio *bp); static void g_mirror_sync_release(struct g_mirror_softc *sc); static const char * g_mirror_disk_state2str(int state) { switch (state) { case G_MIRROR_DISK_STATE_NONE: return ("NONE"); case G_MIRROR_DISK_STATE_NEW: return ("NEW"); case G_MIRROR_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_MIRROR_DISK_STATE_STALE: return ("STALE"); case G_MIRROR_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_MIRROR_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); case G_MIRROR_DISK_STATE_DESTROY: return ("DESTROY"); default: return ("INVALID"); } } static const char * g_mirror_device_state2str(int state) { switch (state) { case G_MIRROR_DEVICE_STATE_STARTING: return ("STARTING"); case G_MIRROR_DEVICE_STATE_RUNNING: return ("RUNNING"); default: return ("INVALID"); } } static const char * g_mirror_get_diskname(struct g_mirror_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } /* * --- 
Events handling functions --- * Events in geom_mirror are used to maintain disks and device status * from one thread to simplify locking. */ static void g_mirror_event_free(struct g_mirror_event *ep) { free(ep, M_MIRROR); } int g_mirror_event_send(void *arg, int state, int flags) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_event *ep; int error; ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK); G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_MIRROR_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_mirror_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_mirror_event * g_mirror_event_get(struct g_mirror_softc *sc) { struct g_mirror_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_mirror_event_cancel(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state) { struct g_mirror_disk *disk; u_int n = 0; sx_assert(&sc->sc_lock, SX_LOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (state == -1 || disk->d_state == state) n++; } return (n); } /* * Find a disk in mirror by its disk ID. 
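The event machinery above funnels every disk and device state change through a single worker thread: a sender allocates a g_mirror_event, queues it under sc_events_mtx, wakes the worker, and, unless G_MIRROR_EVENT_DONTWAIT is set, sleeps until the worker flags the event DONE and fills in e_error. A rough userspace analogue of that handoff, sketched with pthreads (all names here are invented for illustration):

#include <pthread.h>
#include <stdio.h>
#include <sys/queue.h>

struct event {
	TAILQ_ENTRY(event) e_next;
	int e_state;
	int e_error;
	int e_done;
};

static TAILQ_HEAD(, event) events = TAILQ_HEAD_INITIALIZER(events);
static pthread_mutex_t evlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t evcond = PTHREAD_COND_INITIALIZER;

/* Worker: dequeue one event, "handle" it, mark it done, wake the sender. */
static void *
worker(void *arg)
{
	struct event *ep;

	(void)arg;
	pthread_mutex_lock(&evlock);
	while (TAILQ_EMPTY(&events))
		pthread_cond_wait(&evcond, &evlock);
	ep = TAILQ_FIRST(&events);
	TAILQ_REMOVE(&events, ep, e_next);
	ep->e_error = 0;		/* result of handling the state change */
	ep->e_done = 1;
	pthread_cond_broadcast(&evcond);
	pthread_mutex_unlock(&evlock);
	return (NULL);
}

/* Sender: queue an event and sleep until the worker completes it. */
static int
event_send(int state)
{
	struct event ev = { .e_state = state };
	int error;

	pthread_mutex_lock(&evlock);
	TAILQ_INSERT_TAIL(&events, &ev, e_next);
	pthread_cond_broadcast(&evcond);
	while (!ev.e_done)
		pthread_cond_wait(&evcond, &evlock);
	error = ev.e_error;
	pthread_mutex_unlock(&evlock);
	return (error);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, worker, NULL);
	printf("event handled, error=%d\n", event_send(1));
	pthread_join(tid, NULL);
	return (0);
}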
*/ static struct g_mirror_disk * g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id) { struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_id == id) return (disk); } return (NULL); } static u_int g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_mirror_nrequests(sc, cp) > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_mirror_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_mirror_is_busy(sc, cp)) return; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After the retaste event has been sent (inside g_access()), we can * post an event to detach and destroy the consumer. * A class that already has a consumer attached to the given provider * will not receive a retaste event for that provider. * This is how retaste events are ignored when consumers opened for * write are closed: the consumer is detached and destroyed only after * the retaste event is sent. */ g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL); return; } G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); cp->flags |= G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } g_topology_unlock(); disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk)); return (0); } static void g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_mirror_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize a disk: allocate memory, create a consumer, attach it * to the provider, and open access (r1w1e1) to it.
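The r1w1e1 open in g_mirror_connect_disk() takes one read, one write, and one exclusive reference on the component, so nothing else can write to it behind the mirror's back; the counts are dropped symmetrically when the consumer is killed. A toy model of that relative access counting (the real g_access() enforces far more; this shows only the delta bookkeeping):

#include <stdio.h>

struct consumer {
	int acr, acw, ace;	/* current read/write/exclusive counts */
};

/* Apply a relative access change, refusing any that would go negative. */
static int
access_change(struct consumer *cp, int dr, int dw, int de)
{
	if (cp->acr + dr < 0 || cp->acw + dw < 0 || cp->ace + de < 0)
		return (-1);
	cp->acr += dr;
	cp->acw += dw;
	cp->ace += de;
	return (0);
}

int
main(void)
{
	struct consumer cp = { 0, 0, 0 };

	access_change(&cp, 1, 1, 1);	/* open a component r1w1e1 */
	printf("open: r%dw%de%d\n", cp.acr, cp.acw, cp.ace);
	access_change(&cp, -1, -1, -1);	/* release it all on close */
	printf("closed: r%dw%de%d\n", cp.acr, cp.acw, cp.ace);
	return (0);
}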
*/ static struct g_mirror_disk * g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md, int *errorp) { struct g_mirror_disk *disk; int i, error; disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO); if (disk == NULL) { error = ENOMEM; goto fail; } disk->d_softc = sc; error = g_mirror_connect_disk(disk, pp); if (error != 0) goto fail; disk->d_id = md->md_did; disk->d_state = G_MIRROR_DISK_STATE_NONE; disk->d_priority = md->md_priority; disk->d_flags = md->md_dflags; error = g_getattr("GEOM::candelete", disk->d_consumer, &i); if (error == 0 && i != 0) disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE; if (md->md_provider[0] != '\0') disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); fail: if (errorp != NULL) *errorp = error; if (disk != NULL) free(disk, M_MIRROR); return (NULL); } static void g_mirror_destroy_disk(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_REMOVE(disk, d_next); g_mirror_event_cancel(disk); if (sc->sc_hint == disk) sc->sc_hint = NULL; switch (disk->d_state) { case G_MIRROR_DISK_STATE_SYNCHRONIZING: g_mirror_sync_stop(disk, 1); /* FALLTHROUGH */ case G_MIRROR_DISK_STATE_NEW: case G_MIRROR_DISK_STATE_STALE: case G_MIRROR_DISK_STATE_ACTIVE: g_topology_lock(); g_mirror_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); free(disk, M_MIRROR); break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } } static void g_mirror_free_device(struct g_mirror_softc *sc) { mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_done_mtx); sx_destroy(&sc->sc_lock); free(sc, M_MIRROR); } static void g_mirror_providergone(struct g_provider *pp) { struct g_mirror_softc *sc = pp->private; if ((--sc->sc_refcnt) == 0) g_mirror_free_device(sc); } static void g_mirror_destroy_device(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_mirror_event *ep; struct g_geom *gp; struct g_consumer *cp, *tmpcp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL; disk = LIST_FIRST(&sc->sc_disks)) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); g_mirror_destroy_disk(disk); } while ((ep = g_mirror_event_get(sc)) != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); g_topology_lock(); LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) { g_mirror_disconnect_consumer(sc, cp); } g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); sx_xunlock(&sc->sc_lock); if ((--sc->sc_refcnt) == 0) g_mirror_free_device(sc); g_topology_unlock(); } static void g_mirror_orphan(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); disk = cp->private; if (disk == 
NULL) return; disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } /* * Function should return the next active disk on the list. * It is possible that it will be the same disk as given. * If there are no active disks on list, NULL is returned. */ static __inline struct g_mirror_disk * g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { struct g_mirror_disk *dp; for (dp = LIST_NEXT(disk, d_next); dp != disk; dp = LIST_NEXT(dp, d_next)) { if (dp == NULL) dp = LIST_FIRST(&sc->sc_disks); if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) return (NULL); return (dp); } static struct g_mirror_disk * g_mirror_get_disk(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; if (sc->sc_hint == NULL) { sc->sc_hint = LIST_FIRST(&sc->sc_disks); if (sc->sc_hint == NULL) return (NULL); } disk = sc->sc_hint; if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) { disk = g_mirror_find_next(sc, disk); if (disk == NULL) return (NULL); } sc->sc_hint = g_mirror_find_next(sc, disk); return (disk); } static int g_mirror_write_metadata(struct g_mirror_disk *disk, struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO); if (md != NULL && (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) { /* * Handle the case, when the size of parent provider reduced. 
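g_mirror keeps its metadata in the last sector of each component, so the metadata offset is always mediasize - sectorsize; the ENOSPC check that follows catches a parent provider that has shrunk below the size recorded in the metadata. The arithmetic on its own, with invented sizes:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* hypothetical component: 1 GB provider, 512-byte sectors */
	int64_t mediasize = 1024LL * 1024 * 1024;
	int64_t sectorsize = 512;
	int64_t md_mediasize = mediasize - sectorsize; /* recorded usable size */
	int64_t offset;

	/* Metadata lives in the very last sector of the component. */
	offset = mediasize - sectorsize;
	printf("metadata sector at byte offset %jd\n", (intmax_t)offset);

	/* If the provider shrank, the recorded size no longer fits. */
	if (offset < md_mediasize)
		printf("provider shrank: refuse with ENOSPC\n");
	else
		printf("metadata still fits\n");
	return (0);
}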
*/ if (offset < md->md_mediasize) error = ENOSPC; else mirror_metadata_encode(md, sector); } KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_metadata_write, error); if (error == 0) error = g_write_data(cp, offset, sector, length); free(sector, M_MIRROR); if (error != 0) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } else { G_MIRROR_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } return (error); } static int g_mirror_clear_metadata(struct g_mirror_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); if (disk->d_softc->sc_type != G_MIRROR_TYPE_AUTOMATIC) return (0); error = g_mirror_write_metadata(disk, NULL); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s cleared.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } return (error); } void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md) { strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic)); md->md_version = G_MIRROR_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_mid = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_slice = sc->sc_slice; md->md_balance = sc->sc_balance; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK); bzero(md->md_provider, sizeof(md->md_provider)); if (disk == NULL) { md->md_did = arc4random(); md->md_priority = 0; md->md_syncid = 0; md->md_dflags = 0; md->md_sync_offset = 0; md->md_provsize = 0; } else { md->md_did = disk->d_id; md->md_priority = disk->d_priority; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = disk->d_sync.ds_offset_done; else md->md_sync_offset = 0; if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) { strlcpy(md->md_provider, disk->d_consumer->provider->name, sizeof(md->md_provider)); } md->md_provsize = disk->d_consumer->provider->mediasize; } } void g_mirror_update_metadata(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) return; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) g_mirror_fill_metadata(sc, disk, &md); error = g_mirror_write_metadata(disk, &md); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s updated.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } } static void g_mirror_bump_syncid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_MIRROR_DEBUG(1, "Device %s: syncid 
bumped to %u.", sc->sc_name, sc->sc_syncid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_mirror_update_metadata(disk); } } } static void g_mirror_bump_genid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_mirror_update_metadata(disk); } } } static int g_mirror_idle(struct g_mirror_softc *sc, int acw) { struct g_mirror_disk *disk; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write); if (!g_mirror_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } return (0); } static void g_mirror_unidle(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } } static void g_mirror_flush_done(struct bio *bp) { struct g_mirror_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->private; mtx_lock(&sc->sc_done_mtx); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += bp->bio_completed; pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { mtx_unlock(&sc->sc_done_mtx); g_io_deliver(pbp, pbp->bio_error); } else mtx_unlock(&sc->sc_done_mtx); g_destroy_bio(bp); } static void g_mirror_done(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_regular_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *pbp; g_topology_assert_not(); pbp = bp->bio_parent; sc = pbp->bio_to->private; bp->bio_from->index--; if (bp->bio_cmd == BIO_WRITE) sc->sc_writes--; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); } if (bp->bio_cmd == BIO_READ) KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_read, bp->bio_error); else if (bp->bio_cmd == BIO_WRITE) KFAIL_POINT_ERROR(DEBUG_FP, 
g_mirror_regular_request_write, bp->bio_error); pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (bp->bio_error == 0 && pbp->bio_error == 0) { G_MIRROR_LOGREQ(3, bp, "Request delivered."); g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { G_MIRROR_LOGREQ(3, pbp, "Request delivered."); pbp->bio_completed = pbp->bio_length; if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) { bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); } g_io_deliver(pbp, pbp->bio_error); } return; } else if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; if (disk != NULL) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); } else { G_MIRROR_LOGREQ(1, bp, "Request failed (error=%d).", bp->bio_error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { if (bp->bio_error == ENXIO && bp->bio_cmd == BIO_READ) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; else if (bp->bio_error == ENXIO) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID_NOW; else sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } switch (pbp->bio_cmd) { case BIO_DELETE: case BIO_WRITE: pbp->bio_inbed--; pbp->bio_children--; break; } } g_destroy_bio(bp); switch (pbp->bio_cmd) { case BIO_READ: if (pbp->bio_inbed < pbp->bio_children) break; if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1) g_io_deliver(pbp, pbp->bio_error); else { pbp->bio_error = 0; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, pbp); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } break; case BIO_DELETE: case BIO_WRITE: if (pbp->bio_children == 0) { /* * All requests failed. */ } else if (pbp->bio_inbed < pbp->bio_children) { /* Do nothing. */ break; } else if (pbp->bio_children == pbp->bio_inbed) { /* Some requests succeeded. */ pbp->bio_error = 0; pbp->bio_completed = pbp->bio_length; } bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); g_io_deliver(pbp, pbp->bio_error); break; default: KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd)); break; } } static void g_mirror_sync_done(struct bio *bp) { struct g_mirror_softc *sc; G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_candelete(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; int *val; sc = bp->bio_to->private; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) break; } val = (int *)bp->bio_data; *val = (disk != NULL); g_io_deliver(bp, 0); } static void g_mirror_kernel_dump(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *cbp; struct g_kerneldump *gkd; /* * We configure dumping to the first component, because this component * will be used for reading with 'prefer' balance algorithm. 
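In the regular-request completion path above, a parent bio is delivered only once bio_inbed catches up with bio_children; the first error seen is latched, and a write still succeeds as long as at least one component accepted it. A condensed, single-threaded sketch of that scatter/gather counting (the driver runs this from its worker thread, under its own locking):

#include <stdio.h>

struct parent {
	int children;	/* clones sent out */
	int inbed;	/* clones completed so far */
	int error;	/* first error seen, 0 if none */
	int ok;		/* how many clones succeeded */
};

/* Called once per completed clone; returns 1 when the parent is done. */
static int
child_done(struct parent *pbp, int error)
{
	if (error != 0 && pbp->error == 0)
		pbp->error = error;	/* remember the first failure */
	if (error == 0)
		pbp->ok++;
	pbp->inbed++;
	return (pbp->inbed == pbp->children);
}

int
main(void)
{
	struct parent pbp = { .children = 3 };

	child_done(&pbp, 0);
	child_done(&pbp, 5);	/* one component failed (EIO-like error) */
	if (child_done(&pbp, 0)) {
		/* a write succeeds if any component accepted it */
		printf("deliver error=%d\n", pbp.ok > 0 ? 0 : pbp.error);
	}
	return (0);
}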
* If the component with the highest priority is currently disconnected, * we will not be able to read the dump after the reboot, even if that * component is connected and synchronized later. Can we do something better? */ sc = bp->bio_to->private; disk = LIST_FIRST(&sc->sc_disks); gkd = (struct g_kerneldump *)bp->bio_data; if (gkd->length > bp->bio_to->mediasize) gkd->length = bp->bio_to->mediasize; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; g_io_request(cbp, disk->d_consumer); G_MIRROR_DEBUG(1, "Kernel dump will go to %s.", g_mirror_get_diskname(disk)); } static void g_mirror_flush(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_flush_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); g_io_request(cbp, disk->d_consumer); } } static void g_mirror_start(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->private; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_mirror_start() should not be called at all. */ KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Provider's error should be set (error=%d)(mirror=%s).", bp->bio_to->error, bp->bio_to->name)); G_MIRROR_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_mirror_flush(sc, bp); return; case BIO_GETATTR: if (!strcmp(bp->bio_attribute, "GEOM::candelete")) { g_mirror_candelete(bp); return; } else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) { g_mirror_kernel_dump(bp); return; } /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); if (bp->bio_to->error != 0) { mtx_unlock(&sc->sc_queue_mtx); g_io_deliver(bp, bp->bio_to->error); return; } bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with an in-progress * synchronization request. */ static int g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; u_int i; if (sc->sc_sync.ds_ndisks == 0) return (0); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING) continue; for (i = 0; i < g_mirror_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; if (rend > sstart && rstart < send) return (1); } } return (0); } /* * Return TRUE if the given sync request is colliding with an in-progress regular * request.
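Both collision checks come down to the standard half-open interval overlap test: [rstart, rend) intersects [sstart, send) exactly when rend > sstart && rstart < send. The predicate in isolation, with two sample ranges:

#include <stdio.h>
#include <stdint.h>

/* Do half-open intervals [astart, aend) and [bstart, bend) overlap? */
static int
ranges_collide(int64_t astart, int64_t aend, int64_t bstart, int64_t bend)
{
	return (aend > bstart && astart < bend);
}

int
main(void)
{
	/* a sync request covering bytes [4096, 8192) versus two others */
	printf("%d\n", ranges_collide(4096, 8192, 0, 4096));	  /* 0: adjacent */
	printf("%d\n", ranges_collide(4096, 8192, 6144, 12288)); /* 1: overlaps */
	return (0);
}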
*/ static int g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_sync.ds_ndisks == 0) return (0); sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Puts a regular request onto the delayed queue. */ static void g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying request."); bioq_insert_head(&sc->sc_regular_delayed, bp); } /* * Puts a synchronization request onto the delayed queue. */ static void g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request."); bioq_insert_tail(&sc->sc_sync_delayed, bp); } /* * Releases delayed regular requests which no longer collide with sync * requests. */ static void g_mirror_regular_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { if (g_mirror_sync_collision(sc, bp)) continue; bioq_remove(&sc->sc_regular_delayed, bp); G_MIRROR_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); } } /* * Releases delayed sync requests which no longer collide with regular * requests. */ static void g_mirror_sync_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { if (g_mirror_regular_collision(sc, bp)) continue; bioq_remove(&sc->sc_sync_delayed, bp); G_MIRROR_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Free a synchronization request and clear its slot in the array. */ static void g_mirror_sync_request_free(struct g_mirror_disk *disk, struct bio *bp) { int idx; if (disk != NULL && disk->d_sync.ds_bios != NULL) { idx = (int)(uintptr_t)bp->bio_caller1; KASSERT(disk->d_sync.ds_bios[idx] == bp, ("unexpected sync BIO at %p:%d", disk, idx)); disk->d_sync.ds_bios[idx] = NULL; } free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); } /* * Handle synchronization requests. * Every synchronization request is a two-step process: first, a READ request * is sent to an active provider, and then a WRITE request (carrying the data * just read) is sent to the provider being synchronized. When the WRITE * finishes, a new synchronization request is sent. */ static void g_mirror_sync_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); g_mirror_sync_request_free(NULL, bp); sx_xlock(&sc->sc_lock); return; } /* * Synchronization request.
*/ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_read, bp->bio_error); if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_mirror_sync_request_free(disk, bp); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request half-finished."); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { off_t offset; void *data; int i, idx; KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_write, bp->bio_error); if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_mirror_sync_request_free(disk, bp); sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; if (sync->ds_offset >= sc->sc_mediasize || sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; g_mirror_sync_request_free(disk, bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { return; } /* Disk up-to-date, activate it. */ g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE, G_MIRROR_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ data = bp->bio_data; idx = (int)(uintptr_t)bp->bio_caller1; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_offset = sync->ds_offset; bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); sync->ds_offset += bp->bio_length; bp->bio_done = g_mirror_sync_done; bp->bio_data = data; bp->bio_from = sync->ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)idx; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Release delayed requests if possible. */ g_mirror_regular_release(sc); /* Find the smallest offset */ offset = sc->sc_mediasize; for (i = 0; i < g_mirror_syncreqs; i++) { bp = sync->ds_bios[i]; if (bp != NULL && bp->bio_offset < offset) offset = bp->bio_offset; } if (sync->ds_offset_done + (MAXPHYS * 100) < offset) { /* Update offset_done on every 100 blocks. */ sync->ds_offset_done = offset; g_mirror_update_metadata(disk); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static void g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. 
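The synchronization state machine above copies the array in MAXPHYS-sized steps: each bio is issued as a READ against an active component, flipped to a WRITE aimed at the disk being rebuilt, then re-armed as the next READ; the resume point ds_offset_done is only persisted every 100 chunks (and, in the driver, tracks the smallest in-flight offset across the parallel requests). The same loop flattened into a sequential, in-memory sketch with tiny made-up sizes:

#include <stdio.h>
#include <string.h>

#define CHUNK	4	/* tiny stand-in for MAXPHYS */

int
main(void)
{
	char src[] = "abcdefghijklmnop";	/* the active component */
	char dst[sizeof(src)] = { 0 };		/* the disk being rebuilt */
	size_t mediasize = sizeof(src) - 1;
	size_t offset = 0, offset_done = 0;
	int chunks = 0;

	while (offset < mediasize) {
		/* MIN(MAXPHYS, bytes remaining) */
		size_t len = mediasize - offset < CHUNK ?
		    mediasize - offset : CHUNK;
		memcpy(dst + offset, src + offset, len);  /* READ then WRITE */
		offset += len;
		if (++chunks % 2 == 0) {	/* the driver uses every 100 */
			offset_done = offset;	/* persist the resume point */
			printf("checkpoint at %zu\n", offset_done);
		}
	}
	printf("copied: %s\n", dst);
	return (0);
}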
*/ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } static void g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; disk = g_mirror_get_disk(sc); if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } #define TRACK_SIZE (1 * 1024 * 1024) #define LOAD_SCALE 256 #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static void g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk, *dp; struct g_consumer *cp; struct bio *cbp; int prio, best; /* Find a disk with the smallest load. */ disk = NULL; best = INT_MAX; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; prio = dp->load; /* If disk head is precisely in position - highly prefer it. */ if (dp->d_last_offset == bp->bio_offset) prio -= 2 * LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE) prio -= 1 * LOAD_SCALE; if (prio <= best) { disk = dp; best = prio; } } KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; /* Remember last head position */ disk->d_last_offset = bp->bio_offset + bp->bio_length; /* Update loads. */ LIST_FOREACH(dp, &sc->sc_disks, d_next) { dp->load = (dp->d_consumer->index * LOAD_SCALE + dp->load * 7) / 8; } g_io_request(cbp, cp); } static void g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; off_t left, mod, offset, slice; u_char *data; u_int ndisks; if (bp->bio_length <= sc->sc_slice) { g_mirror_request_round_robin(sc, bp); return; } ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); slice = bp->bio_length / ndisks; mod = slice % sc->sc_provider->sectorsize; if (mod != 0) slice += sc->sc_provider->sectorsize - mod; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. 
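The 'load' balancer above scores each active disk by an exponentially weighted average of its outstanding requests, subtracting 2*LOAD_SCALE when the disk head sits exactly at the request offset and 1*LOAD_SCALE when it is within TRACK_SIZE, and picks the lowest score; after dispatch, every disk's load decays toward its in-flight count as load = (index*LOAD_SCALE + load*7) / 8. A compact reimplementation over a plain array (the sample values are invented):

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

#define TRACK_SIZE	(1 * 1024 * 1024)
#define LOAD_SCALE	256
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))

struct disk {
	int load;		/* EWMA of outstanding requests */
	int index;		/* requests currently in flight */
	int64_t last_offset;	/* where the head ended up last time */
};

static struct disk *
pick_disk(struct disk *disks, int ndisks, int64_t offset)
{
	struct disk *best_disk = NULL;
	int i, prio, best = INT_MAX;

	for (i = 0; i < ndisks; i++) {
		prio = disks[i].load;
		if (disks[i].last_offset == offset)
			prio -= 2 * LOAD_SCALE;	/* head exactly in position */
		else if (ABS(disks[i].last_offset - offset) < TRACK_SIZE)
			prio -= 1 * LOAD_SCALE;	/* head nearby */
		if (prio <= best) {
			best_disk = &disks[i];
			best = prio;
		}
	}
	return (best_disk);
}

int
main(void)
{
	struct disk disks[2] = {
		{ .load = 512, .index = 2, .last_offset = 0 },
		{ .load = 768, .index = 3, .last_offset = 8192 },
	};
	struct disk *dp = pick_disk(disks, 2, 8192);
	int i;

	printf("picked disk %td\n", dp - disks);
	/* decay every disk's load toward its current in-flight count */
	for (i = 0; i < 2; i++)
		disks[i].load = (disks[i].index * LOAD_SCALE +
		    disks[i].load * 7) / 8;
	return (0);
}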
*/ left = bp->bio_length; offset = bp->bio_offset; data = bp->bio_data; bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; cbp->bio_offset = offset; cbp->bio_data = data; cbp->bio_length = MIN(left, slice); left -= cbp->bio_length; if (left == 0) break; offset += cbp->bio_length; data += cbp->bio_length; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); disk->d_consumer->index++; g_io_request(cbp, disk->d_consumer); } } static void g_mirror_register_request(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->private; switch (bp->bio_cmd) { case BIO_READ: switch (sc->sc_balance) { case G_MIRROR_BALANCE_LOAD: g_mirror_request_load(sc, bp); break; case G_MIRROR_BALANCE_PREFER: g_mirror_request_prefer(sc, bp); break; case G_MIRROR_BALANCE_ROUND_ROBIN: g_mirror_request_round_robin(sc, bp); break; case G_MIRROR_BALANCE_SPLIT: g_mirror_request_split(sc, bp); break; } return; case BIO_WRITE: case BIO_DELETE: { struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; /* * Delay the request if it is colliding with a synchronization * request. */ if (g_mirror_sync_collision(sc, bp)) { g_mirror_regular_delay(sc, bp); return; } if (sc->sc_idle) g_mirror_unidle(sc); else sc->sc_last_write = time_uptime; /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; g_mirror_bump_syncid(sc); } /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { sync = &disk->d_sync; switch (disk->d_state) { case G_MIRROR_DISK_STATE_ACTIVE: break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: if (bp->bio_offset >= sync->ds_offset) continue; break; default: continue; } if (bp->bio_cmd == BIO_DELETE && (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cp = disk->d_consumer; cbp->bio_caller1 = cp; cbp->bio_to = cp->provider; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); } if (bioq_first(&queue) == NULL) { g_io_deliver(bp, EOPNOTSUPP); return; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. 
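g_mirror_request_split() divides a large read evenly across the active disks, rounding each slice up to a whole number of sectors so no component sees a partial-sector transfer; the last component simply takes whatever remains. The rounding and carving on their own, with invented numbers:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t length = 1000000;	/* total request length in bytes */
	int64_t sectorsize = 512;
	int64_t ndisks = 3;		/* active components */
	int64_t slice, mod, left, len, offset = 0;

	slice = length / ndisks;
	mod = slice % sectorsize;
	if (mod != 0)
		slice += sectorsize - mod;	/* round up to a full sector */

	left = length;
	while (left > 0) {
		len = slice < left ? slice : left;	/* MIN(left, slice) */
		printf("component slice: [%jd, %jd)\n",
		    (intmax_t)offset, (intmax_t)(offset + len));
		offset += len;
		left -= len;
	}
	return (0);
}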
*/ bioq_insert_tail(&sc->sc_inflight, bp); return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_mirror_can_destroy(struct g_mirror_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0) return (0); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_mirror_try_destroy(struct g_mirror_softc *sc) { if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_mirror_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DRAIN) != 0) { g_topology_unlock(); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after the wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_mirror_destroy_device(sc); } return (1); } /* * Worker thread. */ static void g_mirror_worker(void *arg) { struct g_mirror_softc *sc; struct g_mirror_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_MIRROR_DEBUG(5, "%s: Let's see...", __func__); /* * Take a look at events first. * It is important to handle events before any I/O requests. */ ep = g_mirror_event_get(sc); if (ep != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) { /* Update only device status. */ G_MIRROR_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_mirror_update_device(sc, true); } else { /* Update disk status. */ G_MIRROR_DEBUG(3, "Running event for disk %s.", g_mirror_get_diskname(ep->e_disk)); ep->e_error = g_mirror_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_mirror_update_device(sc, false); } if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_mirror_event_free(ep); } else { ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check whether we can mark the array as CLEAN; if we cannot yet, * compute how many seconds to wait before trying again. */ timeout = g_mirror_idle(sc, -1); /* * Now handle I/O requests. */ /* Get first request from the queue.
*/ mtx_lock(&sc->sc_queue_mtx); bp = bioq_takefirst(&sc->sc_queue); if (bp == NULL) { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); if (bioq_first(&sc->sc_queue) != NULL) { mtx_unlock(&sc->sc_queue_mtx); continue; } } sx_xunlock(&sc->sc_lock); /* * XXX: We can miss an event here, because an event * can be added without sx-device-lock and without * mtx-queue-lock. Maybe I should just stop using * dedicated mutex for events synchronization and * stick with the queue lock? * The event will hang here until next I/O request * or next event is received. */ MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__); continue; } mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) { g_mirror_sync_request(bp); /* READ */ } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0) g_mirror_regular_request(bp); else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) g_mirror_sync_request(bp); /* WRITE */ else { KASSERT(0, ("Invalid request cflags=0x%hx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else { g_mirror_register_request(bp); } G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; } } static void g_mirror_sync_start(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_consumer *cp; struct bio *bp; int error, i; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Disk %s is not marked for synchronization.", g_mirror_get_diskname(disk))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Device not in RUNNING state (%s, %u).", sc->sc_name, sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_mirror_get_diskname(disk)); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_mirror_get_diskname(disk))); disk->d_sync.ds_consumer = cp; disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; /* * Allocate memory for synchronization bios and initialize them. 
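Synchronization keeps g_mirror_syncreqs bios in flight at once; at start each one gets a MAXPHYS-sized buffer, a staggered starting offset, and its slot index stashed in bio_caller1 so the completion path can clear the right slot. A userspace sketch of that setup (SYNCREQS and the buffer size mirror the defaults rather than being queried from anywhere):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define SYNCREQS	2		/* g_mirror_syncreqs default */
#define MAXPHYS_SZ	(128 * 1024)	/* typical MAXPHYS */

struct req {
	void *data;
	int64_t offset;
	int64_t length;
	intptr_t slot;	/* stored the way bio_caller1 carries an index */
};

int
main(void)
{
	int64_t mediasize = 1000000, ds_offset = 0;
	struct req reqs[SYNCREQS];
	int i;

	for (i = 0; i < SYNCREQS; i++) {
		reqs[i].data = malloc(MAXPHYS_SZ);
		if (reqs[i].data == NULL)
			return (1);
		reqs[i].offset = ds_offset;
		/* MIN(MAXPHYS, bytes remaining) */
		reqs[i].length = mediasize - ds_offset < MAXPHYS_SZ ?
		    mediasize - ds_offset : MAXPHYS_SZ;
		ds_offset += reqs[i].length;	/* stagger the next request */
		reqs[i].slot = i;
		printf("req %d: [%jd, %jd)\n", i, (intmax_t)reqs[i].offset,
		    (intmax_t)(reqs[i].offset + reqs[i].length));
	}
	for (i = 0; i < SYNCREQS; i++)
		free(reqs[i].data);
	return (0);
}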
*/ disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs, M_MIRROR, M_WAITOK); for (i = 0; i < g_mirror_syncreqs; i++) { bp = g_alloc_bio(); disk->d_sync.ds_bios[i] = bp; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK); bp->bio_cflags = 0; bp->bio_offset = disk->d_sync.ds_offset; bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); disk->d_sync.ds_offset += bp->bio_length; bp->bio_done = g_mirror_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)i; } /* Increase the number of disks in SYNCHRONIZING state. */ sc->sc_sync.ds_ndisks++; /* Set the number of in-flight synchronization requests. */ disk->d_sync.ds_inflight = g_mirror_syncreqs; /* * Fire off first synchronization requests. */ for (i = 0; i < g_mirror_syncreqs; i++) { bp = disk->d_sync.ds_bios[i]; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, disk->d_sync.ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type) { struct g_mirror_softc *sc; struct g_consumer *cp; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_mirror_get_diskname(disk)); } else /* if (type == 1) */ { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_mirror_get_diskname(disk)); } g_mirror_regular_release(sc); free(disk->d_sync.ds_bios, M_MIRROR); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; sc->sc_sync.ds_ndisks--; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. 
*/ g_topology_lock(); g_mirror_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_mirror_launch_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_provider *pp, *dp; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name); pp->flags |= G_PF_DIRECT_RECEIVE; pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; /* Splitting of unmapped BIO's could work but isn't implemented now */ if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT) pp->flags |= G_PF_ACCEPT_UNMAPPED; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer && disk->d_consumer->provider) { dp = disk->d_consumer->provider; if (dp->stripesize > pp->stripesize) { pp->stripesize = dp->stripesize; pp->stripeoffset = dp->stripeoffset; } /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_MIRROR_DEBUG(0, "Cancelling unmapped " "because of %s.", dp->name); pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } } pp->private = sc; sc->sc_refcnt++; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_start(disk); } } static void g_mirror_destroy_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_stop(disk, 1); } g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) { /* * Abort any pending I/O that wasn't generated by us. * Synchronization requests and requests destined for individual * mirror components can be destroyed immediately. */ if (bp->bio_to == sc->sc_provider && bp->bio_from->geom != sc->sc_sync.ds_geom) { g_io_deliver(bp, ENXIO); } else { if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); } } mtx_unlock(&sc->sc_queue_mtx); g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; G_MIRROR_DEBUG(0, "Device %s: provider destroyed.", sc->sc_name); g_topology_unlock(); } static void g_mirror_go(void *arg) { struct g_mirror_softc *sc; sc = arg; G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_mirror_event_send(sc, 0, G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE); } static u_int g_mirror_determine_state(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0 && (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0)) { /* Disk does not need synchronization. */ state = G_MIRROR_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. 
*/ state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. */ disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that mirror was started on stale disks * and more fresh disk just arrive. * If there were writes, mirror is broken, sorry. * I think the best choice here is don't touch * this disk and inform the user loudly. */ G_MIRROR_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! It will not be connected to the " "running device.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); state = G_MIRROR_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_MIRROR_DEBUG(3, "State for %s disk: %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_mirror_update_device(struct g_mirror_softc *sc, bool force) { struct g_mirror_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_MIRROR_DEVICE_STATE_STARTING: { struct g_mirror_disk *pdisk, *tdisk; u_int dirty, ndisks, genid, syncid; bool broken; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * if we have any disks and 'force' is true. */ ndisks = g_mirror_ndisks(sc, -1); if (sc->sc_ndisks == ndisks || (force && ndisks > 0)) { ; } else if (ndisks == 0) { /* * Disks went down in starting phase, so destroy * device. */ callout_drain(&sc->sc_callout); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } else { return; } /* * Activate all disks with the biggest syncid. */ if (force) { /* * If 'force' is true, we have been called due to * timeout, so don't bother canceling timeout. */ ndisks = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { ndisks++; } } if (ndisks == 0) { /* No valid disks found, destroy device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } } else { /* Cancel timeout. */ callout_drain(&sc->sc_callout); } /* * Find the biggest genid. */ genid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ broken = false; LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_genid < genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", g_mirror_get_diskname(disk), sc->sc_name); g_mirror_destroy_disk(disk); /* * Bump the syncid in case we discover a healthy * replacement disk after starting the mirror. 
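During STARTING the device first keeps only components carrying the highest generation id, since a lower genid means the component missed writes while the rest of the mirror kept going; the same election is then repeated with syncid, and if every surviving disk is dirty the highest-priority one becomes the sync master. The genid round on its own, over an illustrative array:

#include <stdio.h>

struct comp {
	const char *name;
	unsigned genid;
	int alive;
};

int
main(void)
{
	struct comp comps[] = {
		{ "ada0", 7, 1 },
		{ "ada1", 6, 1 },	/* missed writes while detached */
		{ "ada2", 7, 1 },
	};
	int i, n = sizeof(comps) / sizeof(comps[0]);
	unsigned genid = 0;

	/* Find the biggest genid... */
	for (i = 0; i < n; i++)
		if (comps[i].genid > genid)
			genid = comps[i].genid;
	/* ...and drop every component that is behind it. */
	for (i = 0; i < n; i++)
		if (comps[i].genid < genid) {
			comps[i].alive = 0;
			printf("%s broken, skipping\n", comps[i].name);
		}
	return (0);
}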
*/ broken = true; } } /* * Find the biggest syncid. */ syncid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid > syncid) syncid = disk->d_sync.ds_syncid; } /* * Here we need to look for dirty disks and if all disks * with the biggest syncid are dirty, we have to choose * one with the biggest priority and rebuild the rest. */ /* * Find the number of dirty disks with the biggest syncid. * Find the number of disks with the biggest syncid. * While here, find a disk with the biggest priority. */ dirty = ndisks = 0; pdisk = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { dirty++; if (pdisk == NULL || pdisk->d_priority < disk->d_priority) { pdisk = disk; } } } if (dirty == 0) { /* No dirty disks at all, great. */ } else if (dirty == ndisks) { /* * Force synchronization for all dirty disks except one * with the biggest priority. */ KASSERT(pdisk != NULL, ("pdisk == NULL")); G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a " "master disk for synchronization.", g_mirror_get_diskname(pdisk), sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } KASSERT((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0, ("Disk %s isn't marked as dirty.", g_mirror_get_diskname(disk))); /* Skip the disk with the biggest priority. */ if (disk == pdisk) continue; disk->d_sync.ds_syncid = 0; } } else if (dirty < ndisks) { /* * Force synchronization for all dirty disks. * We have some non-dirty disks. */ LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_sync.ds_syncid = 0; } } /* Reset hint. */ sc->sc_hint = NULL; sc->sc_syncid = syncid; if (force || broken) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } state = G_MIRROR_DEVICE_STATE_RUNNING; G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_device_state2str(state)); sc->sc_state = state; LIST_FOREACH(disk, &sc->sc_disks, d_next) { state = g_mirror_determine_state(disk); g_mirror_event_send(disk, state, G_MIRROR_EVENT_DONTWAIT); if (state == G_MIRROR_DISK_STATE_STALE) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } break; } case G_MIRROR_DEVICE_STATE_RUNNING: if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * No usable disks, so destroy the device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; break; } else if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * We have active disks, launch provider if it doesn't * exist. */ if (sc->sc_provider == NULL) g_mirror_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } } /* * Genid should be bumped immediately, so do it here. 
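* Note the asymmetry: G_MIRROR_BUMP_GENID and G_MIRROR_BUMP_SYNCID_NOW are serviced right here in the RUNNING case, while a plain G_MIRROR_BUMP_SYNCID waits for the first write.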
*/ if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID; g_mirror_bump_genid(sc); } if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID_NOW) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID_NOW; g_mirror_bump_syncid(sc); } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_MIRROR_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_mirror_get_diskname(disk), \ g_mirror_disk_state2str(disk->d_state), \ g_mirror_disk_state2str(state), sc->sc_name) static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state) { struct g_mirror_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state), g_mirror_disk_state2str(state)); switch (state) { case G_MIRROR_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; if (LIST_EMPTY(&sc->sc_disks)) LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next); else { struct g_mirror_disk *dp; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (disk->d_priority >= dp->d_priority) { LIST_INSERT_BEFORE(dp, disk, d_next); dp = NULL; break; } if (LIST_NEXT(dp, d_next) == NULL) break; } if (dp != NULL) LIST_INSERT_AFTER(dp, disk, d_next); } G_MIRROR_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_mirror_get_diskname(disk)); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); state = g_mirror_determine_state(disk); if (state != G_MIRROR_DISK_STATE_NONE) goto again; break; case G_MIRROR_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_sync_stop(disk, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_mirror_update_idle(sc, disk); g_mirror_update_metadata(disk); G_MIRROR_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. 
*/ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * STALE state is only possible if the device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; g_mirror_update_metadata(disk); G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. A disk which needs synchronization was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_NEW) disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_mirror_sync_start(disk); g_mirror_update_metadata(disk); } break; case G_MIRROR_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. The device wasn't running yet, but a disk disappeared. * 2. A disk was active and disappeared. * 3. A disk disappeared during the synchronization process. */ if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_STALE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * Reset bumping syncid if the disk disappeared in STARTING * state.
*/ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); break; case G_MIRROR_DISK_STATE_DESTROY: { int error; error = g_mirror_clear_metadata(disk); if (error != 0) { G_MIRROR_DEBUG(0, "Device %s: failed to clear metadata on %s: %d.", sc->sc_name, g_mirror_get_diskname(disk), error); break; } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); sc->sc_ndisks--; LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } break; } default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = mirror_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0) return (EINVAL); if (md->md_version > G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } static int g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { if (g_mirror_id2disk(sc, md->md_did) != NULL) { G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.", pp->name, md->md_did); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if (md->md_slice != sc->sc_slice) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_slice", pp->name, sc->sc_name); return (EINVAL); } if (md->md_balance != sc->sc_balance) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_balance", pp->name, sc->sc_name); return (EINVAL); } #if 0 if (md->md_mediasize != sc->sc_mediasize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } #endif if (sc->sc_mediasize > pp->mediasize) { G_MIRROR_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_MIRROR_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) { 
G_MIRROR_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { struct g_mirror_disk *disk; int error; g_topology_assert_not(); G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name); error = g_mirror_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING && md->md_genid < sc->sc_genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_mirror_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW, G_MIRROR_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_MIRROR_VERSION); g_mirror_update_metadata(disk); } return (0); } static void g_mirror_destroy_delayed(void *arg, int flag) { struct g_mirror_softc *sc; int error; if (flag == EV_CANCEL) { G_MIRROR_DEBUG(1, "Destroying canceled."); return; } sc = arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0, ("CLOSEWAIT flag not set on %s.", sc->sc_name)); G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).", sc->sc_name, error); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_mirror_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_mirror_softc *sc; int error = 0; g_topology_assert(); G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->private; KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 || LIST_EMPTY(&sc->sc_disks)) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } sc->sc_provider_open += acr + acw + ace; if (pp->acw + acw == 0) g_mirror_idle(sc, 0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_CLOSEWAIT) != 0 && sc->sc_provider_open == 0) g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL); end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md, u_int type) { struct g_mirror_softc *sc; struct g_geom *gp; int error, timeout; g_topology_assert(); G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_mid); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. 
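* Each mirror is backed by two geoms: this action geom owns the softc and the mirror/<name> provider, while the companion <name>.sync geom created below carries the consumers used for synchronization I/O.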
*/ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO); gp->start = g_mirror_start; gp->orphan = g_mirror_orphan; gp->access = g_mirror_access; gp->dumpconf = g_mirror_dumpconf; sc->sc_type = type; sc->sc_id = md->md_mid; sc->sc_slice = md->md_slice; sc->sc_balance = md->md_balance; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; sc->sc_refcnt = 1; sx_init(&sc->sc_lock, "gmirror:lock"); bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF); bioq_init(&sc->sc_regular_delayed); bioq_init(&sc->sc_inflight); bioq_init(&sc->sc_sync_delayed); LIST_INIT(&sc->sc_disks); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, 1); mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF); sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; sc->sc_provider_open = 0; /* * Synchronization geom. */ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_mirror_orphan; sc->sc_sync.ds_geom = gp; sc->sc_sync.ds_ndisks = 0; error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0, "g_mirror %s", md->md_name); if (error != 0) { G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); g_destroy_geom(sc->sc_sync.ds_geom); g_destroy_geom(sc->sc_geom); g_mirror_free_device(sc); return (NULL); } G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GMIRROR"); G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. 
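* g_mirror_timeout is expressed in seconds (note the multiplication by hz below); if some components have not appeared by the time the callout fires, g_mirror_go() force-starts the device through the 'force' path of g_mirror_update_device().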
*/ timeout = g_mirror_timeout * hz; callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc); return (sc->sc_geom); } int g_mirror_destroy(struct g_mirror_softc *sc, int how) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider_open != 0) { switch (how) { case G_MIRROR_DESTROY_SOFT: G_MIRROR_DEBUG(1, "Device %s is still open (%d).", sc->sc_name, sc->sc_provider_open); return (EBUSY); case G_MIRROR_DESTROY_DELAYED: G_MIRROR_DEBUG(1, "Device %s will be destroyed on last close.", sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { g_mirror_sync_stop(disk, 1); } } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_CLOSEWAIT; return (EBUSY); case G_MIRROR_DESTROY_HARD: G_MIRROR_DEBUG(1, "Device %s is still open, so it " "can't be removed definitively.", sc->sc_name); } } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { sx_xunlock(&sc->sc_lock); return (0); } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DRAIN; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5); G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_mirror_destroy_device(sc); return (0); } static void g_mirror_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_mirror_metadata md; struct g_mirror_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_MIRROR_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "mirror:taste"); /* * This orphan function should never be called. */ gp->orphan = g_mirror_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_mirror_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) { G_MIRROR_DEBUG(0, "Device %s: provider %s marked as inactive, skipping.", md.md_name, pp->name); return (NULL); } if (g_mirror_debug >= 2) mirror_metadata_dump(&md); /* * Let's check if the device already exists.
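* A match is a running automatic mirror whose name equals md.md_name; matching names with a different md_mid mean a second, distinct mirror with the same name, which is rejected.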
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_mid != sc->sc_id) { G_MIRROR_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_mirror_create(mp, &md, G_MIRROR_TYPE_AUTOMATIC); if (gp == NULL) { G_MIRROR_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING; error = g_mirror_add_disk(sc, pp, &md); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (LIST_EMPTY(&sc->sc_disks)) { g_cancel_event(sc); g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static void g_mirror_resize(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name); disk = cp->private; if (disk == NULL) return; g_topology_unlock(); g_mirror_update_metadata(disk); g_topology_lock(); } static int g_mirror_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_mirror_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_mirror_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { struct g_mirror_disk *disk; disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_id); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_printf(sb, "0%%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / sc->sc_provider->mediasize)); } sbuf_printf(sb, "\n"); if (disk->d_sync.ds_offset > 0) { sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE"); ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, disk->d_priority); sbuf_printf(sb, "%s%s\n", indent, g_mirror_disk_state2str(disk->d_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s", indent); switch (sc->sc_type) { case G_MIRROR_TYPE_AUTOMATIC: sbuf_printf(sb, "AUTOMATIC"); break; case G_MIRROR_TYPE_MANUAL: sbuf_printf(sb, "MANUAL"); break; default: sbuf_printf(sb, "UNKNOWN"); break; } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_slice); sbuf_printf(sb, "%s%s\n", indent, balance_name(sc->sc_balance)); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s", indent); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) sbuf_printf(sb, "%s", "STARTING"); else if (sc->sc_ndisks == g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE)) sbuf_printf(sb, "%s", "COMPLETE"); else sbuf_printf(sb, "%s", "DEGRADED"); sbuf_printf(sb, "\n"); sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_mirror_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_mirror_softc *sc; int error; if (panicstr != NULL) return; mp = arg; g_topology_lock(); g_mirror_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. 
*/ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_mirror_idle(sc, -1); g_cancel_event(sc); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); } static void g_mirror_init(struct g_class *mp) { g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_mirror_post_sync == NULL) G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_mirror_fini(struct g_class *mp) { if (g_mirror_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync); } DECLARE_GEOM_CLASS(g_mirror_class, g_mirror); Index: head/sys/geom/mirror/g_mirror.h =================================================================== --- head/sys/geom/mirror/g_mirror.h (revision 326269) +++ head/sys/geom/mirror/g_mirror.h (revision 326270) @@ -1,510 +1,512 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_MIRROR_H_ #define _G_MIRROR_H_ #include #include #define G_MIRROR_CLASS_NAME "MIRROR" #define G_MIRROR_MAGIC "GEOM::MIRROR" /* * Version history: * 0 - Initial version number. * 1 - Added 'prefer' balance algorithm. * 2 - Added md_genid field to metadata. * 3 - Added md_provsize field to metadata. * 4 - Added 'no failure synchronization' flag. 
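* mirror_metadata_decode() below dispatches on md_version: versions 0-1, 2, and 3-4 each have a dedicated decoder for their on-disk layout.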
*/ #define G_MIRROR_VERSION 4 #define G_MIRROR_BALANCE_NONE 0 #define G_MIRROR_BALANCE_ROUND_ROBIN 1 #define G_MIRROR_BALANCE_LOAD 2 #define G_MIRROR_BALANCE_SPLIT 3 #define G_MIRROR_BALANCE_PREFER 4 #define G_MIRROR_BALANCE_MIN G_MIRROR_BALANCE_NONE #define G_MIRROR_BALANCE_MAX G_MIRROR_BALANCE_PREFER #define G_MIRROR_DISK_FLAG_DIRTY 0x0000000000000001ULL #define G_MIRROR_DISK_FLAG_SYNCHRONIZING 0x0000000000000002ULL #define G_MIRROR_DISK_FLAG_FORCE_SYNC 0x0000000000000004ULL #define G_MIRROR_DISK_FLAG_INACTIVE 0x0000000000000008ULL #define G_MIRROR_DISK_FLAG_HARDCODED 0x0000000000000010ULL #define G_MIRROR_DISK_FLAG_BROKEN 0x0000000000000020ULL #define G_MIRROR_DISK_FLAG_CANDELETE 0x0000000000000040ULL #define G_MIRROR_DISK_FLAG_MASK (G_MIRROR_DISK_FLAG_DIRTY | \ G_MIRROR_DISK_FLAG_SYNCHRONIZING | \ G_MIRROR_DISK_FLAG_FORCE_SYNC | \ G_MIRROR_DISK_FLAG_INACTIVE | \ G_MIRROR_DISK_FLAG_CANDELETE) #define G_MIRROR_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL #define G_MIRROR_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL #define G_MIRROR_DEVICE_FLAG_MASK (G_MIRROR_DEVICE_FLAG_NOAUTOSYNC | \ G_MIRROR_DEVICE_FLAG_NOFAILSYNC) #ifdef _KERNEL extern int g_mirror_debug; #define G_MIRROR_DEBUG(lvl, ...) do { \ if (g_mirror_debug >= (lvl)) { \ printf("GEOM_MIRROR"); \ if (g_mirror_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_MIRROR_LOGREQ(lvl, bp, ...) do { \ if (g_mirror_debug >= (lvl)) { \ printf("GEOM_MIRROR"); \ if (g_mirror_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) #define G_MIRROR_BIO_FLAG_REGULAR 0x01 #define G_MIRROR_BIO_FLAG_SYNC 0x02 /* * Information needed for synchronization. */ struct g_mirror_disk_sync { struct g_consumer *ds_consumer; /* Consumer connected to our mirror. */ off_t ds_offset; /* Offset of next request to send. */ off_t ds_offset_done; /* Offset of already synchronized region. */ u_int ds_syncid; /* Disk's synchronization ID. */ u_int ds_inflight; /* Number of in-flight sync requests. */ struct bio **ds_bios; /* BIOs for synchronization I/O. */ }; /* * Information needed for synchronization. */ struct g_mirror_device_sync { struct g_geom *ds_geom; /* Synchronization geom. */ u_int ds_ndisks; /* Number of disks in SYNCHRONIZING state. */ }; #define G_MIRROR_DISK_STATE_NONE 0 #define G_MIRROR_DISK_STATE_NEW 1 #define G_MIRROR_DISK_STATE_ACTIVE 2 #define G_MIRROR_DISK_STATE_STALE 3 #define G_MIRROR_DISK_STATE_SYNCHRONIZING 4 #define G_MIRROR_DISK_STATE_DISCONNECTED 5 #define G_MIRROR_DISK_STATE_DESTROY 6 struct g_mirror_disk { uint32_t d_id; /* Disk ID. */ struct g_consumer *d_consumer; /* Consumer. */ struct g_mirror_softc *d_softc; /* Back-pointer to softc. */ int d_state; /* Disk state. */ u_int d_priority; /* Disk priority. */ u_int load; /* Averaged queue length. */ off_t d_last_offset; /* Last read offset. */ uint64_t d_flags; /* Additional flags. */ u_int d_genid; /* Disk's generation ID. */ struct g_mirror_disk_sync d_sync; /* Sync information.
*/ LIST_ENTRY(g_mirror_disk) d_next; }; #define d_name d_consumer->provider->name #define G_MIRROR_EVENT_DONTWAIT 0x1 #define G_MIRROR_EVENT_WAIT 0x2 #define G_MIRROR_EVENT_DEVICE 0x4 #define G_MIRROR_EVENT_DONE 0x8 struct g_mirror_event { struct g_mirror_disk *e_disk; int e_state; int e_flags; int e_error; TAILQ_ENTRY(g_mirror_event) e_next; }; #define G_MIRROR_DEVICE_FLAG_DESTROY 0x0100000000000000ULL #define G_MIRROR_DEVICE_FLAG_DRAIN 0x0200000000000000ULL #define G_MIRROR_DEVICE_FLAG_CLOSEWAIT 0x0400000000000000ULL #define G_MIRROR_DEVICE_FLAG_TASTING 0x0800000000000000ULL #define G_MIRROR_DEVICE_FLAG_WIPE 0x1000000000000000ULL #define G_MIRROR_DEVICE_STATE_STARTING 0 #define G_MIRROR_DEVICE_STATE_RUNNING 1 #define G_MIRROR_TYPE_MANUAL 0 #define G_MIRROR_TYPE_AUTOMATIC 1 /* Bump syncid on first write. */ #define G_MIRROR_BUMP_SYNCID 0x1 /* Bump genid immediately. */ #define G_MIRROR_BUMP_GENID 0x2 /* Bump syncid immediately. */ #define G_MIRROR_BUMP_SYNCID_NOW 0x4 struct g_mirror_softc { u_int sc_type; /* Device type (manual/automatic). */ u_int sc_state; /* Device state. */ uint32_t sc_slice; /* Slice size. */ uint8_t sc_balance; /* Balance algorithm. */ uint64_t sc_mediasize; /* Device size. */ uint32_t sc_sectorsize; /* Sector size. */ uint64_t sc_flags; /* Additional flags. */ struct g_geom *sc_geom; struct g_provider *sc_provider; int sc_provider_open; uint32_t sc_id; /* Mirror unique ID. */ struct sx sc_lock; struct bio_queue_head sc_queue; struct mtx sc_queue_mtx; struct proc *sc_worker; struct bio_queue_head sc_regular_delayed; /* Delayed I/O requests due collision with sync requests. */ struct bio_queue_head sc_inflight; /* In-flight regular write requests. */ struct bio_queue_head sc_sync_delayed; /* Delayed sync requests due collision with regular requests. */ LIST_HEAD(, g_mirror_disk) sc_disks; u_int sc_ndisks; /* Number of disks. */ struct g_mirror_disk *sc_hint; u_int sc_genid; /* Generation ID. */ u_int sc_syncid; /* Synchronization ID. */ int sc_bump_id; struct g_mirror_device_sync sc_sync; int sc_idle; /* DIRTY flags removed. */ time_t sc_last_write; u_int sc_writes; u_int sc_refcnt; /* Number of softc references */ TAILQ_HEAD(, g_mirror_event) sc_events; struct mtx sc_events_mtx; struct callout sc_callout; struct root_hold_token *sc_rootmount; struct mtx sc_done_mtx; }; #define sc_name sc_geom->name struct g_mirror_metadata; u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state); struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md, u_int type); #define G_MIRROR_DESTROY_SOFT 0 #define G_MIRROR_DESTROY_DELAYED 1 #define G_MIRROR_DESTROY_HARD 2 int g_mirror_destroy(struct g_mirror_softc *sc, int how); int g_mirror_event_send(void *arg, int state, int flags); struct g_mirror_metadata; int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md); int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md); void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md); void g_mirror_update_metadata(struct g_mirror_disk *disk); g_ctl_req_t g_mirror_config; #endif /* _KERNEL */ struct g_mirror_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Mirror name. */ uint32_t md_mid; /* Mirror unique ID. */ uint32_t md_did; /* Disk unique ID. */ uint8_t md_all; /* Number of disks in mirror. */ uint32_t md_genid; /* Generation ID. 
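* A component whose genid lags the rest of the mirror is treated as broken and is dropped rather than resynchronized (see the genid sweep in g_mirror_update_device()).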
*/ uint32_t md_syncid; /* Synchronization ID. */ uint8_t md_priority; /* Disk priority. */ uint32_t md_slice; /* Slice size. */ uint8_t md_balance; /* Balance type. */ uint64_t md_mediasize; /* Size of the smallest disk in mirror. */ uint32_t md_sectorsize; /* Sector size. */ uint64_t md_sync_offset; /* Synchronized offset. */ uint64_t md_mflags; /* Additional mirror flags. */ uint64_t md_dflags; /* Additional disk flags. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ u_char md_hash[16]; /* MD5 hash. */ }; static __inline void mirror_metadata_encode(struct g_mirror_metadata *md, u_char *data) { MD5_CTX ctx; bcopy(md->md_magic, data, 16); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, 16); le32enc(data + 36, md->md_mid); le32enc(data + 40, md->md_did); *(data + 44) = md->md_all; le32enc(data + 45, md->md_genid); le32enc(data + 49, md->md_syncid); *(data + 53) = md->md_priority; le32enc(data + 54, md->md_slice); *(data + 58) = md->md_balance; le64enc(data + 59, md->md_mediasize); le32enc(data + 67, md->md_sectorsize); le64enc(data + 71, md->md_sync_offset); le64enc(data + 79, md->md_mflags); le64enc(data + 87, md->md_dflags); bcopy(md->md_provider, data + 95, 16); le64enc(data + 111, md->md_provsize); MD5Init(&ctx); MD5Update(&ctx, data, 119); MD5Final(md->md_hash, &ctx); bcopy(md->md_hash, data + 119, 16); } static __inline int mirror_metadata_decode_v0v1(const u_char *data, struct g_mirror_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_mid = le32dec(data + 36); md->md_did = le32dec(data + 40); md->md_all = *(data + 44); md->md_syncid = le32dec(data + 45); md->md_priority = *(data + 49); md->md_slice = le32dec(data + 50); md->md_balance = *(data + 54); md->md_mediasize = le64dec(data + 55); md->md_sectorsize = le32dec(data + 63); md->md_sync_offset = le64dec(data + 67); md->md_mflags = le64dec(data + 75); md->md_dflags = le64dec(data + 83); bcopy(data + 91, md->md_provider, 16); bcopy(data + 107, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 107); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 107, 16) != 0) return (EINVAL); /* New fields. */ md->md_genid = 0; md->md_provsize = 0; return (0); } static __inline int mirror_metadata_decode_v2(const u_char *data, struct g_mirror_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_mid = le32dec(data + 36); md->md_did = le32dec(data + 40); md->md_all = *(data + 44); md->md_genid = le32dec(data + 45); md->md_syncid = le32dec(data + 49); md->md_priority = *(data + 53); md->md_slice = le32dec(data + 54); md->md_balance = *(data + 58); md->md_mediasize = le64dec(data + 59); md->md_sectorsize = le32dec(data + 67); md->md_sync_offset = le64dec(data + 71); md->md_mflags = le64dec(data + 79); md->md_dflags = le64dec(data + 87); bcopy(data + 95, md->md_provider, 16); bcopy(data + 111, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 111); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 111, 16) != 0) return (EINVAL); /* New fields. 
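* md_provsize arrived in version 3, so it is zeroed here. For reference, the v3/v4 byte layout handled below is: 0 magic[16], 16 version, 20 name[16], 36 mid, 40 did, 44 all, 45 genid, 49 syncid, 53 priority, 54 slice, 58 balance, 59 mediasize, 67 sectorsize, 71 sync_offset, 79 mflags, 87 dflags, 95 provider[16], 111 provsize, 119 hash[16], with the MD5 taken over the first 119 bytes; v2 is identical except that provsize is absent and the hash sits at offset 111.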
*/ md->md_provsize = 0; return (0); } static __inline int mirror_metadata_decode_v3v4(const u_char *data, struct g_mirror_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_mid = le32dec(data + 36); md->md_did = le32dec(data + 40); md->md_all = *(data + 44); md->md_genid = le32dec(data + 45); md->md_syncid = le32dec(data + 49); md->md_priority = *(data + 53); md->md_slice = le32dec(data + 54); md->md_balance = *(data + 58); md->md_mediasize = le64dec(data + 59); md->md_sectorsize = le32dec(data + 67); md->md_sync_offset = le64dec(data + 71); md->md_mflags = le64dec(data + 79); md->md_dflags = le64dec(data + 87); bcopy(data + 95, md->md_provider, 16); md->md_provsize = le64dec(data + 111); bcopy(data + 119, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 119); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 119, 16) != 0) return (EINVAL); return (0); } static __inline int mirror_metadata_decode(const u_char *data, struct g_mirror_metadata *md) { int error; bcopy(data, md->md_magic, 16); md->md_version = le32dec(data + 16); switch (md->md_version) { case 0: case 1: error = mirror_metadata_decode_v0v1(data, md); break; case 2: error = mirror_metadata_decode_v2(data, md); break; case 3: case 4: error = mirror_metadata_decode_v3v4(data, md); break; default: error = EINVAL; break; } return (error); } static __inline const char * balance_name(u_int balance) { static const char *algorithms[] = { [G_MIRROR_BALANCE_NONE] = "none", [G_MIRROR_BALANCE_ROUND_ROBIN] = "round-robin", [G_MIRROR_BALANCE_LOAD] = "load", [G_MIRROR_BALANCE_SPLIT] = "split", [G_MIRROR_BALANCE_PREFER] = "prefer", [G_MIRROR_BALANCE_MAX + 1] = "unknown" }; if (balance > G_MIRROR_BALANCE_MAX) balance = G_MIRROR_BALANCE_MAX + 1; return (algorithms[balance]); } static __inline int balance_id(const char *name) { static const char *algorithms[] = { [G_MIRROR_BALANCE_NONE] = "none", [G_MIRROR_BALANCE_ROUND_ROBIN] = "round-robin", [G_MIRROR_BALANCE_LOAD] = "load", [G_MIRROR_BALANCE_SPLIT] = "split", [G_MIRROR_BALANCE_PREFER] = "prefer" }; int n; for (n = G_MIRROR_BALANCE_MIN; n <= G_MIRROR_BALANCE_MAX; n++) { if (strcmp(name, algorithms[n]) == 0) return (n); } return (-1); } static __inline void mirror_metadata_dump(const struct g_mirror_metadata *md) { static const char hex[] = "0123456789abcdef"; char hash[16 * 2 + 1]; u_int i; printf(" magic: %s\n", md->md_magic); printf(" version: %u\n", (u_int)md->md_version); printf(" name: %s\n", md->md_name); printf(" mid: %u\n", (u_int)md->md_mid); printf(" did: %u\n", (u_int)md->md_did); printf(" all: %u\n", (u_int)md->md_all); printf(" genid: %u\n", (u_int)md->md_genid); printf(" syncid: %u\n", (u_int)md->md_syncid); printf(" priority: %u\n", (u_int)md->md_priority); printf(" slice: %u\n", (u_int)md->md_slice); printf(" balance: %s\n", balance_name((u_int)md->md_balance)); printf(" mediasize: %jd\n", (intmax_t)md->md_mediasize); printf("sectorsize: %u\n", (u_int)md->md_sectorsize); printf("syncoffset: %jd\n", (intmax_t)md->md_sync_offset); printf(" mflags:"); if (md->md_mflags == 0) printf(" NONE"); else { if ((md->md_mflags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) printf(" NOFAILSYNC"); if ((md->md_mflags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0) printf(" NOAUTOSYNC"); } printf("\n"); printf(" dflags:"); if (md->md_dflags == 0) printf(" NONE"); else { if ((md->md_dflags & G_MIRROR_DISK_FLAG_DIRTY) != 0) printf(" DIRTY"); if ((md->md_dflags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) printf(" SYNCHRONIZING"); if ((md->md_dflags & G_MIRROR_DISK_FLAG_FORCE_SYNC) 
!= 0) printf(" FORCE_SYNC"); if ((md->md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) printf(" INACTIVE"); } printf("\n"); printf("hcprovider: %s\n", md->md_provider); printf(" provsize: %ju\n", (uintmax_t)md->md_provsize); bzero(hash, sizeof(hash)); for (i = 0; i < 16; i++) { hash[i * 2] = hex[md->md_hash[i] >> 4]; hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; } printf(" MD5 hash: %s\n", hash); } #endif /* !_G_MIRROR_H_ */ Index: head/sys/geom/mirror/g_mirror_ctl.c =================================================================== --- head/sys/geom/mirror/g_mirror_ctl.c (revision 326269) +++ head/sys/geom/mirror/g_mirror_ctl.c (revision 326270) @@ -1,1095 +1,1097 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2009 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct g_mirror_softc * g_mirror_find_device(struct g_class *mp, const char *name) { struct g_mirror_softc *sc; struct g_geom *gp; g_topology_lock(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) continue; if (strcmp(gp->name, name) == 0 || strcmp(sc->sc_name, name) == 0) { g_topology_unlock(); sx_xlock(&sc->sc_lock); return (sc); } } g_topology_unlock(); return (NULL); } static struct g_mirror_disk * g_mirror_find_disk(struct g_mirror_softc *sc, const char *name) { struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SX_XLOCKED); if (strncmp(name, "/dev/", 5) == 0) name += 5; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer == NULL) continue; if (disk->d_consumer->provider == NULL) continue; if (strcmp(disk->d_consumer->provider->name, name) == 0) return (disk); } return (NULL); } static void g_mirror_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; const char *name, *balancep, *prov; intmax_t *slicep, *priority; uint32_t slice; uint8_t balance; int *autosync, *noautosync, *failsync, *nofailsync, *hardcode, *dynamic; int *nargs, do_sync = 0, dirty = 1, do_priority = 0; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1 && *nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } balancep = gctl_get_asciiparam(req, "balance"); if (balancep == NULL) { gctl_error(req, "No '%s' argument.", "balance"); return; } autosync = gctl_get_paraml(req, "autosync", sizeof(*autosync)); if (autosync == NULL) { gctl_error(req, "No '%s' argument.", "autosync"); return; } noautosync = gctl_get_paraml(req, "noautosync", sizeof(*noautosync)); if (noautosync == NULL) { gctl_error(req, "No '%s' argument.", "noautosync"); return; } failsync = gctl_get_paraml(req, "failsync", sizeof(*failsync)); if (failsync == NULL) { gctl_error(req, "No '%s' argument.", "failsync"); return; } nofailsync = gctl_get_paraml(req, "nofailsync", sizeof(*nofailsync)); if (nofailsync == NULL) { gctl_error(req, "No '%s' argument.", "nofailsync"); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "No '%s' argument.", "hardcode"); return; } dynamic = gctl_get_paraml(req, "dynamic", sizeof(*dynamic)); if (dynamic == NULL) { gctl_error(req, "No '%s' argument.", "dynamic"); return; } priority = gctl_get_paraml(req, "priority", sizeof(*priority)); if (priority == NULL) { gctl_error(req, "No '%s' argument.", "priority"); return; } if (*priority < -1 || *priority > 255) { gctl_error(req, "Priority range is 0 to 255, %jd given", *priority); return; } /* * Since we have a priority, we also need a provider now. * Note: be WARNS safe, by always assigning prov and only throw an * error if *priority != -1. 
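* (WARNS here is the build-time warning level: fetching arg1 unconditionally keeps 'prov' initialized on every code path.)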
*/ prov = gctl_get_asciiparam(req, "arg1"); if (*priority > -1) { if (prov == NULL) { gctl_error(req, "Priority needs a disk name"); return; } do_priority = 1; } if (*autosync && *noautosync) { gctl_error(req, "'%s' and '%s' specified.", "autosync", "noautosync"); return; } if (*failsync && *nofailsync) { gctl_error(req, "'%s' and '%s' specified.", "failsync", "nofailsync"); return; } if (*hardcode && *dynamic) { gctl_error(req, "'%s' and '%s' specified.", "hardcode", "dynamic"); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (*balancep == '\0') balance = sc->sc_balance; else { if (balance_id(balancep) == -1) { gctl_error(req, "Invalid balance algorithm."); sx_xunlock(&sc->sc_lock); return; } balance = balance_id(balancep); } slicep = gctl_get_paraml(req, "slice", sizeof(*slicep)); if (slicep == NULL) { gctl_error(req, "No '%s' argument.", "slice"); sx_xunlock(&sc->sc_lock); return; } if (*slicep == -1) slice = sc->sc_slice; else slice = *slicep; /* Enforce usage() of -p not allowing any other options. */ if (do_priority && (*autosync || *noautosync || *failsync || *nofailsync || *hardcode || *dynamic || *slicep != -1 || *balancep != '\0')) { sx_xunlock(&sc->sc_lock); gctl_error(req, "only -p accepted when setting priority"); return; } if (sc->sc_balance == balance && sc->sc_slice == slice && !*autosync && !*noautosync && !*failsync && !*nofailsync && !*hardcode && !*dynamic && !do_priority) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Nothing has changed."); return; } if ((!do_priority && *nargs != 1) || (do_priority && *nargs != 2)) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Invalid number of arguments."); return; } if (g_mirror_ndisks(sc, -1) < sc->sc_ndisks) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Not all disks connected. Try 'forget' command " "first."); return; } sc->sc_balance = balance; sc->sc_slice = slice; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0) { if (*autosync) { sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_NOAUTOSYNC; do_sync = 1; } } else { if (*noautosync) sc->sc_flags |= G_MIRROR_DEVICE_FLAG_NOAUTOSYNC; } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) { if (*failsync) sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_NOFAILSYNC; } else { if (*nofailsync) { sc->sc_flags |= G_MIRROR_DEVICE_FLAG_NOFAILSYNC; dirty = 0; } } LIST_FOREACH(disk, &sc->sc_disks, d_next) { /* * Handle priority first, since we only need one disk, do one * operation on it and then we're done. No need to check other * flags, as usage doesn't allow it. 
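* The 'only -p accepted when setting priority' guard above already rejected any mix of the priority option with other options.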
*/ if (do_priority) { if (strcmp(disk->d_name, prov) == 0) { if (disk->d_priority == *priority) gctl_error(req, "Nothing has changed."); else { disk->d_priority = *priority; g_mirror_update_metadata(disk); } break; } continue; } if (do_sync) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; } if (*hardcode) disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED; else if (*dynamic) disk->d_flags &= ~G_MIRROR_DISK_FLAG_HARDCODED; if (!dirty) disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); if (do_sync) { if (disk->d_state == G_MIRROR_DISK_STATE_STALE) { g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } } sx_xunlock(&sc->sc_lock); } static void g_mirror_create_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while creating %s.", __func__, cp->provider->name)); } static void g_mirror_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_mirror_metadata md; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; struct g_mirror_softc *sc; struct sbuf *sb; const char *name; char param[16]; int *nargs; intmax_t *val; int *ival; const char *sval; int bal; unsigned attached, no, sectorsize; off_t mediasize; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 2) { gctl_error(req, "Too few arguments."); return; } strlcpy(md.md_magic, G_MIRROR_MAGIC, sizeof(md.md_magic)); md.md_version = G_MIRROR_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_mid = arc4random(); md.md_all = *nargs - 1; md.md_genid = 0; md.md_syncid = 1; md.md_sync_offset = 0; val = gctl_get_paraml(req, "slice", sizeof(*val)); if (val == NULL) { gctl_error(req, "No slice argument."); return; } md.md_slice = *val; sval = gctl_get_asciiparam(req, "balance"); if (sval == NULL) { gctl_error(req, "No balance argument."); return; } bal = balance_id(sval); if (bal < 0) { gctl_error(req, "Invalid balance algorithm."); return; } md.md_balance = bal; md.md_mflags = 0; md.md_dflags = 0; ival = gctl_get_paraml(req, "noautosync", sizeof(*ival)); if (ival != NULL && *ival) md.md_mflags |= G_MIRROR_DEVICE_FLAG_NOAUTOSYNC; ival = gctl_get_paraml(req, "nofailsync", sizeof(*ival)); if (ival != NULL && *ival) md.md_mflags |= G_MIRROR_DEVICE_FLAG_NOFAILSYNC; /* These fields not used in manual mode. 
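* Compare the automatic-mirror path of g_mirror_ctl_insert() below, where md_provider may be hardcoded to the provider name and md_provsize is taken from pp->mediasize.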
*/ bzero(md.md_provider, sizeof(md.md_provider)); md.md_provsize = 0; g_topology_lock(); mediasize = OFF_MAX; sectorsize = 0; gp = g_new_geomf(mp, "%s", md.md_name); gp->orphan = g_mirror_create_orphan; cp = g_new_consumer(gp); for (no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", no); err: g_destroy_consumer(cp); g_destroy_geom(gp); g_topology_unlock(); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_MIRROR_DEBUG(1, "Disk %s is invalid.", name); gctl_error(req, "Disk %s is invalid.", name); goto err; } g_attach(cp, pp); if (g_access(cp, 1, 0, 0) != 0) { G_MIRROR_DEBUG(1, "Can't open disk %s.", name); gctl_error(req, "Can't open disk %s.", name); err2: g_detach(cp); goto err; } if (pp->mediasize == 0 || pp->sectorsize == 0) { G_MIRROR_DEBUG(1, "Disk %s has no media.", name); gctl_error(req, "Disk %s has no media.", name); g_access(cp, -1, 0, 0); goto err2; } if (pp->mediasize < mediasize) mediasize = pp->mediasize; if (pp->sectorsize > sectorsize) sectorsize = pp->sectorsize; g_access(cp, -1, 0, 0); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); md.md_mediasize = mediasize; md.md_sectorsize = sectorsize; md.md_mediasize -= (md.md_mediasize % md.md_sectorsize); gp = g_mirror_create(mp, &md, G_MIRROR_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't create %s.", md.md_name); g_topology_unlock(); return; } sc = gp->softc; g_topology_unlock(); sx_xlock(&sc->sc_lock); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING; sb = sbuf_new_auto(); sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name); for (attached = 0, no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_MIRROR_DEBUG(1, "Provider %s disappear?!", name); sbuf_printf(sb, " %s", name); continue; } md.md_did = arc4random(); md.md_priority = no - 1; if (g_mirror_add_disk(sc, pp, &md) != 0) { G_MIRROR_DEBUG(1, "Disk %u (%s) not attached to %s.", no, pp->name, gp->name); sbuf_printf(sb, " %s", pp->name); continue; } attached++; } sbuf_finish(sb); sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING; if (md.md_all != attached || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_HARD); gctl_error(req, "%s", sbuf_data(sb)); } else sx_xunlock(&sc->sc_lock); sbuf_delete(sb); } static void g_mirror_ctl_rebuild(struct gctl_req *req, struct g_class *mp) { struct g_mirror_metadata md; struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_provider *pp; const char *name; char param[16]; int error, *nargs; u_int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } for (i = 1; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); continue; } disk = g_mirror_find_disk(sc, name); if (disk == NULL) { 
gctl_error(req, "No such provider: %s.", name); continue; } if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1 && disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) { /* * This is the last active disk. There will be nothing * to rebuild it from, so deny this request. */ gctl_error(req, "Provider %s is the last active provider in %s.", name, sc->sc_geom->name); break; } /* * Do rebuild by resetting syncid, disconnecting the disk and * connecting it again. */ disk->d_sync.ds_syncid = 0; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0) disk->d_flags |= G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_update_metadata(disk); pp = disk->d_consumer->provider; g_topology_lock(); error = g_mirror_read_metadata(disk->d_consumer, &md); g_topology_unlock(); g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_WAIT); if (error != 0) { gctl_error(req, "Cannot read metadata from %s.", pp->name); continue; } error = g_mirror_add_disk(sc, pp, &md); if (error != 0) { gctl_error(req, "Cannot reconnect component %s.", pp->name); continue; } } sx_xunlock(&sc->sc_lock); } static void g_mirror_ctl_insert(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_metadata md; struct g_provider *pp; struct g_consumer *cp; intmax_t *priority; const char *name; char param[16]; u_char *sector; u_int i, n; int error, *nargs, *hardcode, *inactive; struct { struct g_provider *provider; struct g_consumer *consumer; } *disks; off_t mdsize; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } priority = gctl_get_paraml(req, "priority", sizeof(*priority)); if (priority == NULL) { gctl_error(req, "No '%s' argument.", "priority"); return; } inactive = gctl_get_paraml(req, "inactive", sizeof(*inactive)); if (inactive == NULL) { gctl_error(req, "No '%s' argument.", "inactive"); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "No '%s' argument.", "hardcode"); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (g_mirror_ndisks(sc, -1) < sc->sc_ndisks) { gctl_error(req, "Not all disks connected."); sx_xunlock(&sc->sc_lock); return; } disks = g_malloc(sizeof(*disks) * (*nargs), M_WAITOK | M_ZERO); g_topology_lock(); for (i = 1, n = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); continue; } if (g_mirror_find_disk(sc, name) != NULL) { gctl_error(req, "Provider %s already inserted.", name); continue; } if (strncmp(name, "/dev/", 5) == 0) name += 5; pp = g_provider_by_name(name); if (pp == NULL) { gctl_error(req, "Unknown provider %s.", name); continue; } cp = g_new_consumer(sc->sc_geom); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); gctl_error(req, "Cannot attach to provider %s.", name); continue; } if (g_access(cp, 0, 1, 1) != 0) { gctl_error(req, "Cannot access provider %s.", name); err: g_detach(cp); g_destroy_consumer(cp); continue; } mdsize = (sc->sc_type == G_MIRROR_TYPE_AUTOMATIC) ? 
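/* Automatic mirrors keep their metadata in the provider's last sector, so reserve one sector; manual mirrors keep no on-disk metadata. */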
pp->sectorsize : 0; if (sc->sc_provider->mediasize > pp->mediasize - mdsize) { gctl_error(req, "Provider %s too small.", name); err2: g_access(cp, 0, -1, -1); goto err; } if ((sc->sc_provider->sectorsize % pp->sectorsize) != 0) { gctl_error(req, "Invalid sectorsize of provider %s.", name); goto err2; } if (sc->sc_type != G_MIRROR_TYPE_AUTOMATIC) { g_access(cp, 0, -1, -1); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); sc->sc_ndisks++; g_mirror_fill_metadata(sc, NULL, &md); md.md_priority = *priority; if (*inactive) md.md_dflags |= G_MIRROR_DISK_FLAG_INACTIVE; if (g_mirror_add_disk(sc, pp, &md) != 0) { sc->sc_ndisks--; gctl_error(req, "Disk %s not inserted.", name); } g_topology_lock(); continue; } disks[n].provider = pp; disks[n].consumer = cp; n++; } if (n == 0) { g_topology_unlock(); sx_xunlock(&sc->sc_lock); g_free(disks); return; } sc->sc_ndisks += n; again: for (i = 0; i < n; i++) { if (disks[i].consumer == NULL) continue; g_mirror_fill_metadata(sc, NULL, &md); md.md_priority = *priority; if (*inactive) md.md_dflags |= G_MIRROR_DISK_FLAG_INACTIVE; pp = disks[i].provider; if (*hardcode) { strlcpy(md.md_provider, pp->name, sizeof(md.md_provider)); } else { bzero(md.md_provider, sizeof(md.md_provider)); } md.md_provsize = pp->mediasize; sector = g_malloc(pp->sectorsize, M_WAITOK); mirror_metadata_encode(&md, sector); error = g_write_data(disks[i].consumer, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); g_free(sector); if (error != 0) { gctl_error(req, "Cannot store metadata on %s.", pp->name); g_access(disks[i].consumer, 0, -1, -1); g_detach(disks[i].consumer); g_destroy_consumer(disks[i].consumer); disks[i].consumer = NULL; disks[i].provider = NULL; sc->sc_ndisks--; goto again; } } g_topology_unlock(); if (i == 0) { /* All writes failed. */ sx_xunlock(&sc->sc_lock); g_free(disks); return; } LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } /* * Release provider and wait for retaste. */ g_topology_lock(); for (i = 0; i < n; i++) { if (disks[i].consumer == NULL) continue; g_access(disks[i].consumer, 0, -1, -1); g_detach(disks[i].consumer); g_destroy_consumer(disks[i].consumer); } g_topology_unlock(); sx_xunlock(&sc->sc_lock); g_free(disks); } static void g_mirror_ctl_remove(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; const char *name; char param[16]; int *nargs; u_int i, active; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (g_mirror_ndisks(sc, -1) < sc->sc_ndisks) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Not all disks connected. 
Try 'forget' command " "first."); return; } active = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); for (i = 1; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); continue; } disk = g_mirror_find_disk(sc, name); if (disk == NULL) { gctl_error(req, "No such provider: %s.", name); continue; } if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) { if (active > 1) active--; else { gctl_error(req, "%s: Can't remove the last " "ACTIVE component %s.", sc->sc_geom->name, name); continue; } } g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DESTROY, G_MIRROR_EVENT_DONTWAIT); } sx_xunlock(&sc->sc_lock); } static void g_mirror_ctl_resize(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; uint64_t mediasize; const char *name, *s; char *x; int *nargs; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Missing device."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } s = gctl_get_asciiparam(req, "size"); if (s == NULL) { gctl_error(req, "No '%s' argument.", "size"); return; } mediasize = strtouq(s, &x, 0); if (*x != '\0' || mediasize == 0) { gctl_error(req, "Invalid '%s' argument.", "size"); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } /* Deny shrinking of an opened provider */ if ((g_debugflags & 16) == 0 && sc->sc_provider_open > 0) { if (sc->sc_mediasize > mediasize) { gctl_error(req, "Device %s is busy.", sc->sc_provider->name); sx_xunlock(&sc->sc_lock); return; } } LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (mediasize > disk->d_consumer->provider->mediasize - disk->d_consumer->provider->sectorsize) { gctl_error(req, "Provider %s is too small.", disk->d_name); sx_xunlock(&sc->sc_lock); return; } } /* Update the size. 
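 * The new size was parsed from the "size" gctl parameter above with
 * strtouq().  It is recorded in the softc first, then the metadata on
 * every component is rewritten so that the change survives a reboot,
 * and finally g_resize_provider() announces the new mediasize to
 * whatever is attached to the mirror's provider.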
*/ sc->sc_mediasize = mediasize; LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } g_topology_lock(); g_resize_provider(sc->sc_provider, mediasize); g_topology_unlock(); sx_xunlock(&sc->sc_lock); } static void g_mirror_ctl_deactivate(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; const char *name; char param[16]; int *nargs; u_int i, active; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Too few arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } active = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); for (i = 1; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); continue; } disk = g_mirror_find_disk(sc, name); if (disk == NULL) { gctl_error(req, "No such provider: %s.", name); continue; } if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) { if (active > 1) active--; else { gctl_error(req, "%s: Can't deactivate the " "last ACTIVE component %s.", sc->sc_geom->name, name); continue; } } disk->d_flags |= G_MIRROR_DISK_FLAG_INACTIVE; disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_update_metadata(disk); sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } sx_xunlock(&sc->sc_lock); } static void g_mirror_ctl_forget(struct gctl_req *req, struct g_class *mp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; const char *name; char param[16]; int *nargs; u_int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (g_mirror_ndisks(sc, -1) == sc->sc_ndisks) { sx_xunlock(&sc->sc_lock); G_MIRROR_DEBUG(1, "All disks connected in %s, skipping.", sc->sc_name); continue; } sc->sc_ndisks = g_mirror_ndisks(sc, -1); LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } sx_xunlock(&sc->sc_lock); } } static void g_mirror_ctl_stop(struct gctl_req *req, struct g_class *mp, int wipe) { struct g_mirror_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; int how; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } if (*force) how = G_MIRROR_DESTROY_HARD; else how = G_MIRROR_DESTROY_SOFT; for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_mirror_find_device(mp, name); if (sc == NULL) { 
gctl_error(req, "No such device: %s.", name); return; } g_cancel_event(sc); if (wipe) sc->sc_flags |= G_MIRROR_DEVICE_FLAG_WIPE; error = g_mirror_destroy(sc, how); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_geom->name, error); if (wipe) sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_WIPE; sx_xunlock(&sc->sc_lock); return; } /* No need to unlock, because lock is already dead. */ } } void g_mirror_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_MIRROR_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } g_topology_unlock(); if (strcmp(verb, "configure") == 0) g_mirror_ctl_configure(req, mp); else if (strcmp(verb, "create") == 0) g_mirror_ctl_create(req, mp); else if (strcmp(verb, "rebuild") == 0) g_mirror_ctl_rebuild(req, mp); else if (strcmp(verb, "insert") == 0) g_mirror_ctl_insert(req, mp); else if (strcmp(verb, "remove") == 0) g_mirror_ctl_remove(req, mp); else if (strcmp(verb, "resize") == 0) g_mirror_ctl_resize(req, mp); else if (strcmp(verb, "deactivate") == 0) g_mirror_ctl_deactivate(req, mp); else if (strcmp(verb, "forget") == 0) g_mirror_ctl_forget(req, mp); else if (strcmp(verb, "stop") == 0) g_mirror_ctl_stop(req, mp, 0); else if (strcmp(verb, "destroy") == 0) g_mirror_ctl_stop(req, mp, 1); else gctl_error(req, "Unknown verb."); g_topology_lock(); } Index: head/sys/geom/mountver/g_mountver.c =================================================================== --- head/sys/geom/mountver/g_mountver.c (revision 326269) +++ head/sys/geom/mountver/g_mountver.c (revision 326270) @@ -1,660 +1,662 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 Edward Tomasz Napierala * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, mountver, CTLFLAG_RW, 0, "GEOM_MOUNTVER tunables"); static u_int g_mountver_debug = 0; static u_int g_mountver_check_ident = 1; SYSCTL_UINT(_kern_geom_mountver, OID_AUTO, debug, CTLFLAG_RW, &g_mountver_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_mountver, OID_AUTO, check_ident, CTLFLAG_RW, &g_mountver_check_ident, 0, "Check disk ident when reattaching"); static eventhandler_tag g_mountver_pre_sync = NULL; static void g_mountver_queue(struct bio *bp); static void g_mountver_orphan(struct g_consumer *cp); static void g_mountver_resize(struct g_consumer *cp); static int g_mountver_destroy(struct g_geom *gp, boolean_t force); static g_taste_t g_mountver_taste; static int g_mountver_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_mountver_config(struct gctl_req *req, struct g_class *mp, const char *verb); static void g_mountver_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_mountver_init(struct g_class *mp); static void g_mountver_fini(struct g_class *mp); struct g_class g_mountver_class = { .name = G_MOUNTVER_CLASS_NAME, .version = G_VERSION, .ctlreq = g_mountver_config, .taste = g_mountver_taste, .destroy_geom = g_mountver_destroy_geom, .init = g_mountver_init, .fini = g_mountver_fini }; static void g_mountver_done(struct bio *bp) { struct g_geom *gp; struct bio *pbp; if (bp->bio_error != ENXIO) { g_std_done(bp); return; } /* * When the device goes away, it's possible that a few requests * will be completed with ENXIO before g_mountver_orphan() * gets called. To work around that, we have to queue requests * that failed with ENXIO, in order to send them later.
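 * Concretely: the failed clone is destroyed below, the parent's
 * bio_inbed counter is advanced as if the clone had completed, and the
 * parent bio is put back on sc_queue.  Once the underlying provider
 * reappears, g_mountver_taste() calls g_mountver_send_queued(), which
 * clones and re-issues every queued parent in order.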
*/ gp = bp->bio_from->geom; pbp = bp->bio_parent; KASSERT(pbp->bio_to == LIST_FIRST(&gp->provider), ("parent request was for someone else")); g_destroy_bio(bp); pbp->bio_inbed++; g_mountver_queue(pbp); } static void g_mountver_send(struct bio *bp) { struct g_geom *gp; struct bio *cbp; gp = bp->bio_to->geom; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_mountver_done; g_io_request(cbp, LIST_FIRST(&gp->consumer)); } static void g_mountver_queue(struct bio *bp) { struct g_mountver_softc *sc; struct g_geom *gp; gp = bp->bio_to->geom; sc = gp->softc; mtx_lock(&sc->sc_mtx); TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue); mtx_unlock(&sc->sc_mtx); } static void g_mountver_send_queued(struct g_geom *gp) { struct g_mountver_softc *sc; struct bio *bp; sc = gp->softc; mtx_lock(&sc->sc_mtx); while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) { TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue); G_MOUNTVER_LOGREQ(bp, "Sending queued request."); g_mountver_send(bp); } mtx_unlock(&sc->sc_mtx); } static void g_mountver_discard_queued(struct g_geom *gp) { struct g_mountver_softc *sc; struct bio *bp; sc = gp->softc; mtx_lock(&sc->sc_mtx); while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) { TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue); G_MOUNTVER_LOGREQ(bp, "Discarding queued request."); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_mtx); } static void g_mountver_start(struct bio *bp) { struct g_mountver_softc *sc; struct g_geom *gp; gp = bp->bio_to->geom; sc = gp->softc; G_MOUNTVER_LOGREQ(bp, "Request received."); /* * It is possible that some bios were returned with ENXIO, even though * orphaning hasn't happened yet. In that case, queue all subsequent * requests in order to maintain ordering. */ if (sc->sc_orphaned || !TAILQ_EMPTY(&sc->sc_queue)) { if (sc->sc_shutting_down) { G_MOUNTVER_LOGREQ(bp, "Discarding request due to shutdown."); g_io_deliver(bp, ENXIO); return; } G_MOUNTVER_LOGREQ(bp, "Queueing request."); g_mountver_queue(bp); if (!sc->sc_orphaned) g_mountver_send_queued(gp); } else { G_MOUNTVER_LOGREQ(bp, "Sending request."); g_mountver_send(bp); } } static int g_mountver_access(struct g_provider *pp, int dr, int dw, int de) { struct g_mountver_softc *sc; struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = pp->geom; cp = LIST_FIRST(&gp->consumer); sc = gp->softc; if (sc == NULL && dr <= 0 && dw <= 0 && de <= 0) return (0); KASSERT(sc != NULL, ("Trying to access withered provider \"%s\".", pp->name)); sc->sc_access_r += dr; sc->sc_access_w += dw; sc->sc_access_e += de; if (sc->sc_orphaned) return (0); return (g_access(cp, dr, dw, de)); } static int g_mountver_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp) { struct g_mountver_softc *sc; struct g_geom *gp; struct g_provider *newpp; struct g_consumer *cp; char name[64]; int error; int identsize = DISK_IDENT_SIZE; g_topology_assert(); gp = NULL; newpp = NULL; cp = NULL; snprintf(name, sizeof(name), "%s%s", pp->name, G_MOUNTVER_SUFFIX); LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) { gctl_error(req, "Provider %s already exists.", name); return (EEXIST); } } gp = g_new_geomf(mp, "%s", name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "gmountver", NULL, MTX_DEF | MTX_RECURSE); TAILQ_INIT(&sc->sc_queue); sc->sc_provider_name = strdup(pp->name, M_GEOM); gp->softc = sc; gp->start = g_mountver_start; gp->orphan = g_mountver_orphan; gp->resize = g_mountver_resize; gp->access = g_mountver_access; gp->dumpconf = g_mountver_dumpconf;
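/*
 * This function is reached from g_mountver_ctl_create() below, once per
 * "arg%d" parameter of the gctl request.  For reference, a userland
 * request roughly equivalent to "gmountver create da0" could be built
 * directly with libgeom(3) as sketched here; error handling is omitted
 * and the device name "da0" is only an example (the real gmountver(8)
 * utility goes through geom(8)'s class glue instead):
 *
 *	struct gctl_req *req;
 *	uint32_t version = G_MOUNTVER_VERSION;
 *	int nargs = 1;
 *	const char *errstr;
 *
 *	req = gctl_get_handle();
 *	gctl_ro_param(req, "class", -1, G_MOUNTVER_CLASS_NAME);
 *	gctl_ro_param(req, "verb", -1, "create");
 *	gctl_ro_param(req, "version", sizeof(version), &version);
 *	gctl_ro_param(req, "nargs", sizeof(nargs), &nargs);
 *	gctl_ro_param(req, "arg0", -1, "da0");
 *	errstr = gctl_issue(req);
 *	gctl_free(req);
 */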
newpp = g_new_providerf(gp, "%s", gp->name); newpp->mediasize = pp->mediasize; newpp->sectorsize = pp->sectorsize; newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; if ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0) { G_MOUNTVER_DEBUG(0, "Unmapped supported for %s.", gp->name); newpp->flags |= G_PF_ACCEPT_UNMAPPED; } else { G_MOUNTVER_DEBUG(0, "Unmapped unsupported for %s.", gp->name); newpp->flags &= ~G_PF_ACCEPT_UNMAPPED; } cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach to provider %s.", pp->name); goto fail; } error = g_access(cp, 1, 0, 0); if (error != 0) { gctl_error(req, "Cannot access provider %s.", pp->name); goto fail; } error = g_io_getattr("GEOM::ident", cp, &identsize, sc->sc_ident); g_access(cp, -1, 0, 0); if (error != 0) { if (g_mountver_check_ident) { gctl_error(req, "Cannot get disk ident from %s; error = %d.", pp->name, error); goto fail; } G_MOUNTVER_DEBUG(0, "Cannot get disk ident from %s; error = %d.", pp->name, error); sc->sc_ident[0] = '\0'; } g_error_provider(newpp, 0); G_MOUNTVER_DEBUG(0, "Device %s created.", gp->name); return (0); fail: g_free(sc->sc_provider_name); if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); g_destroy_provider(newpp); g_free(gp->softc); g_destroy_geom(gp); return (error); } static int g_mountver_destroy(struct g_geom *gp, boolean_t force) { struct g_mountver_softc *sc; struct g_provider *pp; g_topology_assert(); if (gp->softc == NULL) return (ENXIO); sc = gp->softc; pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_MOUNTVER_DEBUG(0, "Device %s is still open, so it " "can't be definitively removed.", pp->name); } else { G_MOUNTVER_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } else { G_MOUNTVER_DEBUG(0, "Device %s removed.", gp->name); } if (pp != NULL) g_wither_provider(pp, ENXIO); g_mountver_discard_queued(gp); g_free(sc->sc_provider_name); g_free(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static int g_mountver_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_mountver_destroy(gp, 0)); } static void g_mountver_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; const char *name; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_MOUNTVER_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } if (g_mountver_create(req, mp, pp) != 0) return; } } static struct g_geom * g_mountver_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_mountver_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_geom *gp; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs",
sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); gp = g_mountver_find_geom(mp, name); if (gp == NULL) { G_MOUNTVER_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } error = g_mountver_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", gp->name, error); return; } } } static void g_mountver_orphan(struct g_consumer *cp) { struct g_mountver_softc *sc; g_topology_assert(); sc = cp->geom->softc; sc->sc_orphaned = 1; if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); G_MOUNTVER_DEBUG(0, "%s is offline. Mount verification in progress.", sc->sc_provider_name); } static void g_mountver_resize(struct g_consumer *cp) { struct g_geom *gp; struct g_provider *pp; gp = cp->geom; LIST_FOREACH(pp, &gp->provider, provider) g_resize_provider(pp, cp->provider->mediasize); } static int g_mountver_ident_matches(struct g_geom *gp) { struct g_consumer *cp; struct g_mountver_softc *sc; char ident[DISK_IDENT_SIZE]; int error, identsize = DISK_IDENT_SIZE; sc = gp->softc; cp = LIST_FIRST(&gp->consumer); if (g_mountver_check_ident == 0) return (0); error = g_access(cp, 1, 0, 0); if (error != 0) { G_MOUNTVER_DEBUG(0, "Cannot access %s; " "not attaching; error = %d.", gp->name, error); return (1); } error = g_io_getattr("GEOM::ident", cp, &identsize, ident); g_access(cp, -1, 0, 0); if (error != 0) { G_MOUNTVER_DEBUG(0, "Cannot get disk ident for %s; " "not attaching; error = %d.", gp->name, error); return (1); } if (strcmp(ident, sc->sc_ident) != 0) { G_MOUNTVER_DEBUG(1, "Disk ident for %s (\"%s\") is different " "from expected \"%s\", not attaching.", gp->name, ident, sc->sc_ident); return (1); } return (0); } static struct g_geom * g_mountver_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_mountver_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_MOUNTVER_DEBUG(2, "Tasting %s.", pp->name); /* * Let's check if device already exists. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; /* Already attached? 
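 * That is, is the tasted provider one that this class itself created
 * (the geom's own ".mountver" provider)?  If so, bail out so that a
 * mountver instance is never stacked on top of itself.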
*/ if (pp == LIST_FIRST(&gp->provider)) return (NULL); if (sc->sc_orphaned && strcmp(pp->name, sc->sc_provider_name) == 0) break; } if (gp == NULL) return (NULL); cp = LIST_FIRST(&gp->consumer); g_attach(cp, pp); error = g_mountver_ident_matches(gp); if (error != 0) { g_detach(cp); return (NULL); } if (sc->sc_access_r > 0 || sc->sc_access_w > 0 || sc->sc_access_e > 0) { error = g_access(cp, sc->sc_access_r, sc->sc_access_w, sc->sc_access_e); if (error != 0) { G_MOUNTVER_DEBUG(0, "Cannot access %s; error = %d.", pp->name, error); g_detach(cp); return (NULL); } } g_mountver_send_queued(gp); sc->sc_orphaned = 0; G_MOUNTVER_DEBUG(0, "%s has completed mount verification.", sc->sc_provider_name); return (gp); } static void g_mountver_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_MOUNTVER_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_mountver_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0) { g_mountver_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_mountver_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_mountver_softc *sc; if (pp != NULL || cp != NULL) return; sc = gp->softc; sbuf_printf(sb, "%s%s\n", indent, sc->sc_orphaned ? "OFFLINE" : "ONLINE"); sbuf_printf(sb, "%s%s\n", indent, sc->sc_provider_name); sbuf_printf(sb, "%s%s\n", indent, sc->sc_ident); } static void g_mountver_shutdown_pre_sync(void *arg, int howto) { struct g_mountver_softc *sc; struct g_class *mp; struct g_geom *gp, *gp2; mp = arg; g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if (gp->softc == NULL) continue; sc = gp->softc; sc->sc_shutting_down = 1; if (sc->sc_orphaned) g_mountver_destroy(gp, 1); } g_topology_unlock(); } static void g_mountver_init(struct g_class *mp) { g_mountver_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, g_mountver_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); if (g_mountver_pre_sync == NULL) G_MOUNTVER_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_mountver_fini(struct g_class *mp) { if (g_mountver_pre_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_mountver_pre_sync); } DECLARE_GEOM_CLASS(g_mountver_class, g_mountver); Index: head/sys/geom/mountver/g_mountver.h =================================================================== --- head/sys/geom/mountver/g_mountver.h (revision 326269) +++ head/sys/geom/mountver/g_mountver.h (revision 326270) @@ -1,72 +1,74 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 Edward Tomasz Napierala * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_MOUNTVER_H_ #define _G_MOUNTVER_H_ #define G_MOUNTVER_CLASS_NAME "MOUNTVER" #define G_MOUNTVER_VERSION 4 #define G_MOUNTVER_SUFFIX ".mountver" #ifdef _KERNEL #define G_MOUNTVER_DEBUG(lvl, ...) do { \ if (g_mountver_debug >= (lvl)) { \ printf("GEOM_MOUNTVER"); \ if (g_mountver_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_MOUNTVER_LOGREQ(bp, ...) do { \ if (g_mountver_debug >= 2) { \ printf("GEOM_MOUNTVER[2]: "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) struct g_mountver_softc { TAILQ_HEAD(, bio) sc_queue; struct mtx sc_mtx; char *sc_provider_name; char sc_ident[DISK_IDENT_SIZE]; int sc_orphaned; int sc_shutting_down; int sc_access_r; int sc_access_w; int sc_access_e; }; #endif /* _KERNEL */ #endif /* _G_MOUNTVER_H_ */ Index: head/sys/geom/multipath/g_multipath.c =================================================================== --- head/sys/geom/multipath/g_multipath.c (revision 326269) +++ head/sys/geom/multipath/g_multipath.c (revision 326270) @@ -1,1532 +1,1534 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011-2013 Alexander Motin * Copyright (c) 2006-2007 Matthew Jacob * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Based upon work by Pawel Jakub Dawidek for all of the * fine geom examples, and by Poul Henning Kamp for GEOM * itself, all of which is most gratefully acknowledged. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_multipath, "GEOM multipath support"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, multipath, CTLFLAG_RW, 0, "GEOM_MULTIPATH tunables"); static u_int g_multipath_debug = 0; SYSCTL_UINT(_kern_geom_multipath, OID_AUTO, debug, CTLFLAG_RW, &g_multipath_debug, 0, "Debug level"); static u_int g_multipath_exclusive = 1; SYSCTL_UINT(_kern_geom_multipath, OID_AUTO, exclusive, CTLFLAG_RW, &g_multipath_exclusive, 0, "Exclusively open providers"); static enum { GKT_NIL, GKT_RUN, GKT_DIE } g_multipath_kt_state; static struct bio_queue_head gmtbq; static struct mtx gmtbq_mtx; static int g_multipath_read_metadata(struct g_consumer *cp, struct g_multipath_metadata *md); static int g_multipath_write_metadata(struct g_consumer *cp, struct g_multipath_metadata *md); static void g_multipath_orphan(struct g_consumer *); static void g_multipath_resize(struct g_consumer *); static void g_multipath_start(struct bio *); static void g_multipath_done(struct bio *); static void g_multipath_done_error(struct bio *); static void g_multipath_kt(void *); static int g_multipath_destroy(struct g_geom *); static int g_multipath_destroy_geom(struct gctl_req *, struct g_class *, struct g_geom *); static struct g_geom *g_multipath_find_geom(struct g_class *, const char *); static int g_multipath_rotate(struct g_geom *); static g_taste_t g_multipath_taste; static g_ctl_req_t g_multipath_config; static g_init_t g_multipath_init; static g_fini_t g_multipath_fini; static g_dumpconf_t g_multipath_dumpconf; struct g_class g_multipath_class = { .name = G_MULTIPATH_CLASS_NAME, .version = G_VERSION, .ctlreq = g_multipath_config, .taste = g_multipath_taste, .destroy_geom = g_multipath_destroy_geom, .init = g_multipath_init, .fini = g_multipath_fini }; #define MP_FAIL 0x00000001 #define MP_LOST 0x00000002 #define MP_NEW 0x00000004 #define MP_POSTED 0x00000008 #define MP_BAD (MP_FAIL | MP_LOST | MP_NEW) #define MP_WITHER 0x00000010 #define MP_IDLE 0x00000020 #define MP_IDLE_MASK 0xffffffe0 static int g_multipath_good(struct g_geom *gp) { struct g_consumer *cp; int n = 0; LIST_FOREACH(cp, &gp->consumer, consumer) { if ((cp->index & MP_BAD) == 0) n++; } return (n); } static void g_multipath_fault(struct g_consumer *cp, int cause) { struct g_multipath_softc *sc; struct g_consumer *lcp; struct g_geom *gp; gp = cp->geom; sc = gp->softc; cp->index |= cause; if (g_multipath_good(gp) == 0 && sc->sc_ndisks > 0) { LIST_FOREACH(lcp, &gp->consumer, consumer) { if (lcp->provider == NULL || (lcp->index & (MP_LOST | MP_NEW))) continue; if (sc->sc_ndisks > 1 && lcp == cp) continue; printf("GEOM_MULTIPATH: " "all paths in %s were marked FAIL, restore %s\n", sc->sc_name, lcp->provider->name); lcp->index &= ~MP_FAIL; } } if (cp != sc->sc_active) return; sc->sc_active = NULL; LIST_FOREACH(lcp, &gp->consumer, consumer) { if ((lcp->index & MP_BAD) == 0) { sc->sc_active = lcp; break; } } if (sc->sc_active == NULL) { printf("GEOM_MULTIPATH: out of providers for %s\n", sc->sc_name); } else if (sc->sc_active_active != 1) { printf("GEOM_MULTIPATH: %s is now active path in %s\n", sc->sc_active->provider->name, sc->sc_name); } } static struct g_consumer * g_multipath_choose(struct g_geom *gp, struct bio *bp) { struct g_multipath_softc *sc; struct g_consumer *best, *cp; sc = gp->softc; if (sc->sc_active_active == 0 || (sc->sc_active_active == 2 && bp->bio_cmd != BIO_READ)) 
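/*
 * Active/passive mode, or active/read mode with a non-read request:
 * such I/O always goes to the single currently active path.  Everything
 * else falls through to the load-balancing scan below, which picks the
 * consumer with the fewest outstanding requests (cp->private) and
 * breaks ties by preferring the path that has been idle longest (the
 * MP_IDLE counter kept in the upper bits of cp->index).
 */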
return (sc->sc_active); best = NULL; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->index & MP_BAD) continue; cp->index += MP_IDLE; if (best == NULL || cp->private < best->private || (cp->private == best->private && cp->index > best->index)) best = cp; } if (best != NULL) best->index &= ~MP_IDLE_MASK; return (best); } static void g_mpd(void *arg, int flags __unused) { struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; int w; g_topology_assert(); cp = arg; gp = cp->geom; if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) { w = cp->acw; g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (w > 0 && cp->provider != NULL && (cp->provider->geom->flags & G_GEOM_WITHER) == 0) { cp->index |= MP_WITHER; g_post_event(g_mpd, cp, M_WAITOK, NULL); return; } } sc = gp->softc; mtx_lock(&sc->sc_mtx); if (cp->provider) { printf("GEOM_MULTIPATH: %s removed from %s\n", cp->provider->name, gp->name); g_detach(cp); } g_destroy_consumer(cp); mtx_unlock(&sc->sc_mtx); if (LIST_EMPTY(&gp->consumer)) g_multipath_destroy(gp); } static void g_multipath_orphan(struct g_consumer *cp) { struct g_multipath_softc *sc; uintptr_t *cnt; g_topology_assert(); printf("GEOM_MULTIPATH: %s in %s was disconnected\n", cp->provider->name, cp->geom->name); sc = cp->geom->softc; cnt = (uintptr_t *)&cp->private; mtx_lock(&sc->sc_mtx); sc->sc_ndisks--; g_multipath_fault(cp, MP_LOST); if (*cnt == 0 && (cp->index & MP_POSTED) == 0) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_mpd(cp, 0); } else mtx_unlock(&sc->sc_mtx); } static void g_multipath_resize(struct g_consumer *cp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp1; struct g_provider *pp; struct g_multipath_metadata md; off_t size, psize, ssize; int error; g_topology_assert(); gp = cp->geom; pp = cp->provider; sc = gp->softc; if (sc->sc_stopping) return; if (pp->mediasize < sc->sc_size) { size = pp->mediasize; ssize = pp->sectorsize; } else { size = ssize = OFF_MAX; mtx_lock(&sc->sc_mtx); LIST_FOREACH(cp1, &gp->consumer, consumer) { pp = cp1->provider; if (pp == NULL) continue; if (pp->mediasize < size) { size = pp->mediasize; ssize = pp->sectorsize; } } mtx_unlock(&sc->sc_mtx); if (size == OFF_MAX || size == sc->sc_size) return; } psize = size - ((sc->sc_uuid[0] != 0) ? 
ssize : 0); printf("GEOM_MULTIPATH: %s size changed from %jd to %jd\n", sc->sc_name, sc->sc_pp->mediasize, psize); if (sc->sc_uuid[0] != 0 && size < sc->sc_size) { error = g_multipath_read_metadata(cp, &md); if (error || (strcmp(md.md_magic, G_MULTIPATH_MAGIC) != 0) || (memcmp(md.md_uuid, sc->sc_uuid, sizeof(sc->sc_uuid)) != 0) || (strcmp(md.md_name, sc->sc_name) != 0) || (md.md_size != 0 && md.md_size != size) || (md.md_sectorsize != 0 && md.md_sectorsize != ssize)) { g_multipath_destroy(gp); return; } } sc->sc_size = size; g_resize_provider(sc->sc_pp, psize); if (sc->sc_uuid[0] != 0) { pp = cp->provider; strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic)); memcpy(md.md_uuid, sc->sc_uuid, sizeof (sc->sc_uuid)); strlcpy(md.md_name, sc->sc_name, sizeof(md.md_name)); md.md_version = G_MULTIPATH_VERSION; md.md_size = size; md.md_sectorsize = ssize; md.md_active_active = sc->sc_active_active; error = g_multipath_write_metadata(cp, &md); if (error != 0) printf("GEOM_MULTIPATH: Can't update metadata on %s " "(%d)\n", pp->name, error); } } static void g_multipath_start(struct bio *bp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct bio *cbp; uintptr_t *cnt; gp = bp->bio_to->geom; sc = gp->softc; KASSERT(sc != NULL, ("NULL sc")); cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } mtx_lock(&sc->sc_mtx); cp = g_multipath_choose(gp, bp); if (cp == NULL) { mtx_unlock(&sc->sc_mtx); g_destroy_bio(cbp); g_io_deliver(bp, ENXIO); return; } if ((uintptr_t)bp->bio_driver1 < sc->sc_ndisks) bp->bio_driver1 = (void *)(uintptr_t)sc->sc_ndisks; cnt = (uintptr_t *)&cp->private; (*cnt)++; mtx_unlock(&sc->sc_mtx); cbp->bio_done = g_multipath_done; g_io_request(cbp, cp); } static void g_multipath_done(struct bio *bp) { struct g_multipath_softc *sc; struct g_consumer *cp; uintptr_t *cnt; if (bp->bio_error == ENXIO || bp->bio_error == EIO) { mtx_lock(&gmtbq_mtx); bioq_insert_tail(&gmtbq, bp); mtx_unlock(&gmtbq_mtx); wakeup(&g_multipath_kt_state); } else { cp = bp->bio_from; sc = cp->geom->softc; cnt = (uintptr_t *)&cp->private; mtx_lock(&sc->sc_mtx); (*cnt)--; if (*cnt == 0 && (cp->index & MP_LOST)) { if (g_post_event(g_mpd, cp, M_NOWAIT, NULL) == 0) cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); } else mtx_unlock(&sc->sc_mtx); g_std_done(bp); } } static void g_multipath_done_error(struct bio *bp) { struct bio *pbp; struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; struct g_provider *pp; uintptr_t *cnt; /* * If we had a failure, we have to check first to see * whether the consumer it failed on was the currently * active consumer (i.e., this is the first in perhaps * a number of failures). If so, we then switch consumers * to the next available consumer. */ pbp = bp->bio_parent; gp = pbp->bio_to->geom; sc = gp->softc; cp = bp->bio_from; pp = cp->provider; cnt = (uintptr_t *)&cp->private; mtx_lock(&sc->sc_mtx); if ((cp->index & MP_FAIL) == 0) { printf("GEOM_MULTIPATH: Error %d, %s in %s marked FAIL\n", bp->bio_error, pp->name, sc->sc_name); g_multipath_fault(cp, MP_FAIL); } (*cnt)--; if (*cnt == 0 && (cp->index & (MP_LOST | MP_POSTED)) == MP_LOST) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_post_event(g_mpd, cp, M_WAITOK, NULL); } else mtx_unlock(&sc->sc_mtx); /* * If we can fruitfully restart the I/O, do so. 
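 * The parent's bio_driver1 was set in g_multipath_start() to (at least)
 * the number of paths present when the request was dispatched, and
 * bio_children counts the clones issued so far.  While fewer clones
 * than paths have been tried, the parent is pushed through
 * g_multipath_start() again so that g_multipath_choose() selects
 * another, non-failed path; otherwise every path has had its chance and
 * the error is delivered upstream via g_std_done().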
*/ if (pbp->bio_children < (uintptr_t)pbp->bio_driver1) { pbp->bio_inbed++; g_destroy_bio(bp); g_multipath_start(pbp); } else { g_std_done(bp); } } static void g_multipath_kt(void *arg) { g_multipath_kt_state = GKT_RUN; mtx_lock(&gmtbq_mtx); while (g_multipath_kt_state == GKT_RUN) { for (;;) { struct bio *bp; bp = bioq_takefirst(&gmtbq); if (bp == NULL) break; mtx_unlock(&gmtbq_mtx); g_multipath_done_error(bp); mtx_lock(&gmtbq_mtx); } if (g_multipath_kt_state != GKT_RUN) break; msleep(&g_multipath_kt_state, &gmtbq_mtx, PRIBIO, "gkt:wait", 0); } mtx_unlock(&gmtbq_mtx); wakeup(&g_multipath_kt_state); kproc_exit(0); } static int g_multipath_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp, *badcp = NULL; struct g_multipath_softc *sc; int error; gp = pp->geom; /* Error used if we have no valid consumers. */ error = (dr > 0 || dw > 0 || de > 0) ? ENXIO : 0; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->index & MP_WITHER) continue; error = g_access(cp, dr, dw, de); if (error) { badcp = cp; goto fail; } } if (error != 0) return (error); sc = gp->softc; sc->sc_opened += dr + dw + de; if (sc->sc_stopping && sc->sc_opened == 0) g_multipath_destroy(gp); return (0); fail: LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp == badcp) break; if (cp->index & MP_WITHER) continue; (void) g_access(cp, -dr, -dw, -de); } return (error); } static struct g_geom * g_multipath_create(struct g_class *mp, struct g_multipath_metadata *md) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_provider *pp; g_topology_assert(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || sc->sc_stopping) continue; if (strcmp(gp->name, md->md_name) == 0) { printf("GEOM_MULTIPATH: name %s already exists\n", md->md_name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "multipath", NULL, MTX_DEF); memcpy(sc->sc_uuid, md->md_uuid, sizeof (sc->sc_uuid)); memcpy(sc->sc_name, md->md_name, sizeof (sc->sc_name)); sc->sc_active_active = md->md_active_active; sc->sc_size = md->md_size; gp->softc = sc; gp->start = g_multipath_start; gp->orphan = g_multipath_orphan; gp->resize = g_multipath_resize; gp->access = g_multipath_access; gp->dumpconf = g_multipath_dumpconf; pp = g_new_providerf(gp, "multipath/%s", md->md_name); pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; if (md->md_size != 0) { pp->mediasize = md->md_size - ((md->md_uuid[0] != 0) ? 
md->md_sectorsize : 0); pp->sectorsize = md->md_sectorsize; } sc->sc_pp = pp; g_error_provider(pp, 0); printf("GEOM_MULTIPATH: %s created\n", gp->name); return (gp); } static int g_multipath_add_disk(struct g_geom *gp, struct g_provider *pp) { struct g_multipath_softc *sc; struct g_consumer *cp, *nxtcp; int error, acr, acw, ace; g_topology_assert(); sc = gp->softc; KASSERT(sc, ("no softc")); /* * Make sure that the passed provider isn't already attached */ LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider == pp) break; } if (cp) { printf("GEOM_MULTIPATH: provider %s already attached to %s\n", pp->name, gp->name); return (EEXIST); } nxtcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; cp->private = NULL; cp->index = MP_NEW; error = g_attach(cp, pp); if (error != 0) { printf("GEOM_MULTIPATH: cannot attach %s to %s", pp->name, sc->sc_name); g_destroy_consumer(cp); return (error); } /* * Set access permissions on new consumer to match other consumers */ if (sc->sc_pp) { acr = sc->sc_pp->acr; acw = sc->sc_pp->acw; ace = sc->sc_pp->ace; } else acr = acw = ace = 0; if (g_multipath_exclusive) { acr++; acw++; ace++; } error = g_access(cp, acr, acw, ace); if (error) { printf("GEOM_MULTIPATH: cannot set access in " "attaching %s to %s (%d)\n", pp->name, sc->sc_name, error); g_detach(cp); g_destroy_consumer(cp); return (error); } if (sc->sc_size == 0) { sc->sc_size = pp->mediasize - ((sc->sc_uuid[0] != 0) ? pp->sectorsize : 0); sc->sc_pp->mediasize = sc->sc_size; sc->sc_pp->sectorsize = pp->sectorsize; } if (sc->sc_pp->stripesize == 0 && sc->sc_pp->stripeoffset == 0) { sc->sc_pp->stripesize = pp->stripesize; sc->sc_pp->stripeoffset = pp->stripeoffset; } sc->sc_pp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED; mtx_lock(&sc->sc_mtx); cp->index = 0; sc->sc_ndisks++; mtx_unlock(&sc->sc_mtx); printf("GEOM_MULTIPATH: %s added to %s\n", pp->name, sc->sc_name); if (sc->sc_active == NULL) { sc->sc_active = cp; if (sc->sc_active_active != 1) printf("GEOM_MULTIPATH: %s is now active path in %s\n", pp->name, sc->sc_name); } return (0); } static int g_multipath_destroy(struct g_geom *gp) { struct g_multipath_softc *sc; struct g_consumer *cp, *cp1; g_topology_assert(); if (gp->softc == NULL) return (ENXIO); sc = gp->softc; if (!sc->sc_stopping) { printf("GEOM_MULTIPATH: destroying %s\n", gp->name); sc->sc_stopping = 1; } if (sc->sc_opened != 0) { g_wither_provider(sc->sc_pp, ENXIO); sc->sc_pp = NULL; return (EINPROGRESS); } LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { mtx_lock(&sc->sc_mtx); if ((cp->index & MP_POSTED) == 0) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_mpd(cp, 0); if (cp1 == NULL) return(0); /* Recursion happened. 
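 * g_mpd() detached the last consumer, so it re-entered
 * g_multipath_destroy() itself and the softc (including sc_mtx) may
 * already be freed; cp1 == NULL means this was the final list element,
 * so return without touching sc again.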
*/ } else mtx_unlock(&sc->sc_mtx); } if (!LIST_EMPTY(&gp->consumer)) return (EINPROGRESS); mtx_destroy(&sc->sc_mtx); g_free(gp->softc); gp->softc = NULL; printf("GEOM_MULTIPATH: %s destroyed\n", gp->name); g_wither_geom(gp, ENXIO); return (0); } static int g_multipath_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_multipath_destroy(gp)); } static int g_multipath_rotate(struct g_geom *gp) { struct g_consumer *lcp, *first_good_cp = NULL; struct g_multipath_softc *sc = gp->softc; int active_cp_seen = 0; g_topology_assert(); if (sc == NULL) return (ENXIO); LIST_FOREACH(lcp, &gp->consumer, consumer) { if ((lcp->index & MP_BAD) == 0) { if (first_good_cp == NULL) first_good_cp = lcp; if (active_cp_seen) break; } if (sc->sc_active == lcp) active_cp_seen = 1; } if (lcp == NULL) lcp = first_good_cp; if (lcp && lcp != sc->sc_active) { sc->sc_active = lcp; if (sc->sc_active_active != 1) printf("GEOM_MULTIPATH: %s is now active path in %s\n", lcp->provider->name, sc->sc_name); } return (0); } static void g_multipath_init(struct g_class *mp) { bioq_init(&gmtbq); mtx_init(&gmtbq_mtx, "gmtbq", NULL, MTX_DEF); kproc_create(g_multipath_kt, mp, NULL, 0, 0, "g_mp_kt"); } static void g_multipath_fini(struct g_class *mp) { if (g_multipath_kt_state == GKT_RUN) { mtx_lock(&gmtbq_mtx); g_multipath_kt_state = GKT_DIE; wakeup(&g_multipath_kt_state); msleep(&g_multipath_kt_state, &gmtbq_mtx, PRIBIO, "gmp:fini", 0); mtx_unlock(&gmtbq_mtx); } } static int g_multipath_read_metadata(struct g_consumer *cp, struct g_multipath_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); multipath_metadata_decode(buf, md); g_free(buf); return (0); } static int g_multipath_write_metadata(struct g_consumer *cp, struct g_multipath_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 1, 1); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); multipath_metadata_encode(md, buf); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, -1, -1, -1); g_free(buf); return (error); } static struct g_geom * g_multipath_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_multipath_metadata md; struct g_multipath_softc *sc; struct g_consumer *cp; struct g_geom *gp, *gp1; int error, isnew; g_topology_assert(); gp = g_new_geomf(mp, "multipath:taste"); gp->start = g_multipath_start; gp->access = g_multipath_access; gp->orphan = g_multipath_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_multipath_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_MULTIPATH_MAGIC) != 0) { if (g_multipath_debug) printf("%s is not MULTIPATH\n", pp->name); return (NULL); } if (md.md_version != G_MULTIPATH_VERSION) { printf("%s has multipath metadata version %d; this module is " "version %d: rejecting\n", pp->name, md.md_version, G_MULTIPATH_VERSION); return (NULL); } if (md.md_size != 0 && md.md_size != pp->mediasize) return (NULL); if (md.md_sectorsize != 0 && md.md_sectorsize != pp->sectorsize) return (NULL); if (g_multipath_debug)
printf("MULTIPATH: %s/%s\n", md.md_name, md.md_uuid); /* * Let's check if such a device already is present. We check against * uuid alone first because that's the true distinguishor. If that * passes, then we check for name conflicts. If there are conflicts, * modify the name. * * The whole purpose of this is to solve the problem that people don't * pick good unique names, but good unique names (like uuids) are a * pain to use. So, we allow people to build GEOMs with friendly names * and uuids, and modify the names in case there's a collision. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || sc->sc_stopping) continue; if (strncmp(md.md_uuid, sc->sc_uuid, sizeof(md.md_uuid)) == 0) break; } LIST_FOREACH(gp1, &mp->geom, geom) { if (gp1 == gp) continue; sc = gp1->softc; if (sc == NULL || sc->sc_stopping) continue; if (strncmp(md.md_name, sc->sc_name, sizeof(md.md_name)) == 0) break; } /* * If gp is NULL, we had no extant MULTIPATH geom with this uuid. * * If gp1 is *not* NULL, that means we have a MULTIPATH geom extant * with the same name (but a different UUID). * * If gp is NULL, then modify the name with a random number and * complain, but allow the creation of the geom to continue. * * If gp is *not* NULL, just use the geom's name as we're attaching * this disk to the (previously generated) name. */ if (gp1) { sc = gp1->softc; if (gp == NULL) { char buf[16]; u_long rand = random(); snprintf(buf, sizeof (buf), "%s-%lu", md.md_name, rand); printf("GEOM_MULTIPATH: geom %s/%s exists already\n", sc->sc_name, sc->sc_uuid); printf("GEOM_MULTIPATH: %s will be (temporarily) %s\n", md.md_uuid, buf); strlcpy(md.md_name, buf, sizeof(md.md_name)); } else { strlcpy(md.md_name, sc->sc_name, sizeof(md.md_name)); } } if (gp == NULL) { gp = g_multipath_create(mp, &md); if (gp == NULL) { printf("GEOM_MULTIPATH: cannot create geom %s/%s\n", md.md_name, md.md_uuid); return (NULL); } isnew = 1; } else { isnew = 0; } sc = gp->softc; KASSERT(sc != NULL, ("sc is NULL")); error = g_multipath_add_disk(gp, pp); if (error != 0) { if (isnew) g_multipath_destroy(gp); return (NULL); } return (gp); } static void g_multipath_ctl_add_name(struct gctl_req *req, struct g_class *mp, const char *name) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; const char *mpname; static const char devpf[6] = "/dev/"; int error; g_topology_assert(); mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s is invalid", mpname); return; } sc = gp->softc; if (strncmp(name, devpf, 5) == 0) name += 5; pp = g_provider_by_name(name); if (pp == NULL) { gctl_error(req, "Provider %s is invalid", name); return; } /* * Check to make sure parameters match. */ LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider == pp) { gctl_error(req, "provider %s is already there", pp->name); return; } } if (sc->sc_pp->mediasize != 0 && sc->sc_pp->mediasize + (sc->sc_uuid[0] != 0 ? pp->sectorsize : 0) != pp->mediasize) { gctl_error(req, "Providers size mismatch %jd != %jd", (intmax_t) sc->sc_pp->mediasize + (sc->sc_uuid[0] != 0 ? 
pp->sectorsize : 0), (intmax_t) pp->mediasize); return; } if (sc->sc_pp->sectorsize != 0 && sc->sc_pp->sectorsize != pp->sectorsize) { gctl_error(req, "Providers sectorsize mismatch %u != %u", sc->sc_pp->sectorsize, pp->sectorsize); return; } error = g_multipath_add_disk(gp, pp); if (error != 0) gctl_error(req, "Provider addition error: %d", error); } static void g_multipath_ctl_prefer(struct gctl_req *req, struct g_class *mp) { struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; const char *name, *mpname; static const char devpf[6] = "/dev/"; int *nargs; g_topology_assert(); mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s is invalid", mpname); return; } sc = gp->softc; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No 'nargs' argument"); return; } if (*nargs != 2) { gctl_error(req, "Missing device"); return; } name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } if (strncmp(name, devpf, 5) == 0) { name += 5; } LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider != NULL && strcmp(cp->provider->name, name) == 0) break; } if (cp == NULL) { gctl_error(req, "Provider %s not found", name); return; } mtx_lock(&sc->sc_mtx); if (cp->index & MP_BAD) { gctl_error(req, "Consumer %s is invalid", name); mtx_unlock(&sc->sc_mtx); return; } /* Here when the consumer is present and in good shape */ sc->sc_active = cp; if (!sc->sc_active_active) printf("GEOM_MULTIPATH: %s now active path in %s\n", sc->sc_active->provider->name, sc->sc_name); mtx_unlock(&sc->sc_mtx); } static void g_multipath_ctl_add(struct gctl_req *req, struct g_class *mp) { struct g_multipath_softc *sc; struct g_geom *gp; const char *mpname, *name; mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s not found", mpname); return; } sc = gp->softc; name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } g_multipath_ctl_add_name(req, mp, name); } static void g_multipath_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_multipath_metadata md; struct g_multipath_softc *sc; struct g_geom *gp; const char *mpname, *name; char param[16]; int *nargs, i, *val; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No 'nargs' argument"); return; } if (*nargs < 2) { gctl_error(req, "Wrong number of arguments."); return; } mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp != NULL) { gctl_error(req, "Device %s already exists", mpname); return; } memset(&md, 0, sizeof(md)); strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic)); md.md_version = G_MULTIPATH_VERSION; strlcpy(md.md_name, mpname, sizeof(md.md_name)); md.md_size = 0; md.md_sectorsize = 0; md.md_uuid[0] = 0; md.md_active_active = 0; val = gctl_get_paraml(req, "active_active", sizeof(*val)); if (val != NULL && *val != 0) md.md_active_active = 1; val = gctl_get_paraml(req, "active_read", sizeof(*val)); if (val != NULL && *val != 0) md.md_active_active = 2; gp = g_multipath_create(mp, &md); if (gp == NULL) { gctl_error(req, "GEOM_MULTIPATH: cannot create geom %s/%s\n", md.md_name, md.md_uuid); return; } sc
= gp->softc; for (i = 1; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); g_multipath_ctl_add_name(req, mp, name); } if (sc->sc_ndisks != (*nargs - 1)) g_multipath_destroy(gp); } static void g_multipath_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; struct g_multipath_metadata md; const char *name; int error, *val; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } sc = gp->softc; val = gctl_get_paraml(req, "active_active", sizeof(*val)); if (val != NULL && *val != 0) sc->sc_active_active = 1; val = gctl_get_paraml(req, "active_read", sizeof(*val)); if (val != NULL && *val != 0) sc->sc_active_active = 2; val = gctl_get_paraml(req, "active_passive", sizeof(*val)); if (val != NULL && *val != 0) sc->sc_active_active = 0; if (sc->sc_uuid[0] != 0 && sc->sc_active != NULL) { cp = sc->sc_active; pp = cp->provider; strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic)); memcpy(md.md_uuid, sc->sc_uuid, sizeof (sc->sc_uuid)); strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_version = G_MULTIPATH_VERSION; md.md_size = pp->mediasize; md.md_sectorsize = pp->sectorsize; md.md_active_active = sc->sc_active_active; error = g_multipath_write_metadata(cp, &md); if (error != 0) gctl_error(req, "Can't update metadata on %s (%d)", pp->name, error); } } static void g_multipath_ctl_fail(struct gctl_req *req, struct g_class *mp, int fail) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp; const char *mpname, *name; int found; mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s not found", mpname); return; } sc = gp->softc; name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } found = 0; mtx_lock(&sc->sc_mtx); LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider != NULL && strcmp(cp->provider->name, name) == 0 && (cp->index & MP_LOST) == 0) { found = 1; if (!fail == !(cp->index & MP_FAIL)) continue; printf("GEOM_MULTIPATH: %s in %s is marked %s.\n", name, sc->sc_name, fail ? 
"FAIL" : "OK"); if (fail) { g_multipath_fault(cp, MP_FAIL); } else { cp->index &= ~MP_FAIL; } } } mtx_unlock(&sc->sc_mtx); if (found == 0) gctl_error(req, "Provider %s not found", name); } static void g_multipath_ctl_remove(struct gctl_req *req, struct g_class *mp) { struct g_multipath_softc *sc; struct g_geom *gp; struct g_consumer *cp, *cp1; const char *mpname, *name; uintptr_t *cnt; int found; mpname = gctl_get_asciiparam(req, "arg0"); if (mpname == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, mpname); if (gp == NULL) { gctl_error(req, "Device %s not found", mpname); return; } sc = gp->softc; name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg1' argument"); return; } found = 0; mtx_lock(&sc->sc_mtx); LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { if (cp->provider != NULL && strcmp(cp->provider->name, name) == 0 && (cp->index & MP_LOST) == 0) { found = 1; printf("GEOM_MULTIPATH: removing %s from %s\n", cp->provider->name, cp->geom->name); sc->sc_ndisks--; g_multipath_fault(cp, MP_LOST); cnt = (uintptr_t *)&cp->private; if (*cnt == 0 && (cp->index & MP_POSTED) == 0) { cp->index |= MP_POSTED; mtx_unlock(&sc->sc_mtx); g_mpd(cp, 0); if (cp1 == NULL) return; /* Recursion happened. */ mtx_lock(&sc->sc_mtx); } } } mtx_unlock(&sc->sc_mtx); if (found == 0) gctl_error(req, "Provider %s not found", name); } static struct g_geom * g_multipath_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; struct g_multipath_softc *sc; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || sc->sc_stopping) continue; if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_multipath_ctl_stop(struct gctl_req *req, struct g_class *mp) { struct g_geom *gp; const char *name; int error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } error = g_multipath_destroy(gp); if (error != 0 && error != EINPROGRESS) gctl_error(req, "failed to stop %s (err=%d)", name, error); } static void g_multipath_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; struct g_provider *pp; const char *name; uint8_t *buf; int error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } sc = gp->softc; if (sc->sc_uuid[0] != 0 && sc->sc_active != NULL) { cp = sc->sc_active; pp = cp->provider; error = g_access(cp, 1, 1, 1); if (error != 0) { gctl_error(req, "Can't open %s (%d)", pp->name, error); goto destroy; } g_topology_unlock(); buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, -1, -1, -1); if (error != 0) gctl_error(req, "Can't erase metadata on %s (%d)", pp->name, error); } destroy: error = g_multipath_destroy(gp); if (error != 0 && error != EINPROGRESS) gctl_error(req, "failed to destroy %s (err=%d)", name, error); } static void g_multipath_ctl_rotate(struct gctl_req *req, struct g_class *mp) { struct g_geom *gp; const char *name; int error; g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 
'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } error = g_multipath_rotate(gp); if (error != 0) { gctl_error(req, "failed to rotate %s (err=%d)", name, error); } } static void g_multipath_ctl_getactive(struct gctl_req *req, struct g_class *mp) { struct sbuf *sb; struct g_geom *gp; struct g_multipath_softc *sc; struct g_consumer *cp; const char *name; int empty; sb = sbuf_new_auto(); g_topology_assert(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg0' argument"); return; } gp = g_multipath_find_geom(mp, name); if (gp == NULL) { gctl_error(req, "Device %s is invalid", name); return; } sc = gp->softc; if (sc->sc_active_active == 1) { empty = 1; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->index & MP_BAD) continue; if (!empty) sbuf_cat(sb, " "); sbuf_cat(sb, cp->provider->name); empty = 0; } if (empty) sbuf_cat(sb, "none"); sbuf_cat(sb, "\n"); } else if (sc->sc_active && sc->sc_active->provider) { sbuf_printf(sb, "%s\n", sc->sc_active->provider->name); } else { sbuf_printf(sb, "none\n"); } sbuf_finish(sb); gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } static void g_multipath_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No 'version' argument"); } else if (*version != G_MULTIPATH_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync"); } else if (strcmp(verb, "add") == 0) { g_multipath_ctl_add(req, mp); } else if (strcmp(verb, "prefer") == 0) { g_multipath_ctl_prefer(req, mp); } else if (strcmp(verb, "create") == 0) { g_multipath_ctl_create(req, mp); } else if (strcmp(verb, "configure") == 0) { g_multipath_ctl_configure(req, mp); } else if (strcmp(verb, "stop") == 0) { g_multipath_ctl_stop(req, mp); } else if (strcmp(verb, "destroy") == 0) { g_multipath_ctl_destroy(req, mp); } else if (strcmp(verb, "fail") == 0) { g_multipath_ctl_fail(req, mp, 1); } else if (strcmp(verb, "restore") == 0) { g_multipath_ctl_fail(req, mp, 0); } else if (strcmp(verb, "remove") == 0) { g_multipath_ctl_remove(req, mp); } else if (strcmp(verb, "rotate") == 0) { g_multipath_ctl_rotate(req, mp); } else if (strcmp(verb, "getactive") == 0) { g_multipath_ctl_getactive(req, mp); } else { gctl_error(req, "Unknown verb %s", verb); } } static void g_multipath_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_multipath_softc *sc; int good; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (cp != NULL) { sbuf_printf(sb, "%s%s\n", indent, (cp->index & MP_NEW) ? "NEW" : (cp->index & MP_LOST) ? "LOST" : (cp->index & MP_FAIL) ? "FAIL" : (sc->sc_active_active == 1 || sc->sc_active == cp) ? "ACTIVE" : sc->sc_active_active == 2 ? "READ" : "PASSIVE"); } else { good = g_multipath_good(gp); sbuf_printf(sb, "%s%s\n", indent, good == 0 ? "BROKEN" : (good != sc->sc_ndisks || sc->sc_ndisks == 1) ? "DEGRADED" : "OPTIMAL"); } if (cp == NULL && pp == NULL) { sbuf_printf(sb, "%s%s\n", indent, sc->sc_uuid); sbuf_printf(sb, "%sActive/%s\n", indent, sc->sc_active_active == 2 ? "Read" : sc->sc_active_active == 1 ? "Active" : "Passive"); sbuf_printf(sb, "%s%s\n", indent, sc->sc_uuid[0] == 0 ? 
"MANUAL" : "AUTOMATIC"); } } DECLARE_GEOM_CLASS(g_multipath_class, g_multipath); Index: head/sys/geom/multipath/g_multipath.h =================================================================== --- head/sys/geom/multipath/g_multipath.h (revision 326269) +++ head/sys/geom/multipath/g_multipath.h (revision 326270) @@ -1,110 +1,112 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006-2007 Matthew Jacob * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Based upon work by Pawel Jakub Dawidek for all of the * fine geom examples, and by Poul Henning Kamp for GEOM * itself, all of which is most gratefully acknowledged. 
*/ #ifndef _G_MULTIPATH_H_ #define _G_MULTIPATH_H_ #define G_MULTIPATH_CLASS_NAME "MULTIPATH" #define G_MULTIPATH_VERSION 1 #define G_MULTIPATH_MAGIC "GEOM::MULTIPATH" #include #ifdef _KERNEL struct g_multipath_softc { struct g_provider * sc_pp; struct g_consumer * sc_active; struct mtx sc_mtx; char sc_name[16]; char sc_uuid[40]; off_t sc_size; int sc_opened; int sc_stopping; int sc_ndisks; int sc_active_active; /* Active/Active mode */ }; #endif /* _KERNEL */ struct g_multipath_metadata { char md_magic[16]; /* Magic Value */ char md_uuid[40]; /* more magic */ char md_name[16]; /* a friendly name */ uint32_t md_version; /* version */ uint32_t md_sectorsize; /* sectorsize of provider */ uint64_t md_size; /* absolute size of provider */ uint8_t md_active_active; /* Active/Active mode */ }; static __inline void multipath_metadata_encode(const struct g_multipath_metadata *, u_char *); static __inline void multipath_metadata_decode(u_char *, struct g_multipath_metadata *); static __inline void multipath_metadata_encode(const struct g_multipath_metadata *md, u_char *data) { bcopy(md->md_magic, data, sizeof(md->md_magic)); data += sizeof(md->md_magic); bcopy(md->md_uuid, data, sizeof(md->md_uuid)); data += sizeof(md->md_uuid); bcopy(md->md_name, data, sizeof(md->md_name)); data += sizeof(md->md_name); le32enc(data, md->md_version); data += sizeof(md->md_version); le32enc(data, md->md_sectorsize); data += sizeof(md->md_sectorsize); le64enc(data, md->md_size); data += sizeof(md->md_size); *data = md->md_active_active; } static __inline void multipath_metadata_decode(u_char *data, struct g_multipath_metadata *md) { bcopy(data, md->md_magic, sizeof(md->md_magic)); data += sizeof(md->md_magic); bcopy(data, md->md_uuid, sizeof(md->md_uuid)); data += sizeof(md->md_uuid); bcopy(data, md->md_name, sizeof(md->md_name)); data += sizeof(md->md_name); md->md_version = le32dec(data); data += sizeof(md->md_version); md->md_sectorsize = le32dec(data); data += sizeof(md->md_sectorsize); md->md_size = le64dec(data); data += sizeof(md->md_size); md->md_active_active = *data; } #endif /* _G_MULTIPATH_H_ */ Index: head/sys/geom/nop/g_nop.c =================================================================== --- head/sys/geom/nop/g_nop.c (revision 326269) +++ head/sys/geom/nop/g_nop.c (revision 326270) @@ -1,704 +1,706 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
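The multipath_metadata_encode()/decode() helpers above fix the on-disk byte order with le32enc()/le64dec(), so the metadata sector stays readable across machines of either endianness. As a sanity check, here is a hedged round-trip sketch for a FreeBSD-like userland where <sys/endian.h> provides those primitives; MULTIPATH_METADATA_SIZE is a name invented for this example, and the offsets are simply the fixed field widths from the struct above.

/* Hypothetical round-trip check of the metadata layout above. */
#include <sys/endian.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 16 + 40 + 16 bytes of strings, two 32-bit fields, one 64-bit, one byte. */
#define MULTIPATH_METADATA_SIZE	(16 + 40 + 16 + 4 + 4 + 8 + 1)	/* invented name */

int
main(void)
{
	u_char buf[MULTIPATH_METADATA_SIZE], *p = buf;
	uint32_t version, sectorsize;
	uint64_t size;

	/* Encode a sample record the same way multipath_metadata_encode() does. */
	memset(buf, 0, sizeof(buf));
	memcpy(p, "GEOM::MULTIPATH", 16); p += 16;	/* md_magic */
	p += 40;					/* md_uuid (left empty) */
	memcpy(p, "mp0", 4); p += 16;			/* md_name */
	le32enc(p, 1); p += 4;				/* md_version */
	le32enc(p, 512); p += 4;			/* md_sectorsize */
	le64enc(p, 1ULL << 30); p += 8;			/* md_size */
	*p = 1;						/* md_active_active */

	/* Decode the scalar fields back and verify the byte order held. */
	p = buf + 16 + 40 + 16;
	version = le32dec(p); p += 4;
	sectorsize = le32dec(p); p += 4;
	size = le64dec(p);
	assert(version == 1 && sectorsize == 512 && size == (1ULL << 30));
	printf("round-trip ok: v%u, %u-byte sectors, %ju bytes\n",
	    version, sectorsize, (uintmax_t)size);
	return (0);
}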
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, nop, CTLFLAG_RW, 0, "GEOM_NOP stuff"); static u_int g_nop_debug = 0; SYSCTL_UINT(_kern_geom_nop, OID_AUTO, debug, CTLFLAG_RW, &g_nop_debug, 0, "Debug level"); static int g_nop_destroy(struct g_geom *gp, boolean_t force); static int g_nop_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_nop_config(struct gctl_req *req, struct g_class *mp, const char *verb); static void g_nop_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); struct g_class g_nop_class = { .name = G_NOP_CLASS_NAME, .version = G_VERSION, .ctlreq = g_nop_config, .destroy_geom = g_nop_destroy_geom }; static void g_nop_orphan(struct g_consumer *cp) { g_topology_assert(); g_nop_destroy(cp->geom, 1); } static void g_nop_resize(struct g_consumer *cp) { struct g_nop_softc *sc; struct g_geom *gp; struct g_provider *pp; off_t size; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc->sc_explicitsize != 0) return; if (cp->provider->mediasize < sc->sc_offset) { g_nop_destroy(gp, 1); return; } size = cp->provider->mediasize - sc->sc_offset; LIST_FOREACH(pp, &gp->provider, provider) g_resize_provider(pp, size); } static void g_nop_start(struct bio *bp) { struct g_nop_softc *sc; struct g_geom *gp; struct g_provider *pp; struct bio *cbp; u_int failprob = 0; gp = bp->bio_to->geom; sc = gp->softc; G_NOP_LOGREQ(bp, "Request received."); mtx_lock(&sc->sc_lock); switch (bp->bio_cmd) { case BIO_READ: sc->sc_reads++; sc->sc_readbytes += bp->bio_length; failprob = sc->sc_rfailprob; break; case BIO_WRITE: sc->sc_writes++; sc->sc_wrotebytes += bp->bio_length; failprob = sc->sc_wfailprob; break; case BIO_DELETE: sc->sc_deletes++; break; case BIO_GETATTR: sc->sc_getattrs++; break; case BIO_FLUSH: sc->sc_flushes++; break; case BIO_CMD0: sc->sc_cmd0s++; break; case BIO_CMD1: sc->sc_cmd1s++; break; case BIO_CMD2: sc->sc_cmd2s++; break; } mtx_unlock(&sc->sc_lock); if (failprob > 0) { u_int rval; rval = arc4random() % 100; if (rval < failprob) { G_NOP_LOGREQLVL(1, bp, "Returning error=%d.", sc->sc_error); g_io_deliver(bp, sc->sc_error); return; } } cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; cbp->bio_offset = bp->bio_offset + sc->sc_offset; pp = LIST_FIRST(&gp->provider); KASSERT(pp != NULL, ("NULL pp")); cbp->bio_to = pp; G_NOP_LOGREQ(cbp, "Sending request."); g_io_request(cbp, LIST_FIRST(&gp->consumer)); } static int g_nop_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; int error; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); error = g_access(cp, dr, dw, de); return (error); } static int g_nop_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, int ioerror, u_int rfailprob, u_int wfailprob, off_t offset, 
off_t size, u_int secsize, u_int stripesize, u_int stripeoffset) { struct g_nop_softc *sc; struct g_geom *gp; struct g_provider *newpp; struct g_consumer *cp; char name[64]; int error; off_t explicitsize; g_topology_assert(); gp = NULL; newpp = NULL; cp = NULL; if ((offset % pp->sectorsize) != 0) { gctl_error(req, "Invalid offset for provider %s.", pp->name); return (EINVAL); } if ((size % pp->sectorsize) != 0) { gctl_error(req, "Invalid size for provider %s.", pp->name); return (EINVAL); } if (offset >= pp->mediasize) { gctl_error(req, "Invalid offset for provider %s.", pp->name); return (EINVAL); } explicitsize = size; if (size == 0) size = pp->mediasize - offset; if (offset + size > pp->mediasize) { gctl_error(req, "Invalid size for provider %s.", pp->name); return (EINVAL); } if (secsize == 0) secsize = pp->sectorsize; else if ((secsize % pp->sectorsize) != 0) { gctl_error(req, "Invalid secsize for provider %s.", pp->name); return (EINVAL); } if (secsize > MAXPHYS) { gctl_error(req, "secsize is too big."); return (EINVAL); } size -= size % secsize; if ((stripesize % pp->sectorsize) != 0) { gctl_error(req, "Invalid stripesize for provider %s.", pp->name); return (EINVAL); } if ((stripeoffset % pp->sectorsize) != 0) { gctl_error(req, "Invalid stripeoffset for provider %s.", pp->name); return (EINVAL); } if (stripesize != 0 && stripeoffset >= stripesize) { gctl_error(req, "stripeoffset is too big."); return (EINVAL); } snprintf(name, sizeof(name), "%s%s", pp->name, G_NOP_SUFFIX); LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) { gctl_error(req, "Provider %s already exists.", name); return (EEXIST); } } gp = g_new_geomf(mp, "%s", name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); sc->sc_offset = offset; sc->sc_explicitsize = explicitsize; sc->sc_stripesize = stripesize; sc->sc_stripeoffset = stripeoffset; sc->sc_error = ioerror; sc->sc_rfailprob = rfailprob; sc->sc_wfailprob = wfailprob; sc->sc_reads = 0; sc->sc_writes = 0; sc->sc_deletes = 0; sc->sc_getattrs = 0; sc->sc_flushes = 0; sc->sc_cmd0s = 0; sc->sc_cmd1s = 0; sc->sc_cmd2s = 0; sc->sc_readbytes = 0; sc->sc_wrotebytes = 0; mtx_init(&sc->sc_lock, "gnop lock", NULL, MTX_DEF); gp->softc = sc; gp->start = g_nop_start; gp->orphan = g_nop_orphan; gp->resize = g_nop_resize; gp->access = g_nop_access; gp->dumpconf = g_nop_dumpconf; newpp = g_new_providerf(gp, "%s", gp->name); newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; newpp->mediasize = size; newpp->sectorsize = secsize; newpp->stripesize = stripesize; newpp->stripeoffset = stripeoffset; cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach to provider %s.", pp->name); goto fail; } newpp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED; g_error_provider(newpp, 0); G_NOP_DEBUG(0, "Device %s created.", gp->name); return (0); fail: if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); g_destroy_provider(newpp); mtx_destroy(&sc->sc_lock); g_free(gp->softc); g_destroy_geom(gp); return (error); } static int g_nop_destroy(struct g_geom *gp, boolean_t force) { struct g_nop_softc *sc; struct g_provider *pp; g_topology_assert(); sc = gp->softc; if (sc == NULL) return (ENXIO); pp = LIST_FIRST(&gp->provider); if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_NOP_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_NOP_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, 
pp->acw, pp->ace); return (EBUSY); } } else { G_NOP_DEBUG(0, "Device %s removed.", gp->name); } gp->softc = NULL; mtx_destroy(&sc->sc_lock); g_free(sc); g_wither_geom(gp, ENXIO); return (0); } static int g_nop_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_nop_destroy(gp, 0)); } static void g_nop_ctl_create(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; intmax_t *error, *rfailprob, *wfailprob, *offset, *secsize, *size, *stripesize, *stripeoffset; const char *name; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } error = gctl_get_paraml(req, "error", sizeof(*error)); if (error == NULL) { gctl_error(req, "No '%s' argument", "error"); return; } rfailprob = gctl_get_paraml(req, "rfailprob", sizeof(*rfailprob)); if (rfailprob == NULL) { gctl_error(req, "No '%s' argument", "rfailprob"); return; } if (*rfailprob < -1 || *rfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "rfailprob"); return; } wfailprob = gctl_get_paraml(req, "wfailprob", sizeof(*wfailprob)); if (wfailprob == NULL) { gctl_error(req, "No '%s' argument", "wfailprob"); return; } if (*wfailprob < -1 || *wfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "wfailprob"); return; } offset = gctl_get_paraml(req, "offset", sizeof(*offset)); if (offset == NULL) { gctl_error(req, "No '%s' argument", "offset"); return; } if (*offset < 0) { gctl_error(req, "Invalid '%s' argument", "offset"); return; } size = gctl_get_paraml(req, "size", sizeof(*size)); if (size == NULL) { gctl_error(req, "No '%s' argument", "size"); return; } if (*size < 0) { gctl_error(req, "Invalid '%s' argument", "size"); return; } secsize = gctl_get_paraml(req, "secsize", sizeof(*secsize)); if (secsize == NULL) { gctl_error(req, "No '%s' argument", "secsize"); return; } if (*secsize < 0) { gctl_error(req, "Invalid '%s' argument", "secsize"); return; } stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize)); if (stripesize == NULL) { gctl_error(req, "No '%s' argument", "stripesize"); return; } if (*stripesize < 0) { gctl_error(req, "Invalid '%s' argument", "stripesize"); return; } stripeoffset = gctl_get_paraml(req, "stripeoffset", sizeof(*stripeoffset)); if (stripeoffset == NULL) { gctl_error(req, "No '%s' argument", "stripeoffset"); return; } if (*stripeoffset < 0) { gctl_error(req, "Invalid '%s' argument", "stripeoffset"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_NOP_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } if (g_nop_create(req, mp, pp, *error == -1 ? EIO : (int)*error, *rfailprob == -1 ? 0 : (u_int)*rfailprob, *wfailprob == -1 ? 
0 : (u_int)*wfailprob, (off_t)*offset, (off_t)*size, (u_int)*secsize, (u_int)*stripesize, (u_int)*stripeoffset) != 0) { return; } } } static void g_nop_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_nop_softc *sc; struct g_provider *pp; intmax_t *error, *rfailprob, *wfailprob; const char *name; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } error = gctl_get_paraml(req, "error", sizeof(*error)); if (error == NULL) { gctl_error(req, "No '%s' argument", "error"); return; } rfailprob = gctl_get_paraml(req, "rfailprob", sizeof(*rfailprob)); if (rfailprob == NULL) { gctl_error(req, "No '%s' argument", "rfailprob"); return; } if (*rfailprob < -1 || *rfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "rfailprob"); return; } wfailprob = gctl_get_paraml(req, "wfailprob", sizeof(*wfailprob)); if (wfailprob == NULL) { gctl_error(req, "No '%s' argument", "wfailprob"); return; } if (*wfailprob < -1 || *wfailprob > 100) { gctl_error(req, "Invalid '%s' argument", "wfailprob"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL || pp->geom->class != mp) { G_NOP_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } sc = pp->geom->softc; if (*error != -1) sc->sc_error = (int)*error; if (*rfailprob != -1) sc->sc_rfailprob = (u_int)*rfailprob; if (*wfailprob != -1) sc->sc_wfailprob = (u_int)*wfailprob; } } static struct g_geom * g_nop_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_nop_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int *nargs, *force, error, i; struct g_geom *gp; const char *name; char param[16]; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); gp = g_nop_find_geom(mp, name); if (gp == NULL) { G_NOP_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); return; } error = g_nop_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", gp->name, error); return; } } } static void g_nop_ctl_reset(struct gctl_req *req, struct g_class *mp) { struct g_nop_softc *sc; struct g_provider *pp; const char *name; char param[16]; int i, *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { 
snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL || pp->geom->class != mp) { G_NOP_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); return; } sc = pp->geom->softc; sc->sc_reads = 0; sc->sc_writes = 0; sc->sc_deletes = 0; sc->sc_getattrs = 0; sc->sc_flushes = 0; sc->sc_cmd0s = 0; sc->sc_cmd1s = 0; sc->sc_cmd2s = 0; sc->sc_readbytes = 0; sc->sc_wrotebytes = 0; } } static void g_nop_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_NOP_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_nop_ctl_create(req, mp); return; } else if (strcmp(verb, "configure") == 0) { g_nop_ctl_configure(req, mp); return; } else if (strcmp(verb, "destroy") == 0) { g_nop_ctl_destroy(req, mp); return; } else if (strcmp(verb, "reset") == 0) { g_nop_ctl_reset(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_nop_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_nop_softc *sc; if (pp != NULL || cp != NULL) return; sc = gp->softc; sbuf_printf(sb, "%s<Offset>%jd</Offset>\n", indent, (intmax_t)sc->sc_offset); sbuf_printf(sb, "%s<ReadFailProb>%u</ReadFailProb>\n", indent, sc->sc_rfailprob); sbuf_printf(sb, "%s<WriteFailProb>%u</WriteFailProb>\n", indent, sc->sc_wfailprob); sbuf_printf(sb, "%s<Error>%d</Error>\n", indent, sc->sc_error); sbuf_printf(sb, "%s<Reads>%ju</Reads>\n", indent, sc->sc_reads); sbuf_printf(sb, "%s<Writes>%ju</Writes>\n", indent, sc->sc_writes); sbuf_printf(sb, "%s<Deletes>%ju</Deletes>\n", indent, sc->sc_deletes); sbuf_printf(sb, "%s<Getattrs>%ju</Getattrs>\n", indent, sc->sc_getattrs); sbuf_printf(sb, "%s<Flushes>%ju</Flushes>\n", indent, sc->sc_flushes); sbuf_printf(sb, "%s<Cmd0s>%ju</Cmd0s>\n", indent, sc->sc_cmd0s); sbuf_printf(sb, "%s<Cmd1s>%ju</Cmd1s>\n", indent, sc->sc_cmd1s); sbuf_printf(sb, "%s<Cmd2s>%ju</Cmd2s>\n", indent, sc->sc_cmd2s); sbuf_printf(sb, "%s<ReadBytes>%ju</ReadBytes>\n", indent, sc->sc_readbytes); sbuf_printf(sb, "%s<WroteBytes>%ju</WroteBytes>\n", indent, sc->sc_wrotebytes); } DECLARE_GEOM_CLASS(g_nop_class, g_nop); Index: head/sys/geom/nop/g_nop.h =================================================================== --- head/sys/geom/nop/g_nop.h (revision 326269) +++ head/sys/geom/nop/g_nop.h (revision 326270) @@ -1,80 +1,82 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
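On the failure-injection knobs in g_nop.c above: g_nop_start() fails a read or write with sc_error whenever arc4random() % 100 falls below the configured probability, which is what gnop(8)'s read/write failure options configure. A self-contained sketch of that gate, runnable in FreeBSD userland where arc4random() is also available; the trial count and the 25% probability are arbitrary choices for illustration:

/* Standalone sketch of the g_nop failure gate: over many trials,
 * roughly `failprob` percent of requests should be failed. */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	unsigned failprob = 25;		/* e.g. a 25% read-failure setting */
	unsigned trials = 1000000, failed = 0, i;

	for (i = 0; i < trials; i++) {
		if (arc4random() % 100 < failprob)	/* same test as g_nop_start() */
			failed++;
	}
	printf("failed %u of %u (expected ~%u%%)\n", failed, trials, failprob);
	return (0);
}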
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_NOP_H_ #define _G_NOP_H_ #define G_NOP_CLASS_NAME "NOP" #define G_NOP_VERSION 4 #define G_NOP_SUFFIX ".nop" #ifdef _KERNEL #define G_NOP_DEBUG(lvl, ...) do { \ if (g_nop_debug >= (lvl)) { \ printf("GEOM_NOP"); \ if (g_nop_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_NOP_LOGREQ(bp, ...) G_NOP_LOGREQLVL(2, bp, __VA_ARGS__) #define G_NOP_LOGREQLVL(lvl, bp, ...) do { \ if (g_nop_debug >= (lvl)) { \ printf("GEOM_NOP[%d]: ", (lvl)); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) struct g_nop_softc { int sc_error; off_t sc_offset; off_t sc_explicitsize; off_t sc_stripesize; off_t sc_stripeoffset; u_int sc_rfailprob; u_int sc_wfailprob; uintmax_t sc_reads; uintmax_t sc_writes; uintmax_t sc_deletes; uintmax_t sc_getattrs; uintmax_t sc_flushes; uintmax_t sc_cmd0s; uintmax_t sc_cmd1s; uintmax_t sc_cmd2s; uintmax_t sc_readbytes; uintmax_t sc_wrotebytes; struct mtx sc_lock; }; #endif /* _KERNEL */ #endif /* _G_NOP_H_ */ Index: head/sys/geom/part/g_part.c =================================================================== --- head/sys/geom/part/g_part.c (revision 326269) +++ head/sys/geom/part/g_part.c (revision 326270) @@ -1,2371 +1,2373 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002, 2005-2009 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
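The G_NOP_DEBUG() macro defined above gates kernel printf()s on the kern.geom.nop.debug sysctl and prefixes the message with the level. A userland approximation of the same pattern, with the level read from an invented NOP_DEBUG environment variable standing in for the sysctl:

/* Userland approximation of the level-gated G_NOP_DEBUG() macro above. */
#include <stdio.h>
#include <stdlib.h>

static unsigned debug_level;	/* stands in for the g_nop_debug sysctl */

#define NOP_DEBUG(lvl, ...) do {				\
	if (debug_level >= (lvl)) {				\
		printf("GEOM_NOP");				\
		if (debug_level > 0)				\
			printf("[%u]", (unsigned)(lvl));	\
		printf(": ");					\
		printf(__VA_ARGS__);				\
		printf("\n");					\
	}							\
} while (0)

int
main(void)
{
	const char *e = getenv("NOP_DEBUG");	/* invented knob */

	debug_level = (e != NULL) ? (unsigned)strtoul(e, NULL, 10) : 0;
	NOP_DEBUG(0, "Device %s created.", "md0.nop");
	NOP_DEBUG(1, "Device %s is still open (r%dw%de%d).", "md0.nop", 1, 0, 0);
	return (0);
}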
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" #ifndef _PATH_DEV #define _PATH_DEV "/dev/" #endif static kobj_method_t g_part_null_methods[] = { { 0, 0 } }; static struct g_part_scheme g_part_null_scheme = { "(none)", g_part_null_methods, sizeof(struct g_part_table), }; TAILQ_HEAD(, g_part_scheme) g_part_schemes = TAILQ_HEAD_INITIALIZER(g_part_schemes); struct g_part_alias_list { const char *lexeme; enum g_part_alias alias; } g_part_alias_list[G_PART_ALIAS_COUNT] = { { "apple-boot", G_PART_ALIAS_APPLE_BOOT }, { "apple-core-storage", G_PART_ALIAS_APPLE_CORE_STORAGE }, { "apple-hfs", G_PART_ALIAS_APPLE_HFS }, { "apple-label", G_PART_ALIAS_APPLE_LABEL }, { "apple-raid", G_PART_ALIAS_APPLE_RAID }, { "apple-raid-offline", G_PART_ALIAS_APPLE_RAID_OFFLINE }, { "apple-tv-recovery", G_PART_ALIAS_APPLE_TV_RECOVERY }, { "apple-ufs", G_PART_ALIAS_APPLE_UFS }, { "bios-boot", G_PART_ALIAS_BIOS_BOOT }, { "chromeos-firmware", G_PART_ALIAS_CHROMEOS_FIRMWARE }, { "chromeos-kernel", G_PART_ALIAS_CHROMEOS_KERNEL }, { "chromeos-reserved", G_PART_ALIAS_CHROMEOS_RESERVED }, { "chromeos-root", G_PART_ALIAS_CHROMEOS_ROOT }, { "dragonfly-ccd", G_PART_ALIAS_DFBSD_CCD }, { "dragonfly-hammer", G_PART_ALIAS_DFBSD_HAMMER }, { "dragonfly-hammer2", G_PART_ALIAS_DFBSD_HAMMER2 }, { "dragonfly-label32", G_PART_ALIAS_DFBSD }, { "dragonfly-label64", G_PART_ALIAS_DFBSD64 }, { "dragonfly-legacy", G_PART_ALIAS_DFBSD_LEGACY }, { "dragonfly-swap", G_PART_ALIAS_DFBSD_SWAP }, { "dragonfly-ufs", G_PART_ALIAS_DFBSD_UFS }, { "dragonfly-vinum", G_PART_ALIAS_DFBSD_VINUM }, { "ebr", G_PART_ALIAS_EBR }, { "efi", G_PART_ALIAS_EFI }, { "fat16", G_PART_ALIAS_MS_FAT16 }, { "fat32", G_PART_ALIAS_MS_FAT32 }, { "freebsd", G_PART_ALIAS_FREEBSD }, { "freebsd-boot", G_PART_ALIAS_FREEBSD_BOOT }, { "freebsd-nandfs", G_PART_ALIAS_FREEBSD_NANDFS }, { "freebsd-swap", G_PART_ALIAS_FREEBSD_SWAP }, { "freebsd-ufs", G_PART_ALIAS_FREEBSD_UFS }, { "freebsd-vinum", G_PART_ALIAS_FREEBSD_VINUM }, { "freebsd-zfs", G_PART_ALIAS_FREEBSD_ZFS }, { "linux-data", G_PART_ALIAS_LINUX_DATA }, { "linux-lvm", G_PART_ALIAS_LINUX_LVM }, { "linux-raid", G_PART_ALIAS_LINUX_RAID }, { "linux-swap", G_PART_ALIAS_LINUX_SWAP }, { "mbr", G_PART_ALIAS_MBR }, { "ms-basic-data", G_PART_ALIAS_MS_BASIC_DATA }, { "ms-ldm-data", G_PART_ALIAS_MS_LDM_DATA }, { "ms-ldm-metadata", G_PART_ALIAS_MS_LDM_METADATA }, { "ms-recovery", G_PART_ALIAS_MS_RECOVERY }, { "ms-reserved", G_PART_ALIAS_MS_RESERVED }, { "ms-spaces", G_PART_ALIAS_MS_SPACES }, { "netbsd-ccd", G_PART_ALIAS_NETBSD_CCD }, { "netbsd-cgd", G_PART_ALIAS_NETBSD_CGD }, { "netbsd-ffs", G_PART_ALIAS_NETBSD_FFS }, { "netbsd-lfs", G_PART_ALIAS_NETBSD_LFS }, { "netbsd-raid", G_PART_ALIAS_NETBSD_RAID }, { "netbsd-swap", G_PART_ALIAS_NETBSD_SWAP }, { "ntfs", G_PART_ALIAS_MS_NTFS }, { "openbsd-data", G_PART_ALIAS_OPENBSD_DATA }, { "prep-boot", G_PART_ALIAS_PREP_BOOT }, { "vmware-reserved", G_PART_ALIAS_VMRESERVED }, { "vmware-vmfs", G_PART_ALIAS_VMFS }, { "vmware-vmkdiag", G_PART_ALIAS_VMKDIAG }, { "vmware-vsanhdr", G_PART_ALIAS_VMVSANHDR }, }; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, part, CTLFLAG_RW, 0, "GEOM_PART stuff"); static u_int check_integrity = 1; SYSCTL_UINT(_kern_geom_part, OID_AUTO, check_integrity, CTLFLAG_RWTUN, &check_integrity, 1, "Enable integrity checking"); static u_int auto_resize = 1; SYSCTL_UINT(_kern_geom_part, OID_AUTO, auto_resize, 
CTLFLAG_RWTUN, &auto_resize, 1, "Enable auto resize"); /* * The GEOM partitioning class. */ static g_ctl_req_t g_part_ctlreq; static g_ctl_destroy_geom_t g_part_destroy_geom; static g_fini_t g_part_fini; static g_init_t g_part_init; static g_taste_t g_part_taste; static g_access_t g_part_access; static g_dumpconf_t g_part_dumpconf; static g_orphan_t g_part_orphan; static g_spoiled_t g_part_spoiled; static g_start_t g_part_start; static g_resize_t g_part_resize; static g_ioctl_t g_part_ioctl; static struct g_class g_part_class = { .name = "PART", .version = G_VERSION, /* Class methods. */ .ctlreq = g_part_ctlreq, .destroy_geom = g_part_destroy_geom, .fini = g_part_fini, .init = g_part_init, .taste = g_part_taste, /* Geom methods. */ .access = g_part_access, .dumpconf = g_part_dumpconf, .orphan = g_part_orphan, .spoiled = g_part_spoiled, .start = g_part_start, .resize = g_part_resize, .ioctl = g_part_ioctl, }; DECLARE_GEOM_CLASS(g_part_class, g_part); MODULE_VERSION(g_part, 0); /* * Support functions. */ static void g_part_wither(struct g_geom *, int); const char * g_part_alias_name(enum g_part_alias alias) { int i; for (i = 0; i < G_PART_ALIAS_COUNT; i++) { if (g_part_alias_list[i].alias != alias) continue; return (g_part_alias_list[i].lexeme); } return (NULL); } void g_part_geometry_heads(off_t blocks, u_int sectors, off_t *bestchs, u_int *bestheads) { static u_int candidate_heads[] = { 1, 2, 16, 32, 64, 128, 255, 0 }; off_t chs, cylinders; u_int heads; int idx; *bestchs = 0; *bestheads = 0; for (idx = 0; candidate_heads[idx] != 0; idx++) { heads = candidate_heads[idx]; cylinders = blocks / heads / sectors; if (cylinders < heads || cylinders < sectors) break; if (cylinders > 1023) continue; chs = cylinders * heads * sectors; if (chs > *bestchs || (chs == *bestchs && *bestheads == 1)) { *bestchs = chs; *bestheads = heads; } } } static void g_part_geometry(struct g_part_table *table, struct g_consumer *cp, off_t blocks) { static u_int candidate_sectors[] = { 1, 9, 17, 33, 63, 0 }; off_t chs, bestchs; u_int heads, sectors; int idx; if (g_getattr("GEOM::fwsectors", cp, &sectors) != 0 || sectors == 0 || g_getattr("GEOM::fwheads", cp, &heads) != 0 || heads == 0) { table->gpt_fixgeom = 0; table->gpt_heads = 0; table->gpt_sectors = 0; bestchs = 0; for (idx = 0; candidate_sectors[idx] != 0; idx++) { sectors = candidate_sectors[idx]; g_part_geometry_heads(blocks, sectors, &chs, &heads); if (chs == 0) continue; /* * Prefer a geometry with sectors > 1, but only if * it doesn't bump down the number of heads to 1. */ if (chs > bestchs || (chs == bestchs && heads > 1 && table->gpt_sectors == 1)) { bestchs = chs; table->gpt_heads = heads; table->gpt_sectors = sectors; } } /* * If we didn't find a geometry at all, then the disk is * too big. This means we can use the maximum number of * heads and sectors. */ if (bestchs == 0) { table->gpt_heads = 255; table->gpt_sectors = 63; } } else { table->gpt_fixgeom = 1; table->gpt_heads = heads; table->gpt_sectors = sectors; } } #define DPRINTF(...)
if (bootverbose) { \ printf("GEOM_PART: " __VA_ARGS__); \ } static int g_part_check_integrity(struct g_part_table *table, struct g_consumer *cp) { struct g_part_entry *e1, *e2; struct g_provider *pp; off_t offset; int failed; failed = 0; pp = cp->provider; if (table->gpt_last < table->gpt_first) { DPRINTF("last LBA is below first LBA: %jd < %jd\n", (intmax_t)table->gpt_last, (intmax_t)table->gpt_first); failed++; } if (table->gpt_last > pp->mediasize / pp->sectorsize - 1) { DPRINTF("last LBA extends beyond mediasize: " "%jd > %jd\n", (intmax_t)table->gpt_last, (intmax_t)pp->mediasize / pp->sectorsize - 1); failed++; } LIST_FOREACH(e1, &table->gpt_entry, gpe_entry) { if (e1->gpe_deleted || e1->gpe_internal) continue; if (e1->gpe_start < table->gpt_first) { DPRINTF("partition %d has start offset below first " "LBA: %jd < %jd\n", e1->gpe_index, (intmax_t)e1->gpe_start, (intmax_t)table->gpt_first); failed++; } if (e1->gpe_start > table->gpt_last) { DPRINTF("partition %d has start offset beyond last " "LBA: %jd > %jd\n", e1->gpe_index, (intmax_t)e1->gpe_start, (intmax_t)table->gpt_last); failed++; } if (e1->gpe_end < e1->gpe_start) { DPRINTF("partition %d has end offset below start " "offset: %jd < %jd\n", e1->gpe_index, (intmax_t)e1->gpe_end, (intmax_t)e1->gpe_start); failed++; } if (e1->gpe_end > table->gpt_last) { DPRINTF("partition %d has end offset beyond last " "LBA: %jd > %jd\n", e1->gpe_index, (intmax_t)e1->gpe_end, (intmax_t)table->gpt_last); failed++; } if (pp->stripesize > 0) { offset = e1->gpe_start * pp->sectorsize; if (e1->gpe_offset > offset) offset = e1->gpe_offset; if ((offset + pp->stripeoffset) % pp->stripesize) { DPRINTF("partition %d on (%s, %s) is not " "aligned on %u bytes\n", e1->gpe_index, pp->name, table->gpt_scheme->name, pp->stripesize); /* Don't treat this as a critical failure */ } } e2 = e1; while ((e2 = LIST_NEXT(e2, gpe_entry)) != NULL) { if (e2->gpe_deleted || e2->gpe_internal) continue; if (e1->gpe_start >= e2->gpe_start && e1->gpe_start <= e2->gpe_end) { DPRINTF("partition %d has start offset inside " "partition %d: start[%d] %jd >= start[%d] " "%jd <= end[%d] %jd\n", e1->gpe_index, e2->gpe_index, e2->gpe_index, (intmax_t)e2->gpe_start, e1->gpe_index, (intmax_t)e1->gpe_start, e2->gpe_index, (intmax_t)e2->gpe_end); failed++; } if (e1->gpe_end >= e2->gpe_start && e1->gpe_end <= e2->gpe_end) { DPRINTF("partition %d has end offset inside " "partition %d: start[%d] %jd >= end[%d] " "%jd <= end[%d] %jd\n", e1->gpe_index, e2->gpe_index, e2->gpe_index, (intmax_t)e2->gpe_start, e1->gpe_index, (intmax_t)e1->gpe_end, e2->gpe_index, (intmax_t)e2->gpe_end); failed++; } if (e1->gpe_start < e2->gpe_start && e1->gpe_end > e2->gpe_end) { DPRINTF("partition %d contains partition %d: " "start[%d] %jd > start[%d] %jd, end[%d] " "%jd < end[%d] %jd\n", e1->gpe_index, e2->gpe_index, e1->gpe_index, (intmax_t)e1->gpe_start, e2->gpe_index, (intmax_t)e2->gpe_start, e2->gpe_index, (intmax_t)e2->gpe_end, e1->gpe_index, (intmax_t)e1->gpe_end); failed++; } } } if (failed != 0) { printf("GEOM_PART: integrity check failed (%s, %s)\n", pp->name, table->gpt_scheme->name); if (check_integrity != 0) return (EINVAL); table->gpt_corrupt = 1; } return (0); } #undef DPRINTF struct g_part_entry * g_part_new_entry(struct g_part_table *table, int index, quad_t start, quad_t end) { struct g_part_entry *entry, *last; last = NULL; LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_index == index) break; if (entry->gpe_index > index) { entry = NULL; break; } last = entry; } if (entry == 
NULL) { entry = g_malloc(table->gpt_scheme->gps_entrysz, M_WAITOK | M_ZERO); entry->gpe_index = index; if (last == NULL) LIST_INSERT_HEAD(&table->gpt_entry, entry, gpe_entry); else LIST_INSERT_AFTER(last, entry, gpe_entry); } else entry->gpe_offset = 0; entry->gpe_start = start; entry->gpe_end = end; return (entry); } static void g_part_new_provider(struct g_geom *gp, struct g_part_table *table, struct g_part_entry *entry) { struct g_consumer *cp; struct g_provider *pp; struct sbuf *sb; struct g_geom_alias *gap; off_t offset; cp = LIST_FIRST(&gp->consumer); pp = cp->provider; offset = entry->gpe_start * pp->sectorsize; if (entry->gpe_offset < offset) entry->gpe_offset = offset; if (entry->gpe_pp == NULL) { /* * Add aliases to the geom before we create the provider so that * geom_dev can taste it with all the aliases in place so all * the aliased dev_t instances get created for each partition * (eg foo5p7 gets created for bar5p7 when foo is an alias of bar). */ LIST_FOREACH(gap, &table->gpt_gp->aliases, ga_next) { sb = sbuf_new_auto(); G_PART_FULLNAME(table, entry, sb, gap->ga_alias); sbuf_finish(sb); g_geom_add_alias(gp, sbuf_data(sb)); sbuf_delete(sb); } sb = sbuf_new_auto(); G_PART_FULLNAME(table, entry, sb, gp->name); sbuf_finish(sb); entry->gpe_pp = g_new_providerf(gp, "%s", sbuf_data(sb)); sbuf_delete(sb); entry->gpe_pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; entry->gpe_pp->private = entry; /* Close the circle. */ } entry->gpe_pp->index = entry->gpe_index - 1; /* index is 1-based. */ entry->gpe_pp->mediasize = (entry->gpe_end - entry->gpe_start + 1) * pp->sectorsize; entry->gpe_pp->mediasize -= entry->gpe_offset - offset; entry->gpe_pp->sectorsize = pp->sectorsize; entry->gpe_pp->stripesize = pp->stripesize; entry->gpe_pp->stripeoffset = pp->stripeoffset + entry->gpe_offset; if (pp->stripesize > 0) entry->gpe_pp->stripeoffset %= pp->stripesize; entry->gpe_pp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED; g_error_provider(entry->gpe_pp, 0); } static struct g_geom* g_part_find_geom(const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &g_part_class.geom, geom) { if ((gp->flags & G_GEOM_WITHER) == 0 && strcmp(name, gp->name) == 0) break; } return (gp); } static int g_part_parm_geom(struct gctl_req *req, const char *name, struct g_geom **v) { struct g_geom *gp; const char *gname; gname = gctl_get_asciiparam(req, name); if (gname == NULL) return (ENOATTR); if (strncmp(gname, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) gname += sizeof(_PATH_DEV) - 1; gp = g_part_find_geom(gname); if (gp == NULL) { gctl_error(req, "%d %s '%s'", EINVAL, name, gname); return (EINVAL); } *v = gp; return (0); } static int g_part_parm_provider(struct gctl_req *req, const char *name, struct g_provider **v) { struct g_provider *pp; const char *pname; pname = gctl_get_asciiparam(req, name); if (pname == NULL) return (ENOATTR); if (strncmp(pname, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) pname += sizeof(_PATH_DEV) - 1; pp = g_provider_by_name(pname); if (pp == NULL) { gctl_error(req, "%d %s '%s'", EINVAL, name, pname); return (EINVAL); } *v = pp; return (0); } static int g_part_parm_quad(struct gctl_req *req, const char *name, quad_t *v) { const char *p; char *x; quad_t q; p = gctl_get_asciiparam(req, name); if (p == NULL) return (ENOATTR); q = strtoq(p, &x, 0); if (*x != '\0' || q < 0) { gctl_error(req, "%d %s '%s'", EINVAL, name, p); return (EINVAL); } *v = q; return (0); } static int g_part_parm_scheme(struct gctl_req *req, const char *name, struct g_part_scheme **v) { struct g_part_scheme *s; const char *p; p 
= gctl_get_asciiparam(req, name); if (p == NULL) return (ENOATTR); TAILQ_FOREACH(s, &g_part_schemes, scheme_list) { if (s == &g_part_null_scheme) continue; if (!strcasecmp(s->name, p)) break; } if (s == NULL) { gctl_error(req, "%d %s '%s'", EINVAL, name, p); return (EINVAL); } *v = s; return (0); } static int g_part_parm_str(struct gctl_req *req, const char *name, const char **v) { const char *p; p = gctl_get_asciiparam(req, name); if (p == NULL) return (ENOATTR); /* An empty label is always valid. */ if (strcmp(name, "label") != 0 && p[0] == '\0') { gctl_error(req, "%d %s '%s'", EINVAL, name, p); return (EINVAL); } *v = p; return (0); } static int g_part_parm_intmax(struct gctl_req *req, const char *name, u_int *v) { const intmax_t *p; int size; p = gctl_get_param(req, name, &size); if (p == NULL) return (ENOATTR); if (size != sizeof(*p) || *p < 0 || *p > INT_MAX) { gctl_error(req, "%d %s '%jd'", EINVAL, name, *p); return (EINVAL); } *v = (u_int)*p; return (0); } static int g_part_parm_uint32(struct gctl_req *req, const char *name, u_int *v) { const uint32_t *p; int size; p = gctl_get_param(req, name, &size); if (p == NULL) return (ENOATTR); if (size != sizeof(*p) || *p > INT_MAX) { gctl_error(req, "%d %s '%u'", EINVAL, name, (unsigned int)*p); return (EINVAL); } *v = (u_int)*p; return (0); } static int g_part_parm_bootcode(struct gctl_req *req, const char *name, const void **v, unsigned int *s) { const void *p; int size; p = gctl_get_param(req, name, &size); if (p == NULL) return (ENOATTR); *v = p; *s = size; return (0); } static int g_part_probe(struct g_geom *gp, struct g_consumer *cp, int depth) { struct g_part_scheme *iter, *scheme; struct g_part_table *table; int pri, probe; table = gp->softc; scheme = (table != NULL) ? table->gpt_scheme : NULL; pri = (scheme != NULL) ? G_PART_PROBE(table, cp) : INT_MIN; if (pri == 0) goto done; if (pri > 0) { /* error */ scheme = NULL; pri = INT_MIN; } TAILQ_FOREACH(iter, &g_part_schemes, scheme_list) { if (iter == &g_part_null_scheme) continue; table = (void *)kobj_create((kobj_class_t)iter, M_GEOM, M_WAITOK); table->gpt_gp = gp; table->gpt_scheme = iter; table->gpt_depth = depth; probe = G_PART_PROBE(table, cp); if (probe <= 0 && probe > pri) { pri = probe; scheme = iter; if (gp->softc != NULL) kobj_delete((kobj_t)gp->softc, M_GEOM); gp->softc = table; if (pri == 0) goto done; } else kobj_delete((kobj_t)table, M_GEOM); } done: return ((scheme == NULL) ? ENXIO : 0); } /* * Control request functions. */ static int g_part_ctl_add(struct gctl_req *req, struct g_part_parms *gpp) { struct g_geom *gp; struct g_provider *pp; struct g_part_entry *delent, *last, *entry; struct g_part_table *table; struct sbuf *sb; quad_t end; unsigned int index; int error; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); pp = LIST_FIRST(&gp->consumer)->provider; table = gp->softc; end = gpp->gpp_start + gpp->gpp_size - 1; if (gpp->gpp_start < table->gpt_first || gpp->gpp_start > table->gpt_last) { gctl_error(req, "%d start '%jd'", EINVAL, (intmax_t)gpp->gpp_start); return (EINVAL); } if (end < gpp->gpp_start || end > table->gpt_last) { gctl_error(req, "%d size '%jd'", EINVAL, (intmax_t)gpp->gpp_size); return (EINVAL); } if (gpp->gpp_index > table->gpt_entries) { gctl_error(req, "%d index '%d'", EINVAL, gpp->gpp_index); return (EINVAL); } delent = last = NULL; index = (gpp->gpp_index > 0) ? 
gpp->gpp_index : 1; LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted) { if (entry->gpe_index == index) delent = entry; continue; } if (entry->gpe_index == index) index = entry->gpe_index + 1; if (entry->gpe_index < index) last = entry; if (entry->gpe_internal) continue; if (gpp->gpp_start >= entry->gpe_start && gpp->gpp_start <= entry->gpe_end) { gctl_error(req, "%d start '%jd'", ENOSPC, (intmax_t)gpp->gpp_start); return (ENOSPC); } if (end >= entry->gpe_start && end <= entry->gpe_end) { gctl_error(req, "%d end '%jd'", ENOSPC, (intmax_t)end); return (ENOSPC); } if (gpp->gpp_start < entry->gpe_start && end > entry->gpe_end) { gctl_error(req, "%d size '%jd'", ENOSPC, (intmax_t)gpp->gpp_size); return (ENOSPC); } } if (gpp->gpp_index > 0 && index != gpp->gpp_index) { gctl_error(req, "%d index '%d'", EEXIST, gpp->gpp_index); return (EEXIST); } if (index > table->gpt_entries) { gctl_error(req, "%d index '%d'", ENOSPC, index); return (ENOSPC); } entry = (delent == NULL) ? g_malloc(table->gpt_scheme->gps_entrysz, M_WAITOK | M_ZERO) : delent; entry->gpe_index = index; entry->gpe_start = gpp->gpp_start; entry->gpe_end = end; error = G_PART_ADD(table, entry, gpp); if (error) { gctl_error(req, "%d", error); if (delent == NULL) g_free(entry); return (error); } if (delent == NULL) { if (last == NULL) LIST_INSERT_HEAD(&table->gpt_entry, entry, gpe_entry); else LIST_INSERT_AFTER(last, entry, gpe_entry); entry->gpe_created = 1; } else { entry->gpe_deleted = 0; entry->gpe_modified = 1; } g_part_new_provider(gp, table, entry); /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); G_PART_FULLNAME(table, entry, sb, gp->name); if (pp->stripesize > 0 && entry->gpe_pp->stripeoffset != 0) sbuf_printf(sb, " added, but partition is not " "aligned on %u bytes\n", pp->stripesize); else sbuf_cat(sb, " added\n"); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_bootcode(struct gctl_req *req, struct g_part_parms *gpp) { struct g_geom *gp; struct g_part_table *table; struct sbuf *sb; int error, sz; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; sz = table->gpt_scheme->gps_bootcodesz; if (sz == 0) { error = ENODEV; goto fail; } if (gpp->gpp_codesize > sz) { error = EFBIG; goto fail; } error = G_PART_BOOTCODE(table, gpp); if (error) goto fail; /* Provide feedback if so requested. 
*/ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); sbuf_printf(sb, "bootcode written to %s\n", gp->name); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); fail: gctl_error(req, "%d", error); return (error); } static int g_part_ctl_commit(struct gctl_req *req, struct g_part_parms *gpp) { struct g_consumer *cp; struct g_geom *gp; struct g_provider *pp; struct g_part_entry *entry, *tmp; struct g_part_table *table; char *buf; int error, i; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; if (!table->gpt_opened) { gctl_error(req, "%d", EPERM); return (EPERM); } g_topology_unlock(); cp = LIST_FIRST(&gp->consumer); if ((table->gpt_smhead | table->gpt_smtail) != 0) { pp = cp->provider; buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); while (table->gpt_smhead != 0) { i = ffs(table->gpt_smhead) - 1; error = g_write_data(cp, i * pp->sectorsize, buf, pp->sectorsize); if (error) { g_free(buf); goto fail; } table->gpt_smhead &= ~(1 << i); } while (table->gpt_smtail != 0) { i = ffs(table->gpt_smtail) - 1; error = g_write_data(cp, pp->mediasize - (i + 1) * pp->sectorsize, buf, pp->sectorsize); if (error) { g_free(buf); goto fail; } table->gpt_smtail &= ~(1 << i); } g_free(buf); } if (table->gpt_scheme == &g_part_null_scheme) { g_topology_lock(); g_access(cp, -1, -1, -1); g_part_wither(gp, ENXIO); return (0); } error = G_PART_WRITE(table, cp); if (error) goto fail; LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) { if (!entry->gpe_deleted) { /* Notify consumers that provider might be changed. */ if (entry->gpe_modified && ( entry->gpe_pp->acw + entry->gpe_pp->ace + entry->gpe_pp->acr) == 0) g_media_changed(entry->gpe_pp, M_NOWAIT); entry->gpe_created = 0; entry->gpe_modified = 0; continue; } LIST_REMOVE(entry, gpe_entry); g_free(entry); } table->gpt_created = 0; table->gpt_opened = 0; g_topology_lock(); g_access(cp, -1, -1, -1); return (0); fail: g_topology_lock(); gctl_error(req, "%d", error); return (error); } static int g_part_ctl_create(struct gctl_req *req, struct g_part_parms *gpp) { struct g_consumer *cp; struct g_geom *gp; struct g_provider *pp; struct g_part_scheme *scheme; struct g_part_table *null, *table; struct sbuf *sb; int attr, error; pp = gpp->gpp_provider; scheme = gpp->gpp_scheme; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, pp->name)); g_topology_assert(); /* Check that there isn't already a g_part geom on the provider. */ gp = g_part_find_geom(pp->name); if (gp != NULL) { null = gp->softc; if (null->gpt_scheme != &g_part_null_scheme) { gctl_error(req, "%d geom '%s'", EEXIST, pp->name); return (EEXIST); } } else null = NULL; if ((gpp->gpp_parms & G_PART_PARM_ENTRIES) && (gpp->gpp_entries < scheme->gps_minent || gpp->gpp_entries > scheme->gps_maxent)) { gctl_error(req, "%d entries '%d'", EINVAL, gpp->gpp_entries); return (EINVAL); } if (null == NULL) gp = g_new_geomf(&g_part_class, "%s", pp->name); gp->softc = kobj_create((kobj_class_t)gpp->gpp_scheme, M_GEOM, M_WAITOK); table = gp->softc; table->gpt_gp = gp; table->gpt_scheme = gpp->gpp_scheme; table->gpt_entries = (gpp->gpp_parms & G_PART_PARM_ENTRIES) ? 
gpp->gpp_entries : scheme->gps_minent; LIST_INIT(&table->gpt_entry); if (null == NULL) { cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) error = g_access(cp, 1, 1, 1); if (error != 0) { g_part_wither(gp, error); gctl_error(req, "%d geom '%s'", error, pp->name); return (error); } table->gpt_opened = 1; } else { cp = LIST_FIRST(&gp->consumer); table->gpt_opened = null->gpt_opened; table->gpt_smhead = null->gpt_smhead; table->gpt_smtail = null->gpt_smtail; } g_topology_unlock(); /* Make sure the provider has media. */ if (pp->mediasize == 0 || pp->sectorsize == 0) { error = ENODEV; goto fail; } /* Make sure we can nest and if so, determine our depth. */ error = g_getattr("PART::isleaf", cp, &attr); if (!error && attr) { error = ENODEV; goto fail; } error = g_getattr("PART::depth", cp, &attr); table->gpt_depth = (!error) ? attr + 1 : 0; /* * Synthesize a disk geometry. Some partitioning schemes * depend on it and since some file systems need it even * when the partition scheme doesn't, we do it here in * scheme-independent code. */ g_part_geometry(table, cp, pp->mediasize / pp->sectorsize); error = G_PART_CREATE(table, gpp); if (error) goto fail; g_topology_lock(); table->gpt_created = 1; if (null != NULL) kobj_delete((kobj_t)null, M_GEOM); /* * Support automatic commit by filling in the gpp_geom * parameter. */ gpp->gpp_parms |= G_PART_PARM_GEOM; gpp->gpp_geom = gp; /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); sbuf_printf(sb, "%s created\n", gp->name); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); fail: g_topology_lock(); if (null == NULL) { g_access(cp, -1, -1, -1); g_part_wither(gp, error); } else { kobj_delete((kobj_t)gp->softc, M_GEOM); gp->softc = null; } gctl_error(req, "%d provider", error); return (error); } static int g_part_ctl_delete(struct gctl_req *req, struct g_part_parms *gpp) { struct g_geom *gp; struct g_provider *pp; struct g_part_entry *entry; struct g_part_table *table; struct sbuf *sb; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted || entry->gpe_internal) continue; if (entry->gpe_index == gpp->gpp_index) break; } if (entry == NULL) { gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index); return (ENOENT); } pp = entry->gpe_pp; if (pp != NULL) { if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) { gctl_error(req, "%d", EBUSY); return (EBUSY); } pp->private = NULL; entry->gpe_pp = NULL; } if (pp != NULL) g_wither_provider(pp, ENXIO); /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); G_PART_FULLNAME(table, entry, sb, gp->name); sbuf_cat(sb, " deleted\n"); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } if (entry->gpe_created) { LIST_REMOVE(entry, gpe_entry); g_free(entry); } else { entry->gpe_modified = 0; entry->gpe_deleted = 1; } return (0); } static int g_part_ctl_destroy(struct gctl_req *req, struct g_part_parms *gpp) { struct g_consumer *cp; struct g_geom *gp; struct g_provider *pp; struct g_part_entry *entry, *tmp; struct g_part_table *null, *table; struct sbuf *sb; int error; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; /* Check for busy providers.
*/ LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted || entry->gpe_internal) continue; if (gpp->gpp_force) { pp = entry->gpe_pp; if (pp == NULL) continue; if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) continue; } gctl_error(req, "%d", EBUSY); return (EBUSY); } if (gpp->gpp_force) { /* Destroy all providers. */ LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) { pp = entry->gpe_pp; if (pp != NULL) { pp->private = NULL; g_wither_provider(pp, ENXIO); } LIST_REMOVE(entry, gpe_entry); g_free(entry); } } error = G_PART_DESTROY(table, gpp); if (error) { gctl_error(req, "%d", error); return (error); } gp->softc = kobj_create((kobj_class_t)&g_part_null_scheme, M_GEOM, M_WAITOK); null = gp->softc; null->gpt_gp = gp; null->gpt_scheme = &g_part_null_scheme; LIST_INIT(&null->gpt_entry); cp = LIST_FIRST(&gp->consumer); pp = cp->provider; null->gpt_last = pp->mediasize / pp->sectorsize - 1; null->gpt_depth = table->gpt_depth; null->gpt_opened = table->gpt_opened; null->gpt_smhead = table->gpt_smhead; null->gpt_smtail = table->gpt_smtail; while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) { LIST_REMOVE(entry, gpe_entry); g_free(entry); } kobj_delete((kobj_t)table, M_GEOM); /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); sbuf_printf(sb, "%s destroyed\n", gp->name); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_modify(struct gctl_req *req, struct g_part_parms *gpp) { struct g_geom *gp; struct g_part_entry *entry; struct g_part_table *table; struct sbuf *sb; int error; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted || entry->gpe_internal) continue; if (entry->gpe_index == gpp->gpp_index) break; } if (entry == NULL) { gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index); return (ENOENT); } error = G_PART_MODIFY(table, entry, gpp); if (error) { gctl_error(req, "%d", error); return (error); } if (!entry->gpe_created) entry->gpe_modified = 1; /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); G_PART_FULLNAME(table, entry, sb, gp->name); sbuf_cat(sb, " modified\n"); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_move(struct gctl_req *req, struct g_part_parms *gpp) { gctl_error(req, "%d verb 'move'", ENOSYS); return (ENOSYS); } static int g_part_ctl_recover(struct gctl_req *req, struct g_part_parms *gpp) { struct g_part_table *table; struct g_geom *gp; struct sbuf *sb; int error, recovered; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; error = recovered = 0; if (table->gpt_corrupt) { error = G_PART_RECOVER(table); if (error == 0) error = g_part_check_integrity(table, LIST_FIRST(&gp->consumer)); if (error) { gctl_error(req, "%d recovering '%s' failed", error, gp->name); return (error); } recovered = 1; } /* Provide feedback if so requested. 
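* Every verb shares this convention: when the request carries an "output" parameter, the handler formats a one-line status message into an auto-extending sbuf and hands it back with gctl_set_param(). The sbuf_len(sb) + 1 in those calls is deliberate: it copies the terminating NUL along with the text, so userland receives a proper C string.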
*/ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); if (recovered) sbuf_printf(sb, "%s recovered\n", gp->name); else sbuf_printf(sb, "%s recovering is not needed\n", gp->name); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_resize(struct gctl_req *req, struct g_part_parms *gpp) { struct g_geom *gp; struct g_provider *pp; struct g_part_entry *pe, *entry; struct g_part_table *table; struct sbuf *sb; quad_t end; int error; off_t mediasize; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; /* check gpp_index */ LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted || entry->gpe_internal) continue; if (entry->gpe_index == gpp->gpp_index) break; } if (entry == NULL) { gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index); return (ENOENT); } /* check gpp_size */ end = entry->gpe_start + gpp->gpp_size - 1; if (gpp->gpp_size < 1 || end > table->gpt_last) { gctl_error(req, "%d size '%jd'", EINVAL, (intmax_t)gpp->gpp_size); return (EINVAL); } LIST_FOREACH(pe, &table->gpt_entry, gpe_entry) { if (pe->gpe_deleted || pe->gpe_internal || pe == entry) continue; if (end >= pe->gpe_start && end <= pe->gpe_end) { gctl_error(req, "%d end '%jd'", ENOSPC, (intmax_t)end); return (ENOSPC); } if (entry->gpe_start < pe->gpe_start && end > pe->gpe_end) { gctl_error(req, "%d size '%jd'", ENOSPC, (intmax_t)gpp->gpp_size); return (ENOSPC); } } pp = entry->gpe_pp; if ((g_debugflags & 16) == 0 && (pp->acr > 0 || pp->acw > 0 || pp->ace > 0)) { if (entry->gpe_end - entry->gpe_start + 1 > gpp->gpp_size) { /* Deny shrinking of an opened partition. */ gctl_error(req, "%d", EBUSY); return (EBUSY); } } error = G_PART_RESIZE(table, entry, gpp); if (error) { gctl_error(req, "%d%s", error, error != EBUSY ? "": " resizing will lead to unexpected shrinking" " due to alignment"); return (error); } if (!entry->gpe_created) entry->gpe_modified = 1; /* update mediasize of changed provider */ mediasize = (entry->gpe_end - entry->gpe_start + 1) * pp->sectorsize; g_resize_provider(pp, mediasize); /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); G_PART_FULLNAME(table, entry, sb, gp->name); sbuf_cat(sb, " resized\n"); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_setunset(struct gctl_req *req, struct g_part_parms *gpp, unsigned int set) { struct g_geom *gp; struct g_part_entry *entry; struct g_part_table *table; struct sbuf *sb; int error; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; if (gpp->gpp_parms & G_PART_PARM_INDEX) { LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_deleted || entry->gpe_internal) continue; if (entry->gpe_index == gpp->gpp_index) break; } if (entry == NULL) { gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index); return (ENOENT); } } else entry = NULL; error = G_PART_SETUNSET(table, entry, gpp->gpp_attrib, set); if (error) { gctl_error(req, "%d attrib '%s'", error, gpp->gpp_attrib); return (error); } /* Provide feedback if so requested. */ if (gpp->gpp_parms & G_PART_PARM_OUTPUT) { sb = sbuf_new_auto(); sbuf_printf(sb, "%s %sset on ", gpp->gpp_attrib, (set) ? 
"" : "un"); if (entry) G_PART_FULLNAME(table, entry, sb, gp->name); else sbuf_cat(sb, gp->name); sbuf_cat(sb, "\n"); sbuf_finish(sb); gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } return (0); } static int g_part_ctl_undo(struct gctl_req *req, struct g_part_parms *gpp) { struct g_consumer *cp; struct g_provider *pp; struct g_geom *gp; struct g_part_entry *entry, *tmp; struct g_part_table *table; int error, reprobe; gp = gpp->gpp_geom; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name)); g_topology_assert(); table = gp->softc; if (!table->gpt_opened) { gctl_error(req, "%d", EPERM); return (EPERM); } cp = LIST_FIRST(&gp->consumer); LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) { entry->gpe_modified = 0; if (entry->gpe_created) { pp = entry->gpe_pp; if (pp != NULL) { pp->private = NULL; entry->gpe_pp = NULL; g_wither_provider(pp, ENXIO); } entry->gpe_deleted = 1; } if (entry->gpe_deleted) { LIST_REMOVE(entry, gpe_entry); g_free(entry); } } g_topology_unlock(); reprobe = (table->gpt_scheme == &g_part_null_scheme || table->gpt_created) ? 1 : 0; if (reprobe) { LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_internal) continue; error = EBUSY; goto fail; } while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) { LIST_REMOVE(entry, gpe_entry); g_free(entry); } error = g_part_probe(gp, cp, table->gpt_depth); if (error) { g_topology_lock(); g_access(cp, -1, -1, -1); g_part_wither(gp, error); return (0); } table = gp->softc; /* * Synthesize a disk geometry. Some partitioning schemes * depend on it and since some file systems need it even * when the partitition scheme doesn't, we do it here in * scheme-independent code. */ pp = cp->provider; g_part_geometry(table, cp, pp->mediasize / pp->sectorsize); } error = G_PART_READ(table, cp); if (error) goto fail; error = g_part_check_integrity(table, cp); if (error) goto fail; g_topology_lock(); LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (!entry->gpe_internal) g_part_new_provider(gp, table, entry); } table->gpt_opened = 0; g_access(cp, -1, -1, -1); return (0); fail: g_topology_lock(); gctl_error(req, "%d", error); return (error); } static void g_part_wither(struct g_geom *gp, int error) { struct g_part_entry *entry; struct g_part_table *table; table = gp->softc; if (table != NULL) { G_PART_DESTROY(table, NULL); while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) { LIST_REMOVE(entry, gpe_entry); g_free(entry); } if (gp->softc != NULL) { kobj_delete((kobj_t)gp->softc, M_GEOM); gp->softc = NULL; } } g_wither_geom(gp, error); } /* * Class methods. 
*/ static void g_part_ctlreq(struct gctl_req *req, struct g_class *mp, const char *verb) { struct g_part_parms gpp; struct g_part_table *table; struct gctl_req_arg *ap; enum g_part_ctl ctlreq; unsigned int i, mparms, oparms, parm; int auto_commit, close_on_error; int error, modifies; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, verb)); g_topology_assert(); ctlreq = G_PART_CTL_NONE; modifies = 1; mparms = 0; oparms = G_PART_PARM_FLAGS | G_PART_PARM_OUTPUT | G_PART_PARM_VERSION; switch (*verb) { case 'a': if (!strcmp(verb, "add")) { ctlreq = G_PART_CTL_ADD; mparms |= G_PART_PARM_GEOM | G_PART_PARM_SIZE | G_PART_PARM_START | G_PART_PARM_TYPE; oparms |= G_PART_PARM_INDEX | G_PART_PARM_LABEL; } break; case 'b': if (!strcmp(verb, "bootcode")) { ctlreq = G_PART_CTL_BOOTCODE; mparms |= G_PART_PARM_GEOM | G_PART_PARM_BOOTCODE; } break; case 'c': if (!strcmp(verb, "commit")) { ctlreq = G_PART_CTL_COMMIT; mparms |= G_PART_PARM_GEOM; modifies = 0; } else if (!strcmp(verb, "create")) { ctlreq = G_PART_CTL_CREATE; mparms |= G_PART_PARM_PROVIDER | G_PART_PARM_SCHEME; oparms |= G_PART_PARM_ENTRIES; } break; case 'd': if (!strcmp(verb, "delete")) { ctlreq = G_PART_CTL_DELETE; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX; } else if (!strcmp(verb, "destroy")) { ctlreq = G_PART_CTL_DESTROY; mparms |= G_PART_PARM_GEOM; oparms |= G_PART_PARM_FORCE; } break; case 'm': if (!strcmp(verb, "modify")) { ctlreq = G_PART_CTL_MODIFY; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX; oparms |= G_PART_PARM_LABEL | G_PART_PARM_TYPE; } else if (!strcmp(verb, "move")) { ctlreq = G_PART_CTL_MOVE; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX; } break; case 'r': if (!strcmp(verb, "recover")) { ctlreq = G_PART_CTL_RECOVER; mparms |= G_PART_PARM_GEOM; } else if (!strcmp(verb, "resize")) { ctlreq = G_PART_CTL_RESIZE; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX | G_PART_PARM_SIZE; } break; case 's': if (!strcmp(verb, "set")) { ctlreq = G_PART_CTL_SET; mparms |= G_PART_PARM_ATTRIB | G_PART_PARM_GEOM; oparms |= G_PART_PARM_INDEX; } break; case 'u': if (!strcmp(verb, "undo")) { ctlreq = G_PART_CTL_UNDO; mparms |= G_PART_PARM_GEOM; modifies = 0; } else if (!strcmp(verb, "unset")) { ctlreq = G_PART_CTL_UNSET; mparms |= G_PART_PARM_ATTRIB | G_PART_PARM_GEOM; oparms |= G_PART_PARM_INDEX; } break; } if (ctlreq == G_PART_CTL_NONE) { gctl_error(req, "%d verb '%s'", EINVAL, verb); return; } bzero(&gpp, sizeof(gpp)); for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; parm = 0; switch (ap->name[0]) { case 'a': if (!strcmp(ap->name, "arg0")) { parm = mparms & (G_PART_PARM_GEOM | G_PART_PARM_PROVIDER); } if (!strcmp(ap->name, "attrib")) parm = G_PART_PARM_ATTRIB; break; case 'b': if (!strcmp(ap->name, "bootcode")) parm = G_PART_PARM_BOOTCODE; break; case 'c': if (!strcmp(ap->name, "class")) continue; break; case 'e': if (!strcmp(ap->name, "entries")) parm = G_PART_PARM_ENTRIES; break; case 'f': if (!strcmp(ap->name, "flags")) parm = G_PART_PARM_FLAGS; else if (!strcmp(ap->name, "force")) parm = G_PART_PARM_FORCE; break; case 'i': if (!strcmp(ap->name, "index")) parm = G_PART_PARM_INDEX; break; case 'l': if (!strcmp(ap->name, "label")) parm = G_PART_PARM_LABEL; break; case 'o': if (!strcmp(ap->name, "output")) parm = G_PART_PARM_OUTPUT; break; case 's': if (!strcmp(ap->name, "scheme")) parm = G_PART_PARM_SCHEME; else if (!strcmp(ap->name, "size")) parm = G_PART_PARM_SIZE; else if (!strcmp(ap->name, "start")) parm = G_PART_PARM_START; break; case 't': if (!strcmp(ap->name, "type")) parm = G_PART_PARM_TYPE; break; case 'v': if 
(!strcmp(ap->name, "verb")) continue; else if (!strcmp(ap->name, "version")) parm = G_PART_PARM_VERSION; break; } if ((parm & (mparms | oparms)) == 0) { gctl_error(req, "%d param '%s'", EINVAL, ap->name); return; } switch (parm) { case G_PART_PARM_ATTRIB: error = g_part_parm_str(req, ap->name, &gpp.gpp_attrib); break; case G_PART_PARM_BOOTCODE: error = g_part_parm_bootcode(req, ap->name, &gpp.gpp_codeptr, &gpp.gpp_codesize); break; case G_PART_PARM_ENTRIES: error = g_part_parm_intmax(req, ap->name, &gpp.gpp_entries); break; case G_PART_PARM_FLAGS: error = g_part_parm_str(req, ap->name, &gpp.gpp_flags); break; case G_PART_PARM_FORCE: error = g_part_parm_uint32(req, ap->name, &gpp.gpp_force); break; case G_PART_PARM_GEOM: error = g_part_parm_geom(req, ap->name, &gpp.gpp_geom); break; case G_PART_PARM_INDEX: error = g_part_parm_intmax(req, ap->name, &gpp.gpp_index); break; case G_PART_PARM_LABEL: error = g_part_parm_str(req, ap->name, &gpp.gpp_label); break; case G_PART_PARM_OUTPUT: error = 0; /* Write-only parameter */ break; case G_PART_PARM_PROVIDER: error = g_part_parm_provider(req, ap->name, &gpp.gpp_provider); break; case G_PART_PARM_SCHEME: error = g_part_parm_scheme(req, ap->name, &gpp.gpp_scheme); break; case G_PART_PARM_SIZE: error = g_part_parm_quad(req, ap->name, &gpp.gpp_size); break; case G_PART_PARM_START: error = g_part_parm_quad(req, ap->name, &gpp.gpp_start); break; case G_PART_PARM_TYPE: error = g_part_parm_str(req, ap->name, &gpp.gpp_type); break; case G_PART_PARM_VERSION: error = g_part_parm_uint32(req, ap->name, &gpp.gpp_version); break; default: error = EDOOFUS; gctl_error(req, "%d %s", error, ap->name); break; } if (error != 0) { if (error == ENOATTR) { gctl_error(req, "%d param '%s'", error, ap->name); } return; } gpp.gpp_parms |= parm; } if ((gpp.gpp_parms & mparms) != mparms) { parm = mparms - (gpp.gpp_parms & mparms); gctl_error(req, "%d param '%x'", ENOATTR, parm); return; } /* Obtain permissions if possible/necessary. */ close_on_error = 0; table = NULL; if (modifies && (gpp.gpp_parms & G_PART_PARM_GEOM)) { table = gpp.gpp_geom->softc; if (table != NULL && table->gpt_corrupt && ctlreq != G_PART_CTL_DESTROY && ctlreq != G_PART_CTL_RECOVER) { gctl_error(req, "%d table '%s' is corrupt", EPERM, gpp.gpp_geom->name); return; } if (table != NULL && !table->gpt_opened) { error = g_access(LIST_FIRST(&gpp.gpp_geom->consumer), 1, 1, 1); if (error) { gctl_error(req, "%d geom '%s'", error, gpp.gpp_geom->name); return; } table->gpt_opened = 1; close_on_error = 1; } } /* Allow the scheme to check or modify the parameters. */ if (table != NULL) { error = G_PART_PRECHECK(table, ctlreq, &gpp); if (error) { gctl_error(req, "%d pre-check failed", error); goto out; } } else error = EDOOFUS; /* Prevent bogus uninit. warning. 
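* Every branch of the switch below assigns error before it is read again, so the EDOOFUS store is never consumed; it merely silences compilers that cannot prove that. After the dispatch, a 'C' character in the flags parameter requests an automatic commit of whatever the verb changed; gpart(8) passes the flags as its -f option and defaults them to "C", which is why a plain "gpart add" takes effect immediately instead of staying pending until an explicit "gpart commit".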
*/ switch (ctlreq) { case G_PART_CTL_NONE: panic("%s", __func__); case G_PART_CTL_ADD: error = g_part_ctl_add(req, &gpp); break; case G_PART_CTL_BOOTCODE: error = g_part_ctl_bootcode(req, &gpp); break; case G_PART_CTL_COMMIT: error = g_part_ctl_commit(req, &gpp); break; case G_PART_CTL_CREATE: error = g_part_ctl_create(req, &gpp); break; case G_PART_CTL_DELETE: error = g_part_ctl_delete(req, &gpp); break; case G_PART_CTL_DESTROY: error = g_part_ctl_destroy(req, &gpp); break; case G_PART_CTL_MODIFY: error = g_part_ctl_modify(req, &gpp); break; case G_PART_CTL_MOVE: error = g_part_ctl_move(req, &gpp); break; case G_PART_CTL_RECOVER: error = g_part_ctl_recover(req, &gpp); break; case G_PART_CTL_RESIZE: error = g_part_ctl_resize(req, &gpp); break; case G_PART_CTL_SET: error = g_part_ctl_setunset(req, &gpp, 1); break; case G_PART_CTL_UNDO: error = g_part_ctl_undo(req, &gpp); break; case G_PART_CTL_UNSET: error = g_part_ctl_setunset(req, &gpp, 0); break; } /* Implement automatic commit. */ if (!error) { auto_commit = (modifies && (gpp.gpp_parms & G_PART_PARM_FLAGS) && strchr(gpp.gpp_flags, 'C') != NULL) ? 1 : 0; if (auto_commit) { KASSERT(gpp.gpp_parms & G_PART_PARM_GEOM, ("%s", __func__)); error = g_part_ctl_commit(req, &gpp); } } out: if (error && close_on_error) { g_access(LIST_FIRST(&gpp.gpp_geom->consumer), -1, -1, -1); table->gpt_opened = 0; } } static int g_part_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, gp->name)); g_topology_assert(); g_part_wither(gp, EINVAL); return (0); } static struct g_geom * g_part_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_part_entry *entry; struct g_part_table *table; struct root_hold_token *rht; struct g_geom_alias *gap; int attr, depth; int error; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, pp->name)); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); /* * Create a GEOM with consumer and hook it up to the provider. * With that we become part of the topology. Obtain read access * to the provider. */ gp = g_new_geomf(mp, "%s", pp->name); LIST_FOREACH(gap, &pp->geom->aliases, ga_next) g_geom_add_alias(gp, gap->ga_alias); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) error = g_access(cp, 1, 0, 0); if (error != 0) { if (cp->provider) g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } rht = root_mount_hold(mp->name); g_topology_unlock(); /* * Short-circuit the whole probing galore when there's no * media present. */ if (pp->mediasize == 0 || pp->sectorsize == 0) { error = ENODEV; goto fail; } /* Make sure we can nest and if so, determine our depth. */ error = g_getattr("PART::isleaf", cp, &attr); if (!error && attr) { error = ENODEV; goto fail; } error = g_getattr("PART::depth", cp, &attr); depth = (!error) ? attr + 1 : 0; error = g_part_probe(gp, cp, depth); if (error) goto fail; table = gp->softc; /* * Synthesize a disk geometry. Some partitioning schemes * depend on it and since some file systems need it even * when the partition scheme doesn't, we do it here in * scheme-independent code.
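* A sketch of the resulting arithmetic, assuming the common 255-head, 63-sector translation (the actual values are chosen by g_part_geometry_heads()): a disk of 33554432 512-byte blocks would be described as 255 heads and 63 sectors per track, giving 33554432 / (255 * 63) = 2088 cylinders, with the division remainder simply unaddressable in C/H/S terms.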
*/ g_part_geometry(table, cp, pp->mediasize / pp->sectorsize); error = G_PART_READ(table, cp); if (error) goto fail; error = g_part_check_integrity(table, cp); if (error) goto fail; g_topology_lock(); LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (!entry->gpe_internal) g_part_new_provider(gp, table, entry); } root_mount_rel(rht); g_access(cp, -1, 0, 0); return (gp); fail: g_topology_lock(); root_mount_rel(rht); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } /* * Geom methods. */ static int g_part_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *cp; G_PART_TRACE((G_T_ACCESS, "%s(%s,%d,%d,%d)", __func__, pp->name, dr, dw, de)); cp = LIST_FIRST(&pp->geom->consumer); /* We always gain write-exclusive access. */ return (g_access(cp, dr, dw, dw + de)); } static void g_part_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { char buf[64]; struct g_part_entry *entry; struct g_part_table *table; KASSERT(sb != NULL && gp != NULL, ("%s", __func__)); table = gp->softc; if (indent == NULL) { KASSERT(cp == NULL && pp != NULL, ("%s", __func__)); entry = pp->private; if (entry == NULL) return; sbuf_printf(sb, " i %u o %ju ty %s", entry->gpe_index, (uintmax_t)entry->gpe_offset, G_PART_TYPE(table, entry, buf, sizeof(buf))); /* * libdisk compatibility quirk - the scheme dumps the * slicer name and partition type in a way that is * compatible with libdisk. When libdisk is not used * anymore, this should go away. */ G_PART_DUMPCONF(table, entry, sb, indent); } else if (cp != NULL) { /* Consumer configuration. */ KASSERT(pp == NULL, ("%s", __func__)); /* none */ } else if (pp != NULL) { /* Provider configuration. */ entry = pp->private; if (entry == NULL) return; sbuf_printf(sb, "%s<start>%ju</start>\n", indent, (uintmax_t)entry->gpe_start); sbuf_printf(sb, "%s<end>%ju</end>\n", indent, (uintmax_t)entry->gpe_end); sbuf_printf(sb, "%s<index>%u</index>\n", indent, entry->gpe_index); sbuf_printf(sb, "%s<type>%s</type>\n", indent, G_PART_TYPE(table, entry, buf, sizeof(buf))); sbuf_printf(sb, "%s<offset>%ju</offset>\n", indent, (uintmax_t)entry->gpe_offset); sbuf_printf(sb, "%s<length>%ju</length>\n", indent, (uintmax_t)pp->mediasize); G_PART_DUMPCONF(table, entry, sb, indent); } else { /* Geom configuration. */ sbuf_printf(sb, "%s<scheme>%s</scheme>\n", indent, table->gpt_scheme->name); sbuf_printf(sb, "%s<entries>%u</entries>\n", indent, table->gpt_entries); sbuf_printf(sb, "%s<first>%ju</first>\n", indent, (uintmax_t)table->gpt_first); sbuf_printf(sb, "%s<last>%ju</last>\n", indent, (uintmax_t)table->gpt_last); sbuf_printf(sb, "%s<fwsectors>%u</fwsectors>\n", indent, table->gpt_sectors); sbuf_printf(sb, "%s<fwheads>%u</fwheads>\n", indent, table->gpt_heads); sbuf_printf(sb, "%s<state>%s</state>\n", indent, table->gpt_corrupt ? "CORRUPT": "OK"); sbuf_printf(sb, "%s<modified>%s</modified>\n", indent, table->gpt_opened ? "true": "false"); G_PART_DUMPCONF(table, NULL, sb, indent); } } /*- * This start routine is only called for non-trivial requests, all the * trivial ones are handled autonomously by the slice code. * For requests we handle here, we must call the g_io_deliver() on the * bio, and return non-zero to indicate to the slice code that we did so. * This code executes in the "DOWN" I/O path, this means: * * No sleeping. * * Don't grab the topology lock.
* * Don't call biowait, g_getattr(), g_setattr() or g_read_data() */ static int g_part_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td) { struct g_part_table *table; table = pp->geom->softc; return G_PART_IOCTL(table, pp, cmd, data, fflag, td); } static void g_part_resize(struct g_consumer *cp) { struct g_part_table *table; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name)); g_topology_assert(); if (auto_resize == 0) return; table = cp->geom->softc; if (table->gpt_opened == 0) { if (g_access(cp, 1, 1, 1) != 0) return; table->gpt_opened = 1; } if (G_PART_RESIZE(table, NULL, NULL) == 0) printf("GEOM_PART: %s was automatically resized.\n" " Use `gpart commit %s` to save changes or " "`gpart undo %s` to revert them.\n", cp->geom->name, cp->geom->name, cp->geom->name); if (g_part_check_integrity(table, cp) != 0) { g_access(cp, -1, -1, -1); table->gpt_opened = 0; g_part_wither(table->gpt_gp, ENXIO); } } static void g_part_orphan(struct g_consumer *cp) { struct g_provider *pp; struct g_part_table *table; pp = cp->provider; KASSERT(pp != NULL, ("%s", __func__)); G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, pp->name)); g_topology_assert(); KASSERT(pp->error != 0, ("%s", __func__)); table = cp->geom->softc; if (table != NULL && table->gpt_opened) g_access(cp, -1, -1, -1); g_part_wither(cp->geom, pp->error); } static void g_part_spoiled(struct g_consumer *cp) { G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name)); g_topology_assert(); cp->flags |= G_CF_ORPHAN; g_part_wither(cp->geom, ENXIO); } static void g_part_start(struct bio *bp) { struct bio *bp2; struct g_consumer *cp; struct g_geom *gp; struct g_part_entry *entry; struct g_part_table *table; struct g_kerneldump *gkd; struct g_provider *pp; char buf[64]; biotrack(bp, __func__); pp = bp->bio_to; gp = pp->geom; table = gp->softc; cp = LIST_FIRST(&gp->consumer); G_PART_TRACE((G_T_BIO, "%s: cmd=%d, provider=%s", __func__, bp->bio_cmd, pp->name)); entry = pp->private; if (entry == NULL) { g_io_deliver(bp, ENXIO); return; } switch(bp->bio_cmd) { case BIO_DELETE: case BIO_READ: case BIO_WRITE: if (bp->bio_offset >= pp->mediasize) { g_io_deliver(bp, EIO); return; } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } if (bp2->bio_offset + bp2->bio_length > pp->mediasize) bp2->bio_length = pp->mediasize - bp2->bio_offset; bp2->bio_done = g_std_done; bp2->bio_offset += entry->gpe_offset; g_io_request(bp2, cp); return; case BIO_FLUSH: break; case BIO_GETATTR: if (g_handleattr_int(bp, "GEOM::fwheads", table->gpt_heads)) return; if (g_handleattr_int(bp, "GEOM::fwsectors", table->gpt_sectors)) return; if (g_handleattr_int(bp, "PART::isleaf", table->gpt_isleaf)) return; if (g_handleattr_int(bp, "PART::depth", table->gpt_depth)) return; if (g_handleattr_str(bp, "PART::scheme", table->gpt_scheme->name)) return; if (g_handleattr_str(bp, "PART::type", G_PART_TYPE(table, entry, buf, sizeof(buf)))) return; if (!strcmp("GEOM::kerneldump", bp->bio_attribute)) { /* * Check that the partition is suitable for kernel * dumps. Typically only swap partitions should be * used. If the request comes from the nested scheme * we allow dumping there as well. 
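* Once the partition qualifies, the dump window is rebased from partition-relative to parent-relative byte offsets: gkd->length is clamped so that offset + length cannot run past the partition, and gkd->offset then has entry->gpe_offset added to it. For illustration, with a swap partition that begins 0x40000000 bytes into the parent disk, a dump configured at offset 0 ends up at parent offset 0x40000000.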
*/ if ((bp->bio_from == NULL || bp->bio_from->geom->class != &g_part_class) && G_PART_DUMPTO(table, entry) == 0) { g_io_deliver(bp, ENODEV); printf("GEOM_PART: Partition '%s' not suitable" " for kernel dumps (wrong type?)\n", pp->name); return; } gkd = (struct g_kerneldump *)bp->bio_data; if (gkd->offset >= pp->mediasize) { g_io_deliver(bp, EIO); return; } if (gkd->offset + gkd->length > pp->mediasize) gkd->length = pp->mediasize - gkd->offset; gkd->offset += entry->gpe_offset; } break; default: g_io_deliver(bp, EOPNOTSUPP); return; } bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_std_done; g_io_request(bp2, cp); } static void g_part_init(struct g_class *mp) { TAILQ_INSERT_HEAD(&g_part_schemes, &g_part_null_scheme, scheme_list); } static void g_part_fini(struct g_class *mp) { TAILQ_REMOVE(&g_part_schemes, &g_part_null_scheme, scheme_list); } static void g_part_unload_event(void *arg, int flag) { struct g_consumer *cp; struct g_geom *gp; struct g_provider *pp; struct g_part_scheme *scheme; struct g_part_table *table; uintptr_t *xchg; int acc, error; if (flag == EV_CANCEL) return; xchg = arg; error = 0; scheme = (void *)(*xchg); g_topology_assert(); LIST_FOREACH(gp, &g_part_class.geom, geom) { table = gp->softc; if (table->gpt_scheme != scheme) continue; acc = 0; LIST_FOREACH(pp, &gp->provider, provider) acc += pp->acr + pp->acw + pp->ace; LIST_FOREACH(cp, &gp->consumer, consumer) acc += cp->acr + cp->acw + cp->ace; if (!acc) g_part_wither(gp, ENOSYS); else error = EBUSY; } if (!error) TAILQ_REMOVE(&g_part_schemes, scheme, scheme_list); *xchg = error; } int g_part_modevent(module_t mod, int type, struct g_part_scheme *scheme) { struct g_part_scheme *iter; uintptr_t arg; int error; error = 0; switch (type) { case MOD_LOAD: TAILQ_FOREACH(iter, &g_part_schemes, scheme_list) { if (scheme == iter) { printf("GEOM_PART: scheme %s is already " "registered!\n", scheme->name); break; } } if (iter == NULL) { TAILQ_INSERT_TAIL(&g_part_schemes, scheme, scheme_list); g_retaste(&g_part_class); } break; case MOD_UNLOAD: arg = (uintptr_t)scheme; error = g_waitfor_event(g_part_unload_event, &arg, M_WAITOK, NULL); if (error == 0) error = arg; break; default: error = EOPNOTSUPP; break; } return (error); } Index: head/sys/geom/part/g_part.h =================================================================== --- head/sys/geom/part/g_part.h (revision 326269) +++ head/sys/geom/part/g_part.h (revision 326270) @@ -1,238 +1,240 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006-2008 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _GEOM_PART_H_ #define _GEOM_PART_H_ #define G_PART_TRACE(args) g_trace args #define G_PART_PROBE_PRI_LOW -10 #define G_PART_PROBE_PRI_NORM -5 #define G_PART_PROBE_PRI_HIGH 0 enum g_part_alias { G_PART_ALIAS_APPLE_BOOT, /* An Apple boot partition entry. */ G_PART_ALIAS_APPLE_CORE_STORAGE,/* An Apple Core Storage partition. */ G_PART_ALIAS_APPLE_HFS, /* An HFS+ file system entry. */ G_PART_ALIAS_APPLE_LABEL, /* An Apple label partition entry. */ G_PART_ALIAS_APPLE_RAID, /* An Apple RAID partition entry. */ G_PART_ALIAS_APPLE_RAID_OFFLINE,/* An Apple RAID (offline) part entry.*/ G_PART_ALIAS_APPLE_TV_RECOVERY, /* An Apple TV recovery part entry. */ G_PART_ALIAS_APPLE_UFS, /* An Apple UFS partition entry. */ G_PART_ALIAS_BIOS_BOOT, /* A GRUB 2 boot partition entry. */ G_PART_ALIAS_CHROMEOS_FIRMWARE, /* A ChromeOS firmware part. entry. */ G_PART_ALIAS_CHROMEOS_KERNEL, /* A ChromeOS Kernel part. entry. */ G_PART_ALIAS_CHROMEOS_RESERVED, /* ChromeOS. Reserved for future use. */ G_PART_ALIAS_CHROMEOS_ROOT, /* A ChromeOS root part. entry. */ G_PART_ALIAS_DFBSD, /* A DfBSD label32 partition entry */ G_PART_ALIAS_DFBSD64, /* A DfBSD label64 partition entry */ G_PART_ALIAS_DFBSD_CCD, /* A DfBSD CCD partition entry */ G_PART_ALIAS_DFBSD_HAMMER, /* A DfBSD HAMMER FS partition entry */ G_PART_ALIAS_DFBSD_HAMMER2, /* A DfBSD HAMMER2 FS partition entry */ G_PART_ALIAS_DFBSD_LEGACY, /* A DfBSD legacy partition entry */ G_PART_ALIAS_DFBSD_SWAP, /* A DfBSD swap partition entry */ G_PART_ALIAS_DFBSD_UFS, /* A DfBSD UFS partition entry */ G_PART_ALIAS_DFBSD_VINUM, /* A DfBSD Vinum partition entry */ G_PART_ALIAS_EBR, /* An EBR partition entry. */ G_PART_ALIAS_EFI, /* An EFI system partition entry. */ G_PART_ALIAS_FREEBSD, /* A BSD labeled partition entry. */ G_PART_ALIAS_FREEBSD_BOOT, /* A FreeBSD boot partition entry. */ G_PART_ALIAS_FREEBSD_NANDFS, /* A FreeBSD nandfs partition entry. */ G_PART_ALIAS_FREEBSD_SWAP, /* A swap partition entry. */ G_PART_ALIAS_FREEBSD_UFS, /* A UFS/UFS2 file system entry. */ G_PART_ALIAS_FREEBSD_VINUM, /* A Vinum partition entry. */ G_PART_ALIAS_FREEBSD_ZFS, /* A ZFS file system entry. */ G_PART_ALIAS_LINUX_DATA, /* A Linux data partition entry. */ G_PART_ALIAS_LINUX_LVM, /* A Linux LVM partition entry. */ G_PART_ALIAS_LINUX_RAID, /* A Linux RAID partition entry. */ G_PART_ALIAS_LINUX_SWAP, /* A Linux swap partition entry. */ G_PART_ALIAS_MBR, /* An MBR (extended) partition entry. */ G_PART_ALIAS_MS_BASIC_DATA, /* A Microsoft Data part. entry. */ G_PART_ALIAS_MS_FAT16, /* A Microsoft FAT16 partition entry. */ G_PART_ALIAS_MS_FAT32, /* A Microsoft FAT32 partition entry. */ G_PART_ALIAS_MS_LDM_DATA, /* A Microsoft LDM Data part. entry. */ G_PART_ALIAS_MS_LDM_METADATA, /* A Microsoft LDM Metadata entry. */ G_PART_ALIAS_MS_NTFS, /* A Microsoft NTFS partition entry */ G_PART_ALIAS_MS_RECOVERY, /* A Microsoft recovery part. entry. */ G_PART_ALIAS_MS_RESERVED, /* A Microsoft Reserved part. entry. */ G_PART_ALIAS_MS_SPACES, /* A Microsoft Spaces part. entry.
*/ G_PART_ALIAS_NETBSD_CCD, /* A NetBSD CCD partition entry. */ G_PART_ALIAS_NETBSD_CGD, /* A NetBSD CGD partition entry. */ G_PART_ALIAS_NETBSD_FFS, /* A NetBSD FFS partition entry. */ G_PART_ALIAS_NETBSD_LFS, /* A NetBSD LFS partition entry. */ G_PART_ALIAS_NETBSD_RAID, /* A NetBSD RAID partition entry. */ G_PART_ALIAS_NETBSD_SWAP, /* A NetBSD swap partition entry. */ G_PART_ALIAS_OPENBSD_DATA, /* An OpenBSD data partition entry. */ G_PART_ALIAS_PREP_BOOT, /* A PREP/CHRP boot partition entry. */ G_PART_ALIAS_VMFS, /* A VMware VMFS partition entry */ G_PART_ALIAS_VMKDIAG, /* A VMware vmkDiagnostic partition entry */ G_PART_ALIAS_VMRESERVED, /* A VMware reserved partition entry */ G_PART_ALIAS_VMVSANHDR, /* A VMware vSAN header partition entry */ /* Keep the following last */ G_PART_ALIAS_COUNT }; const char *g_part_alias_name(enum g_part_alias); /* G_PART scheme (KOBJ class). */ struct g_part_scheme { KOBJ_CLASS_FIELDS; size_t gps_entrysz; int gps_minent; int gps_maxent; int gps_bootcodesz; TAILQ_ENTRY(g_part_scheme) scheme_list; }; struct g_part_entry { LIST_ENTRY(g_part_entry) gpe_entry; struct g_provider *gpe_pp; /* Corresponding provider. */ off_t gpe_offset; /* Byte offset. */ quad_t gpe_start; /* First LBA of partition. */ quad_t gpe_end; /* Last LBA of partition. */ int gpe_index; int gpe_created:1; /* Entry is newly created. */ int gpe_deleted:1; /* Entry has been deleted. */ int gpe_modified:1; /* Entry has been modified. */ int gpe_internal:1; /* Entry is not a used entry. */ }; /* G_PART table (KOBJ instance). */ struct g_part_table { KOBJ_FIELDS; struct g_part_scheme *gpt_scheme; struct g_geom *gpt_gp; LIST_HEAD(, g_part_entry) gpt_entry; quad_t gpt_first; /* First allocatable LBA */ quad_t gpt_last; /* Last allocatable LBA */ int gpt_entries; /* * gpt_smhead and gpt_smtail are bitmaps representing the first * 32 sectors on the disk (gpt_smhead) and the last 32 sectors * on the disk (gpt_smtail). These maps are used by the commit * verb to clear sectors previously used by a scheme after the * partitioning scheme has been destroyed. */ uint32_t gpt_smhead; uint32_t gpt_smtail; /* * gpt_sectors and gpt_heads are the fixed or synthesized number * of sectors per track and heads (resp) that make up a disk's * geometry. This is to support partitioning schemes as well as * file systems that work on a geometry. The MBR scheme and the * MS-DOS (FAT) file system come to mind. * We keep track of whether the geometry is fixed or synthesized * so that a partitioning scheme can correct the synthesized * geometry, based on the on-disk metadata. */ uint32_t gpt_sectors; uint32_t gpt_heads; int gpt_depth; /* Sub-partitioning level. */ int gpt_isleaf:1; /* Cannot be sub-partitioned. */ int gpt_created:1; /* Newly created. */ int gpt_modified:1; /* Table changes have been made. */ int gpt_opened:1; /* Permissions obtained. */ int gpt_fixgeom:1; /* Geometry is fixed. */ int gpt_corrupt:1; /* Table is corrupt. */ }; struct g_part_entry *g_part_new_entry(struct g_part_table *, int, quad_t, quad_t); enum g_part_ctl { G_PART_CTL_NONE, G_PART_CTL_ADD, G_PART_CTL_BOOTCODE, G_PART_CTL_COMMIT, G_PART_CTL_CREATE, G_PART_CTL_DELETE, G_PART_CTL_DESTROY, G_PART_CTL_MODIFY, G_PART_CTL_MOVE, G_PART_CTL_RECOVER, G_PART_CTL_RESIZE, G_PART_CTL_SET, G_PART_CTL_UNDO, G_PART_CTL_UNSET }; /* G_PART ctlreq parameters.
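* Each verb accepted by g_part_ctlreq() declares a mandatory and an optional subset of these bits; a request fails with ENOATTR when a mandatory parameter is missing and with EINVAL when it carries a parameter outside the union of the two sets. For example, the "add" verb requires GEOM, SIZE, START and TYPE, and additionally accepts INDEX and LABEL.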
*/ #define G_PART_PARM_ENTRIES 0x0001 #define G_PART_PARM_FLAGS 0x0002 #define G_PART_PARM_GEOM 0x0004 #define G_PART_PARM_INDEX 0x0008 #define G_PART_PARM_LABEL 0x0010 #define G_PART_PARM_OUTPUT 0x0020 #define G_PART_PARM_PROVIDER 0x0040 #define G_PART_PARM_SCHEME 0x0080 #define G_PART_PARM_SIZE 0x0100 #define G_PART_PARM_START 0x0200 #define G_PART_PARM_TYPE 0x0400 #define G_PART_PARM_VERSION 0x0800 #define G_PART_PARM_BOOTCODE 0x1000 #define G_PART_PARM_ATTRIB 0x2000 #define G_PART_PARM_FORCE 0x4000 struct g_part_parms { unsigned int gpp_parms; unsigned int gpp_entries; const char *gpp_flags; struct g_geom *gpp_geom; unsigned int gpp_index; const char *gpp_label; struct g_provider *gpp_provider; struct g_part_scheme *gpp_scheme; quad_t gpp_size; quad_t gpp_start; const char *gpp_type; unsigned int gpp_version; const void *gpp_codeptr; unsigned int gpp_codesize; const char *gpp_attrib; unsigned int gpp_force; }; void g_part_geometry_heads(off_t, u_int, off_t *, u_int *); int g_part_modevent(module_t, int, struct g_part_scheme *); #define G_PART_SCHEME_DECLARE(name) \ static int name##_modevent(module_t mod, int tp, void *d) \ { \ return (g_part_modevent(mod, tp, d)); \ } \ static moduledata_t name##_mod = { \ #name, \ name##_modevent, \ &name##_scheme \ }; \ DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_ANY); \ MODULE_DEPEND(name, g_part, 0, 0, 0) #endif /* !_GEOM_PART_H_ */ Index: head/sys/geom/part/g_part_apm.c =================================================================== --- head/sys/geom/part/g_part_apm.c (revision 326269) +++ head/sys/geom/part/g_part_apm.c (revision 326270) @@ -1,594 +1,596 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006-2008 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_apm, "GEOM partitioning class for Apple-style partitions"); struct g_part_apm_table { struct g_part_table base; struct apm_ddr ddr; struct apm_ent self; int tivo_series1; }; struct g_part_apm_entry { struct g_part_entry base; struct apm_ent ent; }; static int g_part_apm_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_apm_create(struct g_part_table *, struct g_part_parms *); static int g_part_apm_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_apm_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_apm_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_apm_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_apm_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_apm_probe(struct g_part_table *, struct g_consumer *); static int g_part_apm_read(struct g_part_table *, struct g_consumer *); static const char *g_part_apm_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_apm_write(struct g_part_table *, struct g_consumer *); static int g_part_apm_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static kobj_method_t g_part_apm_methods[] = { KOBJMETHOD(g_part_add, g_part_apm_add), KOBJMETHOD(g_part_create, g_part_apm_create), KOBJMETHOD(g_part_destroy, g_part_apm_destroy), KOBJMETHOD(g_part_dumpconf, g_part_apm_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_apm_dumpto), KOBJMETHOD(g_part_modify, g_part_apm_modify), KOBJMETHOD(g_part_resize, g_part_apm_resize), KOBJMETHOD(g_part_name, g_part_apm_name), KOBJMETHOD(g_part_probe, g_part_apm_probe), KOBJMETHOD(g_part_read, g_part_apm_read), KOBJMETHOD(g_part_type, g_part_apm_type), KOBJMETHOD(g_part_write, g_part_apm_write), { 0, 0 } }; static struct g_part_scheme g_part_apm_scheme = { "APM", g_part_apm_methods, sizeof(struct g_part_apm_table), .gps_entrysz = sizeof(struct g_part_apm_entry), .gps_minent = 16, .gps_maxent = 4096, }; G_PART_SCHEME_DECLARE(g_part_apm); static void swab(char *buf, size_t bufsz) { int i; char ch; for (i = 0; i < bufsz; i += 2) { ch = buf[i]; buf[i] = buf[i + 1]; buf[i + 1] = ch; } } static int apm_parse_type(const char *type, char *buf, size_t bufsz) { const char *alias; if (type[0] == '!') { type++; if (strlen(type) > bufsz) return (EINVAL); if (!strcmp(type, APM_ENT_TYPE_SELF) || !strcmp(type, APM_ENT_TYPE_UNUSED)) return (EINVAL); strncpy(buf, type, bufsz); return (0); } alias = g_part_alias_name(G_PART_ALIAS_APPLE_BOOT); if (!strcasecmp(type, alias)) { strcpy(buf, APM_ENT_TYPE_APPLE_BOOT); return (0); } alias = g_part_alias_name(G_PART_ALIAS_APPLE_HFS); if (!strcasecmp(type, alias)) { strcpy(buf, APM_ENT_TYPE_APPLE_HFS); return (0); } alias = g_part_alias_name(G_PART_ALIAS_APPLE_UFS); if (!strcasecmp(type, alias)) { strcpy(buf, APM_ENT_TYPE_APPLE_UFS); return (0); } alias = g_part_alias_name(G_PART_ALIAS_FREEBSD); if (!strcasecmp(type, alias)) { strcpy(buf, APM_ENT_TYPE_FREEBSD); return (0); } alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_NANDFS); if (!strcasecmp(type, alias)) { strcpy(buf, APM_ENT_TYPE_FREEBSD_NANDFS); return (0); } alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_SWAP); if (!strcasecmp(type, 
alias)) { strcpy(buf, APM_ENT_TYPE_FREEBSD_SWAP); return (0); } alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_UFS); if (!strcasecmp(type, alias)) { strcpy(buf, APM_ENT_TYPE_FREEBSD_UFS); return (0); } alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_VINUM); if (!strcasecmp(type, alias)) { strcpy(buf, APM_ENT_TYPE_FREEBSD_VINUM); return (0); } alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_ZFS); if (!strcasecmp(type, alias)) { strcpy(buf, APM_ENT_TYPE_FREEBSD_ZFS); return (0); } return (EINVAL); } static int apm_read_ent(struct g_consumer *cp, uint32_t blk, struct apm_ent *ent, int tivo_series1) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = g_read_data(cp, pp->sectorsize * blk, pp->sectorsize, &error); if (buf == NULL) return (error); if (tivo_series1) swab(buf, pp->sectorsize); ent->ent_sig = be16dec(buf); ent->ent_pmblkcnt = be32dec(buf + 4); ent->ent_start = be32dec(buf + 8); ent->ent_size = be32dec(buf + 12); bcopy(buf + 16, ent->ent_name, sizeof(ent->ent_name)); bcopy(buf + 48, ent->ent_type, sizeof(ent->ent_type)); g_free(buf); return (0); } static int g_part_apm_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_apm_entry *entry; struct g_part_apm_table *table; int error; entry = (struct g_part_apm_entry *)baseentry; table = (struct g_part_apm_table *)basetable; entry->ent.ent_sig = APM_ENT_SIG; entry->ent.ent_pmblkcnt = table->self.ent_pmblkcnt; entry->ent.ent_start = gpp->gpp_start; entry->ent.ent_size = gpp->gpp_size; if (baseentry->gpe_deleted) { bzero(entry->ent.ent_type, sizeof(entry->ent.ent_type)); bzero(entry->ent.ent_name, sizeof(entry->ent.ent_name)); } error = apm_parse_type(gpp->gpp_type, entry->ent.ent_type, sizeof(entry->ent.ent_type)); if (error) return (error); if (gpp->gpp_parms & G_PART_PARM_LABEL) { if (strlen(gpp->gpp_label) > sizeof(entry->ent.ent_name)) return (EINVAL); strncpy(entry->ent.ent_name, gpp->gpp_label, sizeof(entry->ent.ent_name)); } if (baseentry->gpe_index >= table->self.ent_pmblkcnt) table->self.ent_pmblkcnt = baseentry->gpe_index + 1; KASSERT(table->self.ent_size >= table->self.ent_pmblkcnt, ("%s", __func__)); KASSERT(table->self.ent_size > baseentry->gpe_index, ("%s", __func__)); return (0); } static int g_part_apm_create(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_provider *pp; struct g_part_apm_table *table; uint32_t last; /* We don't nest, which means that our depth should be 0. */ if (basetable->gpt_depth != 0) return (ENXIO); table = (struct g_part_apm_table *)basetable; pp = gpp->gpp_provider; if (pp->sectorsize != 512 || pp->mediasize < (2 + 2 * basetable->gpt_entries) * pp->sectorsize) return (ENOSPC); /* APM uses 32-bit LBAs. */ last = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX) - 1; basetable->gpt_first = 2 + basetable->gpt_entries; basetable->gpt_last = last; table->ddr.ddr_sig = APM_DDR_SIG; table->ddr.ddr_blksize = pp->sectorsize; table->ddr.ddr_blkcount = last + 1; table->self.ent_sig = APM_ENT_SIG; table->self.ent_pmblkcnt = basetable->gpt_entries + 1; table->self.ent_start = 1; table->self.ent_size = table->self.ent_pmblkcnt; strcpy(table->self.ent_name, "Apple"); strcpy(table->self.ent_type, APM_ENT_TYPE_SELF); return (0); } static int g_part_apm_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { /* Wipe the first 2 sectors to clear the partitioning. 
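* gpt_smhead is a bitmap over the first 32 sectors of the disk; the commit verb zeroes every sector whose bit is set. Writing 3 (bits 0 and 1) therefore schedules sector 0, which holds the DDR, and sector 1, which holds the first partition map entry, for erasure.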
*/ basetable->gpt_smhead |= 3; return (0); } static void g_part_apm_dumpconf(struct g_part_table *table, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { union { char name[APM_ENT_NAMELEN + 1]; char type[APM_ENT_TYPELEN + 1]; } u; struct g_part_apm_entry *entry; entry = (struct g_part_apm_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs APPLE xt %s", entry->ent.ent_type); } else if (entry != NULL) { /* confxml: partition entry information */ strncpy(u.name, entry->ent.ent_name, APM_ENT_NAMELEN); u.name[APM_ENT_NAMELEN] = '\0'; sbuf_printf(sb, "%s<label>", indent); g_conf_printf_escaped(sb, "%s", u.name); sbuf_printf(sb, "</label>\n"); strncpy(u.type, entry->ent.ent_type, APM_ENT_TYPELEN); u.type[APM_ENT_TYPELEN] = '\0'; sbuf_printf(sb, "%s<rawtype>", indent); g_conf_printf_escaped(sb, "%s", u.type); sbuf_printf(sb, "</rawtype>\n"); } else { /* confxml: scheme information */ } } static int g_part_apm_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { struct g_part_apm_entry *entry; entry = (struct g_part_apm_entry *)baseentry; return ((!strcmp(entry->ent.ent_type, APM_ENT_TYPE_FREEBSD_SWAP)) ? 1 : 0); } static int g_part_apm_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_apm_entry *entry; int error; entry = (struct g_part_apm_entry *)baseentry; if (gpp->gpp_parms & G_PART_PARM_LABEL) { if (strlen(gpp->gpp_label) > sizeof(entry->ent.ent_name)) return (EINVAL); } if (gpp->gpp_parms & G_PART_PARM_TYPE) { error = apm_parse_type(gpp->gpp_type, entry->ent.ent_type, sizeof(entry->ent.ent_type)); if (error) return (error); } if (gpp->gpp_parms & G_PART_PARM_LABEL) { strncpy(entry->ent.ent_name, gpp->gpp_label, sizeof(entry->ent.ent_name)); } return (0); } static int g_part_apm_resize(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_apm_entry *entry; struct g_provider *pp; if (baseentry == NULL) { pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; basetable->gpt_last = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX) - 1; return (0); } entry = (struct g_part_apm_entry *)baseentry; baseentry->gpe_end = baseentry->gpe_start + gpp->gpp_size - 1; entry->ent.ent_size = gpp->gpp_size; return (0); } static const char * g_part_apm_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "s%d", baseentry->gpe_index + 1); return (buf); } static int g_part_apm_probe(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; struct g_part_apm_table *table; char *buf; int error; /* We don't nest, which means that our depth should be 0. */ if (basetable->gpt_depth != 0) return (ENXIO); table = (struct g_part_apm_table *)basetable; table->tivo_series1 = 0; pp = cp->provider; /* Sanity-check the provider. */ if (pp->mediasize < 4 * pp->sectorsize) return (ENOSPC); /* Check that there's a Driver Descriptor Record (DDR). */ buf = g_read_data(cp, 0L, pp->sectorsize, &error); if (buf == NULL) return (error); if (be16dec(buf) == APM_DDR_SIG) { /* Normal Apple DDR */ table->ddr.ddr_sig = be16dec(buf); table->ddr.ddr_blksize = be16dec(buf + 2); table->ddr.ddr_blkcount = be32dec(buf + 4); g_free(buf); if (table->ddr.ddr_blksize != pp->sectorsize) return (ENXIO); if (table->ddr.ddr_blkcount > pp->mediasize / pp->sectorsize) return (ENXIO); } else { /* * Check for Tivo drives, which have no DDR and a different * signature. Those whose first two bytes are 14 92 are * Series 2 drives, and aren't supported.
Those that start * with 92 14 are series 1 drives and are supported. */ if (be16dec(buf) != 0x9214) { /* If this is 0x1492 it could be a series 2 drive */ g_free(buf); return (ENXIO); } table->ddr.ddr_sig = APM_DDR_SIG; /* XXX */ table->ddr.ddr_blksize = pp->sectorsize; /* XXX */ table->ddr.ddr_blkcount = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); table->tivo_series1 = 1; g_free(buf); } /* Check that there's a Partition Map. */ error = apm_read_ent(cp, 1, &table->self, table->tivo_series1); if (error) return (error); if (table->self.ent_sig != APM_ENT_SIG) return (ENXIO); if (strcmp(table->self.ent_type, APM_ENT_TYPE_SELF)) return (ENXIO); if (table->self.ent_pmblkcnt >= table->ddr.ddr_blkcount) return (ENXIO); return (G_PART_PROBE_PRI_NORM); } static int g_part_apm_read(struct g_part_table *basetable, struct g_consumer *cp) { struct apm_ent ent; struct g_part_apm_entry *entry; struct g_part_apm_table *table; int error, index; table = (struct g_part_apm_table *)basetable; basetable->gpt_first = table->self.ent_size + 1; basetable->gpt_last = table->ddr.ddr_blkcount - 1; basetable->gpt_entries = table->self.ent_size - 1; for (index = table->self.ent_pmblkcnt - 1; index > 0; index--) { error = apm_read_ent(cp, index + 1, &ent, table->tivo_series1); if (error) continue; if (!strcmp(ent.ent_type, APM_ENT_TYPE_UNUSED)) continue; entry = (struct g_part_apm_entry *)g_part_new_entry(basetable, index, ent.ent_start, ent.ent_start + ent.ent_size - 1); entry->ent = ent; } return (0); } static const char * g_part_apm_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_apm_entry *entry; const char *type; size_t len; entry = (struct g_part_apm_entry *)baseentry; type = entry->ent.ent_type; if (!strcmp(type, APM_ENT_TYPE_APPLE_BOOT)) return (g_part_alias_name(G_PART_ALIAS_APPLE_BOOT)); if (!strcmp(type, APM_ENT_TYPE_APPLE_HFS)) return (g_part_alias_name(G_PART_ALIAS_APPLE_HFS)); if (!strcmp(type, APM_ENT_TYPE_APPLE_UFS)) return (g_part_alias_name(G_PART_ALIAS_APPLE_UFS)); if (!strcmp(type, APM_ENT_TYPE_FREEBSD)) return (g_part_alias_name(G_PART_ALIAS_FREEBSD)); if (!strcmp(type, APM_ENT_TYPE_FREEBSD_NANDFS)) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_NANDFS)); if (!strcmp(type, APM_ENT_TYPE_FREEBSD_SWAP)) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_SWAP)); if (!strcmp(type, APM_ENT_TYPE_FREEBSD_UFS)) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_UFS)); if (!strcmp(type, APM_ENT_TYPE_FREEBSD_VINUM)) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_VINUM)); if (!strcmp(type, APM_ENT_TYPE_FREEBSD_ZFS)) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_ZFS)); buf[0] = '!'; len = MIN(sizeof(entry->ent.ent_type), bufsz - 2); bcopy(type, buf + 1, len); buf[len + 1] = '\0'; return (buf); } static int g_part_apm_write(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; struct g_part_entry *baseentry; struct g_part_apm_entry *entry; struct g_part_apm_table *table; char *buf, *ptr; uint32_t index; int error; size_t tblsz; pp = cp->provider; table = (struct g_part_apm_table *)basetable; /* * Tivo Series 1 disk partitions are currently read-only. */ if (table->tivo_series1) return (EOPNOTSUPP); /* Write the DDR only when we're newly created. 
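* The DDR occupies block 0 and, as the encoding below shows, is big-endian: bytes 0-1 carry the signature, bytes 2-3 the block size and bytes 4-7 the block count, with the rest of the sector left zeroed. None of that needs refreshing on later commits, which is why only a freshly created table writes it out.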
*/ if (basetable->gpt_created) { buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); be16enc(buf, table->ddr.ddr_sig); be16enc(buf + 2, table->ddr.ddr_blksize); be32enc(buf + 4, table->ddr.ddr_blkcount); error = g_write_data(cp, 0, buf, pp->sectorsize); g_free(buf); if (error) return (error); } /* Allocate the buffer for all entries */ tblsz = table->self.ent_pmblkcnt; buf = g_malloc(tblsz * pp->sectorsize, M_WAITOK | M_ZERO); /* Fill the self entry */ be16enc(buf, APM_ENT_SIG); be32enc(buf + 4, table->self.ent_pmblkcnt); be32enc(buf + 8, table->self.ent_start); be32enc(buf + 12, table->self.ent_size); bcopy(table->self.ent_name, buf + 16, sizeof(table->self.ent_name)); bcopy(table->self.ent_type, buf + 48, sizeof(table->self.ent_type)); baseentry = LIST_FIRST(&basetable->gpt_entry); for (index = 1; index < tblsz; index++) { entry = (baseentry != NULL && index == baseentry->gpe_index) ? (struct g_part_apm_entry *)baseentry : NULL; ptr = buf + index * pp->sectorsize; be16enc(ptr, APM_ENT_SIG); be32enc(ptr + 4, table->self.ent_pmblkcnt); if (entry != NULL && !baseentry->gpe_deleted) { be32enc(ptr + 8, entry->ent.ent_start); be32enc(ptr + 12, entry->ent.ent_size); bcopy(entry->ent.ent_name, ptr + 16, sizeof(entry->ent.ent_name)); bcopy(entry->ent.ent_type, ptr + 48, sizeof(entry->ent.ent_type)); } else { strcpy(ptr + 48, APM_ENT_TYPE_UNUSED); } if (entry != NULL) baseentry = LIST_NEXT(baseentry, gpe_entry); } for (index = 0; index < tblsz; index += MAXPHYS / pp->sectorsize) { error = g_write_data(cp, (1 + index) * pp->sectorsize, buf + index * pp->sectorsize, (tblsz - index > MAXPHYS / pp->sectorsize) ? MAXPHYS: (tblsz - index) * pp->sectorsize); if (error) { g_free(buf); return (error); } } g_free(buf); return (0); } Index: head/sys/geom/part/g_part_bsd.c =================================================================== --- head/sys/geom/part/g_part_bsd.c (revision 326269) +++ head/sys/geom/part/g_part_bsd.c (revision 326270) @@ -1,539 +1,541 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" #define BOOT1_SIZE 512 #define LABEL_SIZE 512 #define BOOT2_OFF (BOOT1_SIZE + LABEL_SIZE) #define BOOT2_SIZE (BBSIZE - BOOT2_OFF) FEATURE(geom_part_bsd, "GEOM partitioning class for BSD disklabels"); struct g_part_bsd_table { struct g_part_table base; u_char *bbarea; uint32_t offset; }; struct g_part_bsd_entry { struct g_part_entry base; struct partition part; }; static int g_part_bsd_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_bsd_bootcode(struct g_part_table *, struct g_part_parms *); static int g_part_bsd_create(struct g_part_table *, struct g_part_parms *); static int g_part_bsd_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_bsd_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_bsd_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_bsd_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_bsd_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_bsd_probe(struct g_part_table *, struct g_consumer *); static int g_part_bsd_read(struct g_part_table *, struct g_consumer *); static const char *g_part_bsd_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_bsd_write(struct g_part_table *, struct g_consumer *); static int g_part_bsd_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static kobj_method_t g_part_bsd_methods[] = { KOBJMETHOD(g_part_add, g_part_bsd_add), KOBJMETHOD(g_part_bootcode, g_part_bsd_bootcode), KOBJMETHOD(g_part_create, g_part_bsd_create), KOBJMETHOD(g_part_destroy, g_part_bsd_destroy), KOBJMETHOD(g_part_dumpconf, g_part_bsd_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_bsd_dumpto), KOBJMETHOD(g_part_modify, g_part_bsd_modify), KOBJMETHOD(g_part_resize, g_part_bsd_resize), KOBJMETHOD(g_part_name, g_part_bsd_name), KOBJMETHOD(g_part_probe, g_part_bsd_probe), KOBJMETHOD(g_part_read, g_part_bsd_read), KOBJMETHOD(g_part_type, g_part_bsd_type), KOBJMETHOD(g_part_write, g_part_bsd_write), { 0, 0 } }; static struct g_part_scheme g_part_bsd_scheme = { "BSD", g_part_bsd_methods, sizeof(struct g_part_bsd_table), .gps_entrysz = sizeof(struct g_part_bsd_entry), .gps_minent = 8, .gps_maxent = 20, /* Only 22 entries fit in 512 byte sectors */ .gps_bootcodesz = BBSIZE, }; G_PART_SCHEME_DECLARE(g_part_bsd); static struct g_part_bsd_alias { uint8_t type; int alias; } bsd_alias_match[] = { { FS_BSDFFS, G_PART_ALIAS_FREEBSD_UFS }, { FS_SWAP, G_PART_ALIAS_FREEBSD_SWAP }, { FS_ZFS, G_PART_ALIAS_FREEBSD_ZFS }, { FS_VINUM, G_PART_ALIAS_FREEBSD_VINUM }, { FS_NANDFS, G_PART_ALIAS_FREEBSD_NANDFS }, { FS_HAMMER, G_PART_ALIAS_DFBSD_HAMMER }, { FS_HAMMER2, G_PART_ALIAS_DFBSD_HAMMER2 }, }; static int bsd_parse_type(const char *type, uint8_t *fstype) { const char *alias; char *endp; long lt; int i; if (type[0] == '!') { lt = strtol(type + 1, &endp, 0); if (type[1] == '\0' || *endp != '\0' || lt <= 0 || lt >= 256) return (EINVAL); *fstype = (u_int)lt; return (0); } for (i = 0; i < nitems(bsd_alias_match); i++) { alias = g_part_alias_name(bsd_alias_match[i].alias); if (strcasecmp(type, alias) == 0) { *fstype = bsd_alias_match[i].type; return (0); } } return (EINVAL); } static int g_part_bsd_add(struct g_part_table 
*basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd_entry *entry; struct g_part_bsd_table *table; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_bsd_entry *)baseentry; table = (struct g_part_bsd_table *)basetable; entry->part.p_size = gpp->gpp_size; entry->part.p_offset = gpp->gpp_start + table->offset; entry->part.p_fsize = 0; entry->part.p_frag = 0; entry->part.p_cpg = 0; return (bsd_parse_type(gpp->gpp_type, &entry->part.p_fstype)); } static int g_part_bsd_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_bsd_table *table; const u_char *codeptr; if (gpp->gpp_codesize != BOOT1_SIZE && gpp->gpp_codesize != BBSIZE) return (ENODEV); table = (struct g_part_bsd_table *)basetable; codeptr = gpp->gpp_codeptr; bcopy(codeptr, table->bbarea, BOOT1_SIZE); if (gpp->gpp_codesize == BBSIZE) bcopy(codeptr + BOOT2_OFF, table->bbarea + BOOT2_OFF, BOOT2_SIZE); return (0); } static int g_part_bsd_create(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_provider *pp; struct g_part_entry *baseentry; struct g_part_bsd_entry *entry; struct g_part_bsd_table *table; u_char *ptr; uint32_t msize, ncyls, secpercyl; pp = gpp->gpp_provider; if (pp->sectorsize < sizeof(struct disklabel)) return (ENOSPC); if (BBSIZE % pp->sectorsize) return (ENOTBLK); msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); secpercyl = basetable->gpt_sectors * basetable->gpt_heads; ncyls = msize / secpercyl; table = (struct g_part_bsd_table *)basetable; table->bbarea = g_malloc(BBSIZE, M_WAITOK | M_ZERO); ptr = table->bbarea + pp->sectorsize; le32enc(ptr + 0, DISKMAGIC); /* d_magic */ le32enc(ptr + 40, pp->sectorsize); /* d_secsize */ le32enc(ptr + 44, basetable->gpt_sectors); /* d_nsectors */ le32enc(ptr + 48, basetable->gpt_heads); /* d_ntracks */ le32enc(ptr + 52, ncyls); /* d_ncylinders */ le32enc(ptr + 56, secpercyl); /* d_secpercyl */ le32enc(ptr + 60, msize); /* d_secperunit */ le16enc(ptr + 72, 3600); /* d_rpm */ le32enc(ptr + 132, DISKMAGIC); /* d_magic2 */ le16enc(ptr + 138, basetable->gpt_entries); /* d_npartitions */ le32enc(ptr + 140, BBSIZE); /* d_bbsize */ basetable->gpt_first = 0; basetable->gpt_last = msize - 1; basetable->gpt_isleaf = 1; baseentry = g_part_new_entry(basetable, RAW_PART + 1, basetable->gpt_first, basetable->gpt_last); baseentry->gpe_internal = 1; entry = (struct g_part_bsd_entry *)baseentry; entry->part.p_size = basetable->gpt_last + 1; entry->part.p_offset = table->offset; return (0); } static int g_part_bsd_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_bsd_table *table; table = (struct g_part_bsd_table *)basetable; if (table->bbarea != NULL) g_free(table->bbarea); table->bbarea = NULL; /* Wipe the second sector to clear the partitioning. 
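* Rather than writing zeroes here, we set bit 1 in gpt_smhead: a
* bitmap, kept by the common g_part code, of sectors counted from the
* start of the medium that are scrubbed when the change is committed
* (gpt_smtail is the same thing counted from the end of the medium).
* The disklabel lives in sector 1, hence the mask of 2.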
*/ basetable->gpt_smhead |= 2; return (0); } static void g_part_bsd_dumpconf(struct g_part_table *table, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_bsd_entry *entry; entry = (struct g_part_bsd_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs BSD xt %u", entry->part.p_fstype); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s<rawtype>%u</rawtype>\n", indent, entry->part.p_fstype); } else { /* confxml: scheme information */ } } static int g_part_bsd_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { struct g_part_bsd_entry *entry; /* Allow dumping to a swap partition or an unused partition. */ entry = (struct g_part_bsd_entry *)baseentry; return ((entry->part.p_fstype == FS_UNUSED || entry->part.p_fstype == FS_SWAP) ? 1 : 0); } static int g_part_bsd_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd_entry *entry; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_bsd_entry *)baseentry; if (gpp->gpp_parms & G_PART_PARM_TYPE) return (bsd_parse_type(gpp->gpp_type, &entry->part.p_fstype)); return (0); } static void bsd_set_rawsize(struct g_part_table *basetable, struct g_provider *pp) { struct g_part_bsd_table *table; struct g_part_bsd_entry *entry; struct g_part_entry *baseentry; uint32_t msize; table = (struct g_part_bsd_table *)basetable; msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); le32enc(table->bbarea + pp->sectorsize + 60, msize); /* d_secperunit */ basetable->gpt_last = msize - 1; LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) { if (baseentry->gpe_index != RAW_PART + 1) continue; baseentry->gpe_end = basetable->gpt_last; entry = (struct g_part_bsd_entry *)baseentry; entry->part.p_size = msize; return; } } static int g_part_bsd_resize(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd_entry *entry; struct g_provider *pp; if (baseentry == NULL) { pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; bsd_set_rawsize(basetable, pp); return (0); } entry = (struct g_part_bsd_entry *)baseentry; baseentry->gpe_end = baseentry->gpe_start + gpp->gpp_size - 1; entry->part.p_size = gpp->gpp_size; return (0); } static const char * g_part_bsd_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "%c", 'a' + baseentry->gpe_index - 1); return (buf); } static int g_part_bsd_probe(struct g_part_table *table, struct g_consumer *cp) { struct g_provider *pp; u_char *buf; uint32_t magic1, magic2; int error; pp = cp->provider; /* Sanity-check the provider. */ if (pp->sectorsize < sizeof(struct disklabel) || pp->mediasize < BBSIZE) return (ENOSPC); if (BBSIZE % pp->sectorsize) return (ENOTBLK); /* Check that there's a disklabel. */ buf = g_read_data(cp, pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) return (error); magic1 = le32dec(buf + 0); magic2 = le32dec(buf + 132); g_free(buf); return ((magic1 == DISKMAGIC && magic2 == DISKMAGIC) ?
G_PART_PROBE_PRI_HIGH : ENXIO); } static int g_part_bsd_read(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; struct g_part_bsd_table *table; struct g_part_entry *baseentry; struct g_part_bsd_entry *entry; struct partition part; u_char *buf, *p; off_t chs, msize; u_int sectors, heads; int error, index; pp = cp->provider; table = (struct g_part_bsd_table *)basetable; msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); table->bbarea = g_read_data(cp, 0, BBSIZE, &error); if (table->bbarea == NULL) return (error); buf = table->bbarea + pp->sectorsize; if (le32dec(buf + 40) != pp->sectorsize) goto invalid_label; sectors = le32dec(buf + 44); if (sectors < 1 || sectors > 255) goto invalid_label; if (sectors != basetable->gpt_sectors && !basetable->gpt_fixgeom) { g_part_geometry_heads(msize, sectors, &chs, &heads); if (chs != 0) { basetable->gpt_sectors = sectors; basetable->gpt_heads = heads; } } heads = le32dec(buf + 48); if (heads < 1 || heads > 255) goto invalid_label; if (heads != basetable->gpt_heads && !basetable->gpt_fixgeom) basetable->gpt_heads = heads; chs = le32dec(buf + 60); if (chs < 1) goto invalid_label; /* Fix-up a sysinstall bug. */ if (chs > msize) { chs = msize; le32enc(buf + 60, msize); } basetable->gpt_first = 0; basetable->gpt_last = msize - 1; basetable->gpt_isleaf = 1; basetable->gpt_entries = le16dec(buf + 138); if (basetable->gpt_entries < g_part_bsd_scheme.gps_minent || basetable->gpt_entries > g_part_bsd_scheme.gps_maxent) goto invalid_label; table->offset = le32dec(buf + 148 + RAW_PART * 16 + 4); for (index = basetable->gpt_entries - 1; index >= 0; index--) { p = buf + 148 + index * 16; part.p_size = le32dec(p + 0); part.p_offset = le32dec(p + 4); part.p_fsize = le32dec(p + 8); part.p_fstype = p[12]; part.p_frag = p[13]; part.p_cpg = le16dec(p + 14); if (part.p_size == 0) continue; if (part.p_offset < table->offset) continue; if (part.p_offset - table->offset > basetable->gpt_last) goto invalid_label; baseentry = g_part_new_entry(basetable, index + 1, part.p_offset - table->offset, part.p_offset - table->offset + part.p_size - 1); entry = (struct g_part_bsd_entry *)baseentry; entry->part = part; if (index == RAW_PART) baseentry->gpe_internal = 1; } return (0); invalid_label: printf("GEOM: %s: invalid disklabel.\n", pp->name); g_free(table->bbarea); table->bbarea = NULL; return (EINVAL); } static const char * g_part_bsd_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_bsd_entry *entry; int type; entry = (struct g_part_bsd_entry *)baseentry; type = entry->part.p_fstype; if (type == FS_NANDFS) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_NANDFS)); if (type == FS_SWAP) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_SWAP)); if (type == FS_BSDFFS) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_UFS)); if (type == FS_VINUM) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_VINUM)); if (type == FS_ZFS) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_ZFS)); snprintf(buf, bufsz, "!%d", type); return (buf); } static int g_part_bsd_write(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; struct g_part_entry *baseentry; struct g_part_bsd_entry *entry; struct g_part_bsd_table *table; uint16_t sum; u_char *label, *p, *pe; int error, index; pp = cp->provider; table = (struct g_part_bsd_table *)basetable; baseentry = LIST_FIRST(&basetable->gpt_entry); label = table->bbarea + pp->sectorsize; for (index = 1; index <= basetable->gpt_entries; index++) { p = label + 
148 + (index - 1) * 16; entry = (baseentry != NULL && index == baseentry->gpe_index) ? (struct g_part_bsd_entry *)baseentry : NULL; if (entry != NULL && !baseentry->gpe_deleted) { le32enc(p + 0, entry->part.p_size); le32enc(p + 4, entry->part.p_offset); le32enc(p + 8, entry->part.p_fsize); p[12] = entry->part.p_fstype; p[13] = entry->part.p_frag; le16enc(p + 14, entry->part.p_cpg); } else bzero(p, 16); if (entry != NULL) baseentry = LIST_NEXT(baseentry, gpe_entry); } /* Calculate checksum. */ le16enc(label + 136, 0); pe = label + 148 + basetable->gpt_entries * 16; sum = 0; for (p = label; p < pe; p += 2) sum ^= le16dec(p); le16enc(label + 136, sum); error = g_write_data(cp, 0, table->bbarea, BBSIZE); return (error); } Index: head/sys/geom/part/g_part_ebr.c =================================================================== --- head/sys/geom/part/g_part_ebr.c (revision 326269) +++ head/sys/geom/part/g_part_ebr.c (revision 326270) @@ -1,694 +1,696 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007-2009 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "opt_geom.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_ebr, "GEOM partitioning class for extended boot records support"); #if defined(GEOM_PART_EBR_COMPAT) FEATURE(geom_part_ebr_compat, "GEOM EBR partitioning class: backward-compatible partition names"); #endif #define EBRSIZE 512 struct g_part_ebr_table { struct g_part_table base; #ifndef GEOM_PART_EBR_COMPAT u_char ebr[EBRSIZE]; #endif }; struct g_part_ebr_entry { struct g_part_entry base; struct dos_partition ent; }; static int g_part_ebr_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_ebr_create(struct g_part_table *, struct g_part_parms *); static int g_part_ebr_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_ebr_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_ebr_dumpto(struct g_part_table *, struct g_part_entry *); #if defined(GEOM_PART_EBR_COMPAT) static void g_part_ebr_fullname(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); #endif static int g_part_ebr_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_ebr_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ebr_precheck(struct g_part_table *, enum g_part_ctl, struct g_part_parms *); static int g_part_ebr_probe(struct g_part_table *, struct g_consumer *); static int g_part_ebr_read(struct g_part_table *, struct g_consumer *); static int g_part_ebr_setunset(struct g_part_table *, struct g_part_entry *, const char *, unsigned int); static const char *g_part_ebr_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ebr_write(struct g_part_table *, struct g_consumer *); static int g_part_ebr_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static kobj_method_t g_part_ebr_methods[] = { KOBJMETHOD(g_part_add, g_part_ebr_add), KOBJMETHOD(g_part_create, g_part_ebr_create), KOBJMETHOD(g_part_destroy, g_part_ebr_destroy), KOBJMETHOD(g_part_dumpconf, g_part_ebr_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_ebr_dumpto), #if defined(GEOM_PART_EBR_COMPAT) KOBJMETHOD(g_part_fullname, g_part_ebr_fullname), #endif KOBJMETHOD(g_part_modify, g_part_ebr_modify), KOBJMETHOD(g_part_name, g_part_ebr_name), KOBJMETHOD(g_part_precheck, g_part_ebr_precheck), KOBJMETHOD(g_part_probe, g_part_ebr_probe), KOBJMETHOD(g_part_read, g_part_ebr_read), KOBJMETHOD(g_part_resize, g_part_ebr_resize), KOBJMETHOD(g_part_setunset, g_part_ebr_setunset), KOBJMETHOD(g_part_type, g_part_ebr_type), KOBJMETHOD(g_part_write, g_part_ebr_write), { 0, 0 } }; static struct g_part_scheme g_part_ebr_scheme = { "EBR", g_part_ebr_methods, sizeof(struct g_part_ebr_table), .gps_entrysz = sizeof(struct g_part_ebr_entry), .gps_minent = 1, .gps_maxent = INT_MAX, }; G_PART_SCHEME_DECLARE(g_part_ebr); static struct g_part_ebr_alias { u_char typ; int alias; } ebr_alias_match[] = { { DOSPTYP_386BSD, G_PART_ALIAS_FREEBSD }, { DOSPTYP_NTFS, G_PART_ALIAS_MS_NTFS }, { DOSPTYP_FAT32, G_PART_ALIAS_MS_FAT32 }, { DOSPTYP_LINSWP, G_PART_ALIAS_LINUX_SWAP }, { DOSPTYP_LINUX, G_PART_ALIAS_LINUX_DATA }, { DOSPTYP_LINLVM, G_PART_ALIAS_LINUX_LVM }, { DOSPTYP_LINRAID, G_PART_ALIAS_LINUX_RAID }, }; static void ebr_set_chs(struct g_part_table *, uint32_t, u_char *, u_char *, u_char *); 
static void ebr_entry_decode(const char *p, struct dos_partition *ent) { ent->dp_flag = p[0]; ent->dp_shd = p[1]; ent->dp_ssect = p[2]; ent->dp_scyl = p[3]; ent->dp_typ = p[4]; ent->dp_ehd = p[5]; ent->dp_esect = p[6]; ent->dp_ecyl = p[7]; ent->dp_start = le32dec(p + 8); ent->dp_size = le32dec(p + 12); } static void ebr_entry_link(struct g_part_table *table, uint32_t start, uint32_t end, u_char *buf) { buf[0] = 0 /* dp_flag */; ebr_set_chs(table, start, &buf[3] /* dp_scyl */, &buf[1] /* dp_shd */, &buf[2] /* dp_ssect */); buf[4] = 5 /* dp_typ */; ebr_set_chs(table, end, &buf[7] /* dp_ecyl */, &buf[5] /* dp_ehd */, &buf[6] /* dp_esect */); le32enc(buf + 8, start); le32enc(buf + 12, end - start + 1); } static int ebr_parse_type(const char *type, u_char *dp_typ) { const char *alias; char *endp; long lt; int i; if (type[0] == '!') { lt = strtol(type + 1, &endp, 0); if (type[1] == '\0' || *endp != '\0' || lt <= 0 || lt >= 256) return (EINVAL); *dp_typ = (u_char)lt; return (0); } for (i = 0; i < nitems(ebr_alias_match); i++) { alias = g_part_alias_name(ebr_alias_match[i].alias); if (strcasecmp(type, alias) == 0) { *dp_typ = ebr_alias_match[i].typ; return (0); } } return (EINVAL); } static void ebr_set_chs(struct g_part_table *table, uint32_t lba, u_char *cylp, u_char *hdp, u_char *secp) { uint32_t cyl, hd, sec; sec = lba % table->gpt_sectors + 1; lba /= table->gpt_sectors; hd = lba % table->gpt_heads; lba /= table->gpt_heads; cyl = lba; if (cyl > 1023) sec = hd = cyl = ~0; *cylp = cyl & 0xff; *hdp = hd & 0xff; *secp = (sec & 0x3f) | ((cyl >> 2) & 0xc0); } static int ebr_align(struct g_part_table *basetable, uint32_t *start, uint32_t *size) { uint32_t sectors; sectors = basetable->gpt_sectors; if (*size < 2 * sectors) return (EINVAL); if (*start % sectors) { *size += (*start % sectors) - sectors; *start -= (*start % sectors) - sectors; } if (*size % sectors) *size -= (*size % sectors); if (*size < 2 * sectors) return (EINVAL); return (0); } static int g_part_ebr_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_provider *pp; struct g_part_ebr_entry *entry; uint32_t start, size; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; entry = (struct g_part_ebr_entry *)baseentry; start = gpp->gpp_start; size = gpp->gpp_size; if (ebr_align(basetable, &start, &size) != 0) return (EINVAL); if (baseentry->gpe_deleted) bzero(&entry->ent, sizeof(entry->ent)); KASSERT(baseentry->gpe_start <= start, ("%s", __func__)); KASSERT(baseentry->gpe_end >= start + size - 1, ("%s", __func__)); baseentry->gpe_index = (start / basetable->gpt_sectors) + 1; baseentry->gpe_offset = (off_t)(start + basetable->gpt_sectors) * pp->sectorsize; baseentry->gpe_start = start; baseentry->gpe_end = start + size - 1; entry->ent.dp_start = basetable->gpt_sectors; entry->ent.dp_size = size - basetable->gpt_sectors; ebr_set_chs(basetable, entry->ent.dp_start, &entry->ent.dp_scyl, &entry->ent.dp_shd, &entry->ent.dp_ssect); ebr_set_chs(basetable, baseentry->gpe_end, &entry->ent.dp_ecyl, &entry->ent.dp_ehd, &entry->ent.dp_esect); return (ebr_parse_type(gpp->gpp_type, &entry->ent.dp_typ)); } static int g_part_ebr_create(struct g_part_table *basetable, struct g_part_parms *gpp) { char type[64]; struct g_consumer *cp; struct g_provider *pp; uint32_t msize; int error; pp = gpp->gpp_provider; if (pp->sectorsize < EBRSIZE) return (ENOSPC); if (pp->sectorsize > 4096) return (ENXIO); /* Check that we have a parent and that it's a MBR. 
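* Both checks go through GEOM attributes on our consumer: the parent's
* "PART::scheme" must be "MBR", and the "PART::type" of the slice we
* live in must map to the alias "ebr" (the extended slice types).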
*/ if (basetable->gpt_depth == 0) return (ENXIO); cp = LIST_FIRST(&pp->consumers); error = g_getattr("PART::scheme", cp, &type); if (error != 0) return (error); if (strcmp(type, "MBR") != 0) return (ENXIO); error = g_getattr("PART::type", cp, &type); if (error != 0) return (error); if (strcmp(type, "ebr") != 0) return (ENXIO); msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); basetable->gpt_first = 0; basetable->gpt_last = msize - 1; basetable->gpt_entries = msize / basetable->gpt_sectors; return (0); } static int g_part_ebr_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { /* Wipe the first sector to clear the partitioning. */ basetable->gpt_smhead |= 1; return (0); } static void g_part_ebr_dumpconf(struct g_part_table *table, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_ebr_entry *entry; entry = (struct g_part_ebr_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs MBREXT xt %u", entry->ent.dp_typ); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s<rawtype>%u</rawtype>\n", indent, entry->ent.dp_typ); if (entry->ent.dp_flag & 0x80) sbuf_printf(sb, "%s<attrib>active</attrib>\n", indent); } else { /* confxml: scheme information */ } } static int g_part_ebr_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { struct g_part_ebr_entry *entry; /* Allow dumping to a FreeBSD partition or Linux swap partition only. */ entry = (struct g_part_ebr_entry *)baseentry; return ((entry->ent.dp_typ == DOSPTYP_386BSD || entry->ent.dp_typ == DOSPTYP_LINSWP) ? 1 : 0); } #if defined(GEOM_PART_EBR_COMPAT) static void g_part_ebr_fullname(struct g_part_table *table, struct g_part_entry *entry, struct sbuf *sb, const char *pfx) { struct g_part_entry *iter; u_int idx; idx = 5; LIST_FOREACH(iter, &table->gpt_entry, gpe_entry) { if (iter == entry) break; idx++; } sbuf_printf(sb, "%.*s%u", (int)strlen(pfx) - 1, pfx, idx); } #endif static int g_part_ebr_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_ebr_entry *entry; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_ebr_entry *)baseentry; if (gpp->gpp_parms & G_PART_PARM_TYPE) return (ebr_parse_type(gpp->gpp_type, &entry->ent.dp_typ)); return (0); } static int g_part_ebr_resize(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_provider *pp; if (baseentry != NULL) return (EOPNOTSUPP); pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; basetable->gpt_last = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX) - 1; return (0); } static const char * g_part_ebr_name(struct g_part_table *table, struct g_part_entry *entry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "+%08u", entry->gpe_index); return (buf); } static int g_part_ebr_precheck(struct g_part_table *table, enum g_part_ctl req, struct g_part_parms *gpp) { #if defined(GEOM_PART_EBR_COMPAT) if (req == G_PART_CTL_DESTROY) return (0); return (ECANCELED); #else /* * The index is a function of the start of the partition. * This is not something the user can override, nor is it * something the common code will do right. We can set the * index now so that we get what we need.
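* Example: with the usual 63 sectors per track, the chunk whose EBR
* sits at relative LBA 16002 gets index 16002 / 63 + 1 = 255, and the
* chunk at LBA 0 is always index 1.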
*/ if (req == G_PART_CTL_ADD) gpp->gpp_index = (gpp->gpp_start / table->gpt_sectors) + 1; return (0); #endif } static int g_part_ebr_probe(struct g_part_table *table, struct g_consumer *cp) { char type[64]; struct g_provider *pp; u_char *buf, *p; int error, index, res; uint16_t magic; pp = cp->provider; /* Sanity-check the provider. */ if (pp->sectorsize < EBRSIZE || pp->mediasize < pp->sectorsize) return (ENOSPC); if (pp->sectorsize > 4096) return (ENXIO); /* Check that we have a parent and that it's a MBR. */ if (table->gpt_depth == 0) return (ENXIO); error = g_getattr("PART::scheme", cp, &type); if (error != 0) return (error); if (strcmp(type, "MBR") != 0) return (ENXIO); /* Check that partition has type DOSPTYP_EBR. */ error = g_getattr("PART::type", cp, &type); if (error != 0) return (error); if (strcmp(type, "ebr") != 0) return (ENXIO); /* Check that there's a EBR. */ buf = g_read_data(cp, 0L, pp->sectorsize, &error); if (buf == NULL) return (error); /* We goto out on mismatch. */ res = ENXIO; magic = le16dec(buf + DOSMAGICOFFSET); if (magic != DOSMAGIC) goto out; for (index = 0; index < 2; index++) { p = buf + DOSPARTOFF + index * DOSPARTSIZE; if (p[0] != 0 && p[0] != 0x80) goto out; } res = G_PART_PROBE_PRI_NORM; out: g_free(buf); return (res); } static int g_part_ebr_read(struct g_part_table *basetable, struct g_consumer *cp) { struct dos_partition ent[2]; struct g_provider *pp; struct g_part_entry *baseentry; struct g_part_ebr_table *table; struct g_part_ebr_entry *entry; u_char *buf; off_t ofs, msize; u_int lba; int error, index; pp = cp->provider; table = (struct g_part_ebr_table *)basetable; msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); lba = 0; while (1) { ofs = (off_t)lba * pp->sectorsize; buf = g_read_data(cp, ofs, pp->sectorsize, &error); if (buf == NULL) return (error); ebr_entry_decode(buf + DOSPARTOFF + 0 * DOSPARTSIZE, ent + 0); ebr_entry_decode(buf + DOSPARTOFF + 1 * DOSPARTSIZE, ent + 1); /* The 3rd & 4th entries should be zeroes. */ if (le64dec(buf + DOSPARTOFF + 2 * DOSPARTSIZE) + le64dec(buf + DOSPARTOFF + 3 * DOSPARTSIZE) != 0) { basetable->gpt_corrupt = 1; printf("GEOM: %s: invalid entries in the EBR ignored.\n", pp->name); } #ifndef GEOM_PART_EBR_COMPAT /* Save the first EBR, it can contain a boot code */ if (lba == 0) bcopy(buf, table->ebr, sizeof(table->ebr)); #endif g_free(buf); if (ent[0].dp_typ == 0) break; if (ent[0].dp_typ == 5 && ent[1].dp_typ == 0) { lba = ent[0].dp_start; continue; } index = (lba / basetable->gpt_sectors) + 1; baseentry = (struct g_part_entry *)g_part_new_entry(basetable, index, lba, lba + ent[0].dp_start + ent[0].dp_size - 1); baseentry->gpe_offset = (off_t)(lba + ent[0].dp_start) * pp->sectorsize; entry = (struct g_part_ebr_entry *)baseentry; entry->ent = ent[0]; if (ent[1].dp_typ == 0) break; lba = ent[1].dp_start; } basetable->gpt_entries = msize / basetable->gpt_sectors; basetable->gpt_first = 0; basetable->gpt_last = msize - 1; return (0); } static int g_part_ebr_setunset(struct g_part_table *table, struct g_part_entry *baseentry, const char *attrib, unsigned int set) { struct g_part_entry *iter; struct g_part_ebr_entry *entry; int changed; if (baseentry == NULL) return (ENODEV); if (strcasecmp(attrib, "active") != 0) return (EINVAL); /* Only one entry can have the active attribute. 
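* Setting it on one entry therefore clears the 0x80 flag on every
* other live entry in the same pass, and any entry whose flag really
* changed is marked modified so it is rewritten on commit.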
*/ LIST_FOREACH(iter, &table->gpt_entry, gpe_entry) { if (iter->gpe_deleted) continue; changed = 0; entry = (struct g_part_ebr_entry *)iter; if (iter == baseentry) { if (set && (entry->ent.dp_flag & 0x80) == 0) { entry->ent.dp_flag |= 0x80; changed = 1; } else if (!set && (entry->ent.dp_flag & 0x80)) { entry->ent.dp_flag &= ~0x80; changed = 1; } } else { if (set && (entry->ent.dp_flag & 0x80)) { entry->ent.dp_flag &= ~0x80; changed = 1; } } if (changed && !iter->gpe_created) iter->gpe_modified = 1; } return (0); } static const char * g_part_ebr_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_ebr_entry *entry; int i; entry = (struct g_part_ebr_entry *)baseentry; for (i = 0; i < nitems(ebr_alias_match); i++) { if (ebr_alias_match[i].typ == entry->ent.dp_typ) return (g_part_alias_name(ebr_alias_match[i].alias)); } snprintf(buf, bufsz, "!%d", entry->ent.dp_typ); return (buf); } static int g_part_ebr_write(struct g_part_table *basetable, struct g_consumer *cp) { #ifndef GEOM_PART_EBR_COMPAT struct g_part_ebr_table *table; #endif struct g_provider *pp; struct g_part_entry *baseentry, *next; struct g_part_ebr_entry *entry; u_char *buf; u_char *p; int error; pp = cp->provider; buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); #ifndef GEOM_PART_EBR_COMPAT table = (struct g_part_ebr_table *)basetable; bcopy(table->ebr, buf, DOSPARTOFF); #endif le16enc(buf + DOSMAGICOFFSET, DOSMAGIC); baseentry = LIST_FIRST(&basetable->gpt_entry); while (baseentry != NULL && baseentry->gpe_deleted) baseentry = LIST_NEXT(baseentry, gpe_entry); /* Wipe-out the first EBR when there are no slices. */ if (baseentry == NULL) { error = g_write_data(cp, 0, buf, pp->sectorsize); goto out; } /* * If the first partition is not in LBA 0, we need to * put a "link" EBR in LBA 0. */ if (baseentry->gpe_start != 0) { ebr_entry_link(basetable, (uint32_t)baseentry->gpe_start, (uint32_t)baseentry->gpe_end, buf + DOSPARTOFF); error = g_write_data(cp, 0, buf, pp->sectorsize); if (error) goto out; } do { entry = (struct g_part_ebr_entry *)baseentry; p = buf + DOSPARTOFF; p[0] = entry->ent.dp_flag; p[1] = entry->ent.dp_shd; p[2] = entry->ent.dp_ssect; p[3] = entry->ent.dp_scyl; p[4] = entry->ent.dp_typ; p[5] = entry->ent.dp_ehd; p[6] = entry->ent.dp_esect; p[7] = entry->ent.dp_ecyl; le32enc(p + 8, entry->ent.dp_start); le32enc(p + 12, entry->ent.dp_size); next = LIST_NEXT(baseentry, gpe_entry); while (next != NULL && next->gpe_deleted) next = LIST_NEXT(next, gpe_entry); p += DOSPARTSIZE; if (next != NULL) ebr_entry_link(basetable, (uint32_t)next->gpe_start, (uint32_t)next->gpe_end, p); else bzero(p, DOSPARTSIZE); error = g_write_data(cp, baseentry->gpe_start * pp->sectorsize, buf, pp->sectorsize); #ifndef GEOM_PART_EBR_COMPAT if (baseentry->gpe_start == 0) bzero(buf, DOSPARTOFF); #endif baseentry = next; } while (!error && baseentry != NULL); out: g_free(buf); return (error); } Index: head/sys/geom/part/g_part_gpt.c =================================================================== --- head/sys/geom/part/g_part_gpt.c (revision 326269) +++ head/sys/geom/part/g_part_gpt.c (revision 326270) @@ -1,1398 +1,1400 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002, 2005-2007, 2011 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_gpt, "GEOM partitioning class for GPT partitions support"); CTASSERT(offsetof(struct gpt_hdr, padding) == 92); CTASSERT(sizeof(struct gpt_ent) == 128); #define EQUUID(a,b) (memcmp(a, b, sizeof(struct uuid)) == 0) #define MBRSIZE 512 enum gpt_elt { GPT_ELT_PRIHDR, GPT_ELT_PRITBL, GPT_ELT_SECHDR, GPT_ELT_SECTBL, GPT_ELT_COUNT }; enum gpt_state { GPT_STATE_UNKNOWN, /* Not determined. */ GPT_STATE_MISSING, /* No signature found. */ GPT_STATE_CORRUPT, /* Checksum mismatch. */ GPT_STATE_INVALID, /* Nonconformant/invalid. */ GPT_STATE_OK /* Perfectly fine. 
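* The header/table readers below promote an element through these
* states in order: it stays MISSING until a signature is found,
* CORRUPT until the self-CRC verifies, INVALID until the header
* fields are internally consistent, and becomes OK only once every
* check has passed.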
*/ }; struct g_part_gpt_table { struct g_part_table base; u_char mbr[MBRSIZE]; struct gpt_hdr *hdr; quad_t lba[GPT_ELT_COUNT]; enum gpt_state state[GPT_ELT_COUNT]; int bootcamp; }; struct g_part_gpt_entry { struct g_part_entry base; struct gpt_ent ent; }; static void g_gpt_printf_utf16(struct sbuf *, uint16_t *, size_t); static void g_gpt_utf8_to_utf16(const uint8_t *, uint16_t *, size_t); static void g_gpt_set_defaults(struct g_part_table *, struct g_provider *); static int g_part_gpt_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_gpt_bootcode(struct g_part_table *, struct g_part_parms *); static int g_part_gpt_create(struct g_part_table *, struct g_part_parms *); static int g_part_gpt_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_gpt_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_gpt_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_gpt_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_gpt_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_gpt_probe(struct g_part_table *, struct g_consumer *); static int g_part_gpt_read(struct g_part_table *, struct g_consumer *); static int g_part_gpt_setunset(struct g_part_table *table, struct g_part_entry *baseentry, const char *attrib, unsigned int set); static const char *g_part_gpt_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_gpt_write(struct g_part_table *, struct g_consumer *); static int g_part_gpt_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_gpt_recover(struct g_part_table *); static kobj_method_t g_part_gpt_methods[] = { KOBJMETHOD(g_part_add, g_part_gpt_add), KOBJMETHOD(g_part_bootcode, g_part_gpt_bootcode), KOBJMETHOD(g_part_create, g_part_gpt_create), KOBJMETHOD(g_part_destroy, g_part_gpt_destroy), KOBJMETHOD(g_part_dumpconf, g_part_gpt_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_gpt_dumpto), KOBJMETHOD(g_part_modify, g_part_gpt_modify), KOBJMETHOD(g_part_resize, g_part_gpt_resize), KOBJMETHOD(g_part_name, g_part_gpt_name), KOBJMETHOD(g_part_probe, g_part_gpt_probe), KOBJMETHOD(g_part_read, g_part_gpt_read), KOBJMETHOD(g_part_recover, g_part_gpt_recover), KOBJMETHOD(g_part_setunset, g_part_gpt_setunset), KOBJMETHOD(g_part_type, g_part_gpt_type), KOBJMETHOD(g_part_write, g_part_gpt_write), { 0, 0 } }; static struct g_part_scheme g_part_gpt_scheme = { "GPT", g_part_gpt_methods, sizeof(struct g_part_gpt_table), .gps_entrysz = sizeof(struct g_part_gpt_entry), .gps_minent = 128, .gps_maxent = 4096, .gps_bootcodesz = MBRSIZE, }; G_PART_SCHEME_DECLARE(g_part_gpt); static struct uuid gpt_uuid_apple_boot = GPT_ENT_TYPE_APPLE_BOOT; static struct uuid gpt_uuid_apple_core_storage = GPT_ENT_TYPE_APPLE_CORE_STORAGE; static struct uuid gpt_uuid_apple_hfs = GPT_ENT_TYPE_APPLE_HFS; static struct uuid gpt_uuid_apple_label = GPT_ENT_TYPE_APPLE_LABEL; static struct uuid gpt_uuid_apple_raid = GPT_ENT_TYPE_APPLE_RAID; static struct uuid gpt_uuid_apple_raid_offline = GPT_ENT_TYPE_APPLE_RAID_OFFLINE; static struct uuid gpt_uuid_apple_tv_recovery = GPT_ENT_TYPE_APPLE_TV_RECOVERY; static struct uuid gpt_uuid_apple_ufs = GPT_ENT_TYPE_APPLE_UFS; static struct uuid gpt_uuid_bios_boot = GPT_ENT_TYPE_BIOS_BOOT; static struct uuid gpt_uuid_chromeos_firmware = GPT_ENT_TYPE_CHROMEOS_FIRMWARE; static struct uuid gpt_uuid_chromeos_kernel = 
GPT_ENT_TYPE_CHROMEOS_KERNEL; static struct uuid gpt_uuid_chromeos_reserved = GPT_ENT_TYPE_CHROMEOS_RESERVED; static struct uuid gpt_uuid_chromeos_root = GPT_ENT_TYPE_CHROMEOS_ROOT; static struct uuid gpt_uuid_dfbsd_ccd = GPT_ENT_TYPE_DRAGONFLY_CCD; static struct uuid gpt_uuid_dfbsd_hammer = GPT_ENT_TYPE_DRAGONFLY_HAMMER; static struct uuid gpt_uuid_dfbsd_hammer2 = GPT_ENT_TYPE_DRAGONFLY_HAMMER2; static struct uuid gpt_uuid_dfbsd_label32 = GPT_ENT_TYPE_DRAGONFLY_LABEL32; static struct uuid gpt_uuid_dfbsd_label64 = GPT_ENT_TYPE_DRAGONFLY_LABEL64; static struct uuid gpt_uuid_dfbsd_legacy = GPT_ENT_TYPE_DRAGONFLY_LEGACY; static struct uuid gpt_uuid_dfbsd_swap = GPT_ENT_TYPE_DRAGONFLY_SWAP; static struct uuid gpt_uuid_dfbsd_ufs1 = GPT_ENT_TYPE_DRAGONFLY_UFS1; static struct uuid gpt_uuid_dfbsd_vinum = GPT_ENT_TYPE_DRAGONFLY_VINUM; static struct uuid gpt_uuid_efi = GPT_ENT_TYPE_EFI; static struct uuid gpt_uuid_freebsd = GPT_ENT_TYPE_FREEBSD; static struct uuid gpt_uuid_freebsd_boot = GPT_ENT_TYPE_FREEBSD_BOOT; static struct uuid gpt_uuid_freebsd_nandfs = GPT_ENT_TYPE_FREEBSD_NANDFS; static struct uuid gpt_uuid_freebsd_swap = GPT_ENT_TYPE_FREEBSD_SWAP; static struct uuid gpt_uuid_freebsd_ufs = GPT_ENT_TYPE_FREEBSD_UFS; static struct uuid gpt_uuid_freebsd_vinum = GPT_ENT_TYPE_FREEBSD_VINUM; static struct uuid gpt_uuid_freebsd_zfs = GPT_ENT_TYPE_FREEBSD_ZFS; static struct uuid gpt_uuid_linux_data = GPT_ENT_TYPE_LINUX_DATA; static struct uuid gpt_uuid_linux_lvm = GPT_ENT_TYPE_LINUX_LVM; static struct uuid gpt_uuid_linux_raid = GPT_ENT_TYPE_LINUX_RAID; static struct uuid gpt_uuid_linux_swap = GPT_ENT_TYPE_LINUX_SWAP; static struct uuid gpt_uuid_mbr = GPT_ENT_TYPE_MBR; static struct uuid gpt_uuid_ms_basic_data = GPT_ENT_TYPE_MS_BASIC_DATA; static struct uuid gpt_uuid_ms_ldm_data = GPT_ENT_TYPE_MS_LDM_DATA; static struct uuid gpt_uuid_ms_ldm_metadata = GPT_ENT_TYPE_MS_LDM_METADATA; static struct uuid gpt_uuid_ms_recovery = GPT_ENT_TYPE_MS_RECOVERY; static struct uuid gpt_uuid_ms_reserved = GPT_ENT_TYPE_MS_RESERVED; static struct uuid gpt_uuid_ms_spaces = GPT_ENT_TYPE_MS_SPACES; static struct uuid gpt_uuid_netbsd_ccd = GPT_ENT_TYPE_NETBSD_CCD; static struct uuid gpt_uuid_netbsd_cgd = GPT_ENT_TYPE_NETBSD_CGD; static struct uuid gpt_uuid_netbsd_ffs = GPT_ENT_TYPE_NETBSD_FFS; static struct uuid gpt_uuid_netbsd_lfs = GPT_ENT_TYPE_NETBSD_LFS; static struct uuid gpt_uuid_netbsd_raid = GPT_ENT_TYPE_NETBSD_RAID; static struct uuid gpt_uuid_netbsd_swap = GPT_ENT_TYPE_NETBSD_SWAP; static struct uuid gpt_uuid_openbsd_data = GPT_ENT_TYPE_OPENBSD_DATA; static struct uuid gpt_uuid_prep_boot = GPT_ENT_TYPE_PREP_BOOT; static struct uuid gpt_uuid_unused = GPT_ENT_TYPE_UNUSED; static struct uuid gpt_uuid_vmfs = GPT_ENT_TYPE_VMFS; static struct uuid gpt_uuid_vmkdiag = GPT_ENT_TYPE_VMKDIAG; static struct uuid gpt_uuid_vmreserved = GPT_ENT_TYPE_VMRESERVED; static struct uuid gpt_uuid_vmvsanhdr = GPT_ENT_TYPE_VMVSANHDR; static struct g_part_uuid_alias { struct uuid *uuid; int alias; int mbrtype; } gpt_uuid_alias_match[] = { { &gpt_uuid_apple_boot, G_PART_ALIAS_APPLE_BOOT, 0xab }, { &gpt_uuid_apple_core_storage, G_PART_ALIAS_APPLE_CORE_STORAGE, 0 }, { &gpt_uuid_apple_hfs, G_PART_ALIAS_APPLE_HFS, 0xaf }, { &gpt_uuid_apple_label, G_PART_ALIAS_APPLE_LABEL, 0 }, { &gpt_uuid_apple_raid, G_PART_ALIAS_APPLE_RAID, 0 }, { &gpt_uuid_apple_raid_offline, G_PART_ALIAS_APPLE_RAID_OFFLINE, 0 }, { &gpt_uuid_apple_tv_recovery, G_PART_ALIAS_APPLE_TV_RECOVERY, 0 }, { &gpt_uuid_apple_ufs, G_PART_ALIAS_APPLE_UFS, 0 }, { &gpt_uuid_bios_boot, 
G_PART_ALIAS_BIOS_BOOT, 0 }, { &gpt_uuid_chromeos_firmware, G_PART_ALIAS_CHROMEOS_FIRMWARE, 0 }, { &gpt_uuid_chromeos_kernel, G_PART_ALIAS_CHROMEOS_KERNEL, 0 }, { &gpt_uuid_chromeos_reserved, G_PART_ALIAS_CHROMEOS_RESERVED, 0 }, { &gpt_uuid_chromeos_root, G_PART_ALIAS_CHROMEOS_ROOT, 0 }, { &gpt_uuid_dfbsd_ccd, G_PART_ALIAS_DFBSD_CCD, 0 }, { &gpt_uuid_dfbsd_hammer, G_PART_ALIAS_DFBSD_HAMMER, 0 }, { &gpt_uuid_dfbsd_hammer2, G_PART_ALIAS_DFBSD_HAMMER2, 0 }, { &gpt_uuid_dfbsd_label32, G_PART_ALIAS_DFBSD, 0xa5 }, { &gpt_uuid_dfbsd_label64, G_PART_ALIAS_DFBSD64, 0xa5 }, { &gpt_uuid_dfbsd_legacy, G_PART_ALIAS_DFBSD_LEGACY, 0 }, { &gpt_uuid_dfbsd_swap, G_PART_ALIAS_DFBSD_SWAP, 0 }, { &gpt_uuid_dfbsd_ufs1, G_PART_ALIAS_DFBSD_UFS, 0 }, { &gpt_uuid_dfbsd_vinum, G_PART_ALIAS_DFBSD_VINUM, 0 }, { &gpt_uuid_efi, G_PART_ALIAS_EFI, 0xee }, { &gpt_uuid_freebsd, G_PART_ALIAS_FREEBSD, 0xa5 }, { &gpt_uuid_freebsd_boot, G_PART_ALIAS_FREEBSD_BOOT, 0 }, { &gpt_uuid_freebsd_nandfs, G_PART_ALIAS_FREEBSD_NANDFS, 0 }, { &gpt_uuid_freebsd_swap, G_PART_ALIAS_FREEBSD_SWAP, 0 }, { &gpt_uuid_freebsd_ufs, G_PART_ALIAS_FREEBSD_UFS, 0 }, { &gpt_uuid_freebsd_vinum, G_PART_ALIAS_FREEBSD_VINUM, 0 }, { &gpt_uuid_freebsd_zfs, G_PART_ALIAS_FREEBSD_ZFS, 0 }, { &gpt_uuid_linux_data, G_PART_ALIAS_LINUX_DATA, 0x0b }, { &gpt_uuid_linux_lvm, G_PART_ALIAS_LINUX_LVM, 0 }, { &gpt_uuid_linux_raid, G_PART_ALIAS_LINUX_RAID, 0 }, { &gpt_uuid_linux_swap, G_PART_ALIAS_LINUX_SWAP, 0 }, { &gpt_uuid_mbr, G_PART_ALIAS_MBR, 0 }, { &gpt_uuid_ms_basic_data, G_PART_ALIAS_MS_BASIC_DATA, 0x0b }, { &gpt_uuid_ms_ldm_data, G_PART_ALIAS_MS_LDM_DATA, 0 }, { &gpt_uuid_ms_ldm_metadata, G_PART_ALIAS_MS_LDM_METADATA, 0 }, { &gpt_uuid_ms_recovery, G_PART_ALIAS_MS_RECOVERY, 0 }, { &gpt_uuid_ms_reserved, G_PART_ALIAS_MS_RESERVED, 0 }, { &gpt_uuid_ms_spaces, G_PART_ALIAS_MS_SPACES, 0 }, { &gpt_uuid_netbsd_ccd, G_PART_ALIAS_NETBSD_CCD, 0 }, { &gpt_uuid_netbsd_cgd, G_PART_ALIAS_NETBSD_CGD, 0 }, { &gpt_uuid_netbsd_ffs, G_PART_ALIAS_NETBSD_FFS, 0 }, { &gpt_uuid_netbsd_lfs, G_PART_ALIAS_NETBSD_LFS, 0 }, { &gpt_uuid_netbsd_raid, G_PART_ALIAS_NETBSD_RAID, 0 }, { &gpt_uuid_netbsd_swap, G_PART_ALIAS_NETBSD_SWAP, 0 }, { &gpt_uuid_openbsd_data, G_PART_ALIAS_OPENBSD_DATA, 0 }, { &gpt_uuid_prep_boot, G_PART_ALIAS_PREP_BOOT, 0x41 }, { &gpt_uuid_vmfs, G_PART_ALIAS_VMFS, 0 }, { &gpt_uuid_vmkdiag, G_PART_ALIAS_VMKDIAG, 0 }, { &gpt_uuid_vmreserved, G_PART_ALIAS_VMRESERVED, 0 }, { &gpt_uuid_vmvsanhdr, G_PART_ALIAS_VMVSANHDR, 0 }, { NULL, 0, 0 } }; static int gpt_write_mbr_entry(u_char *mbr, int idx, int typ, quad_t start, quad_t end) { if (typ == 0 || start > UINT32_MAX || end > UINT32_MAX) return (EINVAL); mbr += DOSPARTOFF + idx * DOSPARTSIZE; mbr[0] = 0; if (start == 1) { /* * Treat the PMBR partition specially to maximize * interoperability with BIOSes. 
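* A protective entry starting at LBA 1 is given the fixed CHS start
* address 0/0/2 (cylinder 0, head 0, sector 2, i.e. the sector right
* after the MBR itself), while the end address saturates at 0xff as
* usual.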
*/ mbr[1] = mbr[3] = 0; mbr[2] = 2; } else mbr[1] = mbr[2] = mbr[3] = 0xff; mbr[4] = typ; mbr[5] = mbr[6] = mbr[7] = 0xff; le32enc(mbr + 8, (uint32_t)start); le32enc(mbr + 12, (uint32_t)(end - start + 1)); return (0); } static int gpt_map_type(struct uuid *t) { struct g_part_uuid_alias *uap; for (uap = &gpt_uuid_alias_match[0]; uap->uuid; uap++) { if (EQUUID(t, uap->uuid)) return (uap->mbrtype); } return (0); } static void gpt_create_pmbr(struct g_part_gpt_table *table, struct g_provider *pp) { bzero(table->mbr + DOSPARTOFF, DOSPARTSIZE * NDOSPART); gpt_write_mbr_entry(table->mbr, 0, 0xee, 1, MIN(pp->mediasize / pp->sectorsize - 1, UINT32_MAX)); le16enc(table->mbr + DOSMAGICOFFSET, DOSMAGIC); } /* * Under Boot Camp the PMBR partition (type 0xEE) doesn't cover the * whole disk anymore. Rather, it covers the GPT table and the EFI * system partition only. This way the HFS+ partition and any FAT * partitions can be added to the MBR without creating an overlap. */ static int gpt_is_bootcamp(struct g_part_gpt_table *table, const char *provname) { uint8_t *p; p = table->mbr + DOSPARTOFF; if (p[4] != 0xee || le32dec(p + 8) != 1) return (0); p += DOSPARTSIZE; if (p[4] != 0xaf) return (0); printf("GEOM: %s: enabling Boot Camp\n", provname); return (1); } static void gpt_update_bootcamp(struct g_part_table *basetable, struct g_provider *pp) { struct g_part_entry *baseentry; struct g_part_gpt_entry *entry; struct g_part_gpt_table *table; int bootable, error, index, slices, typ; table = (struct g_part_gpt_table *)basetable; bootable = -1; for (index = 0; index < NDOSPART; index++) { if (table->mbr[DOSPARTOFF + DOSPARTSIZE * index]) bootable = index; } bzero(table->mbr + DOSPARTOFF, DOSPARTSIZE * NDOSPART); slices = 0; LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) { if (baseentry->gpe_deleted) continue; index = baseentry->gpe_index - 1; if (index >= NDOSPART) continue; entry = (struct g_part_gpt_entry *)baseentry; switch (index) { case 0: /* This must be the EFI system partition. */ if (!EQUUID(&entry->ent.ent_type, &gpt_uuid_efi)) goto disable; error = gpt_write_mbr_entry(table->mbr, index, 0xee, 1ull, entry->ent.ent_lba_end); break; case 1: /* This must be the HFS+ partition. */ if (!EQUUID(&entry->ent.ent_type, &gpt_uuid_apple_hfs)) goto disable; error = gpt_write_mbr_entry(table->mbr, index, 0xaf, entry->ent.ent_lba_start, entry->ent.ent_lba_end); break; default: typ = gpt_map_type(&entry->ent.ent_type); error = gpt_write_mbr_entry(table->mbr, index, typ, entry->ent.ent_lba_start, entry->ent.ent_lba_end); break; } if (error) continue; if (index == bootable) table->mbr[DOSPARTOFF + DOSPARTSIZE * index] = 0x80; slices |= 1 << index; } if ((slices & 3) == 3) return; disable: table->bootcamp = 0; gpt_create_pmbr(table, pp); } static struct gpt_hdr * gpt_read_hdr(struct g_part_gpt_table *table, struct g_consumer *cp, enum gpt_elt elt) { struct gpt_hdr *buf, *hdr; struct g_provider *pp; quad_t lba, last; int error; uint32_t crc, sz; pp = cp->provider; last = (pp->mediasize / pp->sectorsize) - 1; table->state[elt] = GPT_STATE_MISSING; /* * If the primary header is valid look for secondary * header in AlternateLBA, otherwise in the last medium's LBA. 
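* (AlternateLBA is captured at the bottom of this function: once a
* primary header parses OK, its hdr_lba_alt is stored in
* table->lba[GPT_ELT_SECHDR], which the code below then leaves in
* place.)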
*/ if (elt == GPT_ELT_SECHDR) { if (table->state[GPT_ELT_PRIHDR] != GPT_STATE_OK) table->lba[elt] = last; } else table->lba[elt] = 1; buf = g_read_data(cp, table->lba[elt] * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) return (NULL); hdr = NULL; if (memcmp(buf->hdr_sig, GPT_HDR_SIG, sizeof(buf->hdr_sig)) != 0) goto fail; table->state[elt] = GPT_STATE_CORRUPT; sz = le32toh(buf->hdr_size); if (sz < 92 || sz > pp->sectorsize) goto fail; hdr = g_malloc(sz, M_WAITOK | M_ZERO); bcopy(buf, hdr, sz); hdr->hdr_size = sz; crc = le32toh(buf->hdr_crc_self); buf->hdr_crc_self = 0; if (crc32(buf, sz) != crc) goto fail; hdr->hdr_crc_self = crc; table->state[elt] = GPT_STATE_INVALID; hdr->hdr_revision = le32toh(buf->hdr_revision); if (hdr->hdr_revision < GPT_HDR_REVISION) goto fail; hdr->hdr_lba_self = le64toh(buf->hdr_lba_self); if (hdr->hdr_lba_self != table->lba[elt]) goto fail; hdr->hdr_lba_alt = le64toh(buf->hdr_lba_alt); if (hdr->hdr_lba_alt == hdr->hdr_lba_self || hdr->hdr_lba_alt > last) goto fail; /* Check the managed area. */ hdr->hdr_lba_start = le64toh(buf->hdr_lba_start); if (hdr->hdr_lba_start < 2 || hdr->hdr_lba_start >= last) goto fail; hdr->hdr_lba_end = le64toh(buf->hdr_lba_end); if (hdr->hdr_lba_end < hdr->hdr_lba_start || hdr->hdr_lba_end >= last) goto fail; /* Check the table location and size of the table. */ hdr->hdr_entries = le32toh(buf->hdr_entries); hdr->hdr_entsz = le32toh(buf->hdr_entsz); if (hdr->hdr_entries == 0 || hdr->hdr_entsz < 128 || (hdr->hdr_entsz & 7) != 0) goto fail; hdr->hdr_lba_table = le64toh(buf->hdr_lba_table); if (hdr->hdr_lba_table < 2 || hdr->hdr_lba_table >= last) goto fail; if (hdr->hdr_lba_table >= hdr->hdr_lba_start && hdr->hdr_lba_table <= hdr->hdr_lba_end) goto fail; lba = hdr->hdr_lba_table + howmany(hdr->hdr_entries * hdr->hdr_entsz, pp->sectorsize) - 1; if (lba >= last) goto fail; if (lba >= hdr->hdr_lba_start && lba <= hdr->hdr_lba_end) goto fail; table->state[elt] = GPT_STATE_OK; le_uuid_dec(&buf->hdr_uuid, &hdr->hdr_uuid); hdr->hdr_crc_table = le32toh(buf->hdr_crc_table); /* save LBA for secondary header */ if (elt == GPT_ELT_PRIHDR) table->lba[GPT_ELT_SECHDR] = hdr->hdr_lba_alt; g_free(buf); return (hdr); fail: if (hdr != NULL) g_free(hdr); g_free(buf); return (NULL); } static struct gpt_ent * gpt_read_tbl(struct g_part_gpt_table *table, struct g_consumer *cp, enum gpt_elt elt, struct gpt_hdr *hdr) { struct g_provider *pp; struct gpt_ent *ent, *tbl; char *buf, *p; unsigned int idx, sectors, tblsz, size; int error; if (hdr == NULL) return (NULL); pp = cp->provider; table->lba[elt] = hdr->hdr_lba_table; table->state[elt] = GPT_STATE_MISSING; tblsz = hdr->hdr_entries * hdr->hdr_entsz; sectors = howmany(tblsz, pp->sectorsize); buf = g_malloc(sectors * pp->sectorsize, M_WAITOK | M_ZERO); for (idx = 0; idx < sectors; idx += MAXPHYS / pp->sectorsize) { size = (sectors - idx > MAXPHYS / pp->sectorsize) ? 
MAXPHYS: (sectors - idx) * pp->sectorsize; p = g_read_data(cp, (table->lba[elt] + idx) * pp->sectorsize, size, &error); if (p == NULL) { g_free(buf); return (NULL); } bcopy(p, buf + idx * pp->sectorsize, size); g_free(p); } table->state[elt] = GPT_STATE_CORRUPT; if (crc32(buf, tblsz) != hdr->hdr_crc_table) { g_free(buf); return (NULL); } table->state[elt] = GPT_STATE_OK; tbl = g_malloc(hdr->hdr_entries * sizeof(struct gpt_ent), M_WAITOK | M_ZERO); for (idx = 0, ent = tbl, p = buf; idx < hdr->hdr_entries; idx++, ent++, p += hdr->hdr_entsz) { le_uuid_dec(p, &ent->ent_type); le_uuid_dec(p + 16, &ent->ent_uuid); ent->ent_lba_start = le64dec(p + 32); ent->ent_lba_end = le64dec(p + 40); ent->ent_attr = le64dec(p + 48); /* Keep UTF-16 in little-endian. */ bcopy(p + 56, ent->ent_name, sizeof(ent->ent_name)); } g_free(buf); return (tbl); } static int gpt_matched_hdrs(struct gpt_hdr *pri, struct gpt_hdr *sec) { if (pri == NULL || sec == NULL) return (0); if (!EQUUID(&pri->hdr_uuid, &sec->hdr_uuid)) return (0); return ((pri->hdr_revision == sec->hdr_revision && pri->hdr_size == sec->hdr_size && pri->hdr_lba_start == sec->hdr_lba_start && pri->hdr_lba_end == sec->hdr_lba_end && pri->hdr_entries == sec->hdr_entries && pri->hdr_entsz == sec->hdr_entsz && pri->hdr_crc_table == sec->hdr_crc_table) ? 1 : 0); } static int gpt_parse_type(const char *type, struct uuid *uuid) { struct uuid tmp; const char *alias; int error; struct g_part_uuid_alias *uap; if (type[0] == '!') { error = parse_uuid(type + 1, &tmp); if (error) return (error); if (EQUUID(&tmp, &gpt_uuid_unused)) return (EINVAL); *uuid = tmp; return (0); } for (uap = &gpt_uuid_alias_match[0]; uap->uuid; uap++) { alias = g_part_alias_name(uap->alias); if (!strcasecmp(type, alias)) { *uuid = *uap->uuid; return (0); } } return (EINVAL); } static int g_part_gpt_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_gpt_entry *entry; int error; entry = (struct g_part_gpt_entry *)baseentry; error = gpt_parse_type(gpp->gpp_type, &entry->ent.ent_type); if (error) return (error); kern_uuidgen(&entry->ent.ent_uuid, 1); entry->ent.ent_lba_start = baseentry->gpe_start; entry->ent.ent_lba_end = baseentry->gpe_end; if (baseentry->gpe_deleted) { entry->ent.ent_attr = 0; bzero(entry->ent.ent_name, sizeof(entry->ent.ent_name)); } if (gpp->gpp_parms & G_PART_PARM_LABEL) g_gpt_utf8_to_utf16(gpp->gpp_label, entry->ent.ent_name, sizeof(entry->ent.ent_name) / sizeof(entry->ent.ent_name[0])); return (0); } static int g_part_gpt_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_gpt_table *table; size_t codesz; codesz = DOSPARTOFF; table = (struct g_part_gpt_table *)basetable; bzero(table->mbr, codesz); codesz = MIN(codesz, gpp->gpp_codesize); if (codesz > 0) bcopy(gpp->gpp_codeptr, table->mbr, codesz); return (0); } static int g_part_gpt_create(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_provider *pp; struct g_part_gpt_table *table; size_t tblsz; /* We don't nest, which means that our depth should be 0. 
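* The media-size check below requires room for the PMBR (1 sector),
* two headers (1 sector each) and two copies of the entry table
* (tblsz sectors each), plus at least gpt_entries sectors of usable
* space in between; hence 3 + 2 * tblsz + gpt_entries.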
*/ if (basetable->gpt_depth != 0) return (ENXIO); table = (struct g_part_gpt_table *)basetable; pp = gpp->gpp_provider; tblsz = howmany(basetable->gpt_entries * sizeof(struct gpt_ent), pp->sectorsize); if (pp->sectorsize < MBRSIZE || pp->mediasize < (3 + 2 * tblsz + basetable->gpt_entries) * pp->sectorsize) return (ENOSPC); gpt_create_pmbr(table, pp); /* Allocate space for the header */ table->hdr = g_malloc(sizeof(struct gpt_hdr), M_WAITOK | M_ZERO); bcopy(GPT_HDR_SIG, table->hdr->hdr_sig, sizeof(table->hdr->hdr_sig)); table->hdr->hdr_revision = GPT_HDR_REVISION; table->hdr->hdr_size = offsetof(struct gpt_hdr, padding); kern_uuidgen(&table->hdr->hdr_uuid, 1); table->hdr->hdr_entries = basetable->gpt_entries; table->hdr->hdr_entsz = sizeof(struct gpt_ent); g_gpt_set_defaults(basetable, pp); return (0); } static int g_part_gpt_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_gpt_table *table; struct g_provider *pp; table = (struct g_part_gpt_table *)basetable; pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; g_free(table->hdr); table->hdr = NULL; /* * Wipe the first 2 sectors and last one to clear the partitioning. * Wipe sectors only if they have valid metadata. */ if (table->state[GPT_ELT_PRIHDR] == GPT_STATE_OK) basetable->gpt_smhead |= 3; if (table->state[GPT_ELT_SECHDR] == GPT_STATE_OK && table->lba[GPT_ELT_SECHDR] == pp->mediasize / pp->sectorsize - 1) basetable->gpt_smtail |= 1; return (0); } static void g_part_gpt_dumpconf(struct g_part_table *table, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_gpt_entry *entry; entry = (struct g_part_gpt_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs GPT xt "); sbuf_printf_uuid(sb, &entry->ent.ent_type); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s<label>", indent); g_gpt_printf_utf16(sb, entry->ent.ent_name, sizeof(entry->ent.ent_name) >> 1); sbuf_printf(sb, "</label>\n"); if (entry->ent.ent_attr & GPT_ENT_ATTR_BOOTME) sbuf_printf(sb, "%s<attrib>bootme</attrib>\n", indent); if (entry->ent.ent_attr & GPT_ENT_ATTR_BOOTONCE) { sbuf_printf(sb, "%s<attrib>bootonce</attrib>\n", indent); } if (entry->ent.ent_attr & GPT_ENT_ATTR_BOOTFAILED) { sbuf_printf(sb, "%s<attrib>bootfailed</attrib>\n", indent); } sbuf_printf(sb, "%s<rawtype>", indent); sbuf_printf_uuid(sb, &entry->ent.ent_type); sbuf_printf(sb, "</rawtype>\n"); sbuf_printf(sb, "%s<rawuuid>", indent); sbuf_printf_uuid(sb, &entry->ent.ent_uuid); sbuf_printf(sb, "</rawuuid>\n"); sbuf_printf(sb, "%s<efimedia>", indent); sbuf_printf(sb, "HD(%d,GPT,", entry->base.gpe_index); sbuf_printf_uuid(sb, &entry->ent.ent_uuid); sbuf_printf(sb, ",%#jx,%#jx)", (intmax_t)entry->base.gpe_start, (intmax_t)(entry->base.gpe_end - entry->base.gpe_start + 1)); sbuf_printf(sb, "</efimedia>\n"); } else { /* confxml: scheme information */ } } static int g_part_gpt_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { struct g_part_gpt_entry *entry; entry = (struct g_part_gpt_entry *)baseentry; return ((EQUUID(&entry->ent.ent_type, &gpt_uuid_freebsd_swap) || EQUUID(&entry->ent.ent_type, &gpt_uuid_linux_swap) || EQUUID(&entry->ent.ent_type, &gpt_uuid_dfbsd_swap)) ?
1 : 0); } static int g_part_gpt_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_gpt_entry *entry; int error; entry = (struct g_part_gpt_entry *)baseentry; if (gpp->gpp_parms & G_PART_PARM_TYPE) { error = gpt_parse_type(gpp->gpp_type, &entry->ent.ent_type); if (error) return (error); } if (gpp->gpp_parms & G_PART_PARM_LABEL) g_gpt_utf8_to_utf16(gpp->gpp_label, entry->ent.ent_name, sizeof(entry->ent.ent_name) / sizeof(entry->ent.ent_name[0])); return (0); } static int g_part_gpt_resize(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_gpt_entry *entry; if (baseentry == NULL) return (g_part_gpt_recover(basetable)); entry = (struct g_part_gpt_entry *)baseentry; baseentry->gpe_end = baseentry->gpe_start + gpp->gpp_size - 1; entry->ent.ent_lba_end = baseentry->gpe_end; return (0); } static const char * g_part_gpt_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_gpt_entry *entry; char c; entry = (struct g_part_gpt_entry *)baseentry; c = (EQUUID(&entry->ent.ent_type, &gpt_uuid_freebsd)) ? 's' : 'p'; snprintf(buf, bufsz, "%c%d", c, baseentry->gpe_index); return (buf); } static int g_part_gpt_probe(struct g_part_table *table, struct g_consumer *cp) { struct g_provider *pp; u_char *buf; int error, index, pri, res; /* We don't nest, which means that our depth should be 0. */ if (table->gpt_depth != 0) return (ENXIO); pp = cp->provider; /* * Sanity-check the provider. Since the first sector on the provider * must be a PMBR and a PMBR is 512 bytes large, the sector size * must be at least 512 bytes. Also, since the theoretical minimum * number of sectors needed by GPT is 6, any medium that has less * than 6 sectors is never going to be able to hold a GPT. The * number 6 comes from: * 1 sector for the PMBR * 2 sectors for the GPT headers (each 1 sector) * 2 sectors for the GPT tables (each 1 sector) * 1 sector for an actual partition * It's better to catch this pathological case early than behaving * pathologically later on... */ if (pp->sectorsize < MBRSIZE || pp->mediasize < 6 * pp->sectorsize) return (ENOSPC); /* * Check that there's a MBR or a PMBR. If it's a PMBR, we return * as the highest priority on a match, otherwise we assume some * GPT-unaware tool has destroyed the GPT by recreating a MBR and * we really want the MBR scheme to take precedence. */ buf = g_read_data(cp, 0L, pp->sectorsize, &error); if (buf == NULL) return (error); res = le16dec(buf + DOSMAGICOFFSET); pri = G_PART_PROBE_PRI_LOW; if (res == DOSMAGIC) { for (index = 0; index < NDOSPART; index++) { if (buf[DOSPARTOFF + DOSPARTSIZE * index + 4] == 0xee) pri = G_PART_PROBE_PRI_HIGH; } g_free(buf); /* Check that there's a primary header. */ buf = g_read_data(cp, pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) return (error); res = memcmp(buf, GPT_HDR_SIG, 8); g_free(buf); if (res == 0) return (pri); } else g_free(buf); /* No primary? Check that there's a secondary. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) return (error); res = memcmp(buf, GPT_HDR_SIG, 8); g_free(buf); return ((res == 0) ? 
pri : ENXIO); } static int g_part_gpt_read(struct g_part_table *basetable, struct g_consumer *cp) { struct gpt_hdr *prihdr, *sechdr; struct gpt_ent *tbl, *pritbl, *sectbl; struct g_provider *pp; struct g_part_gpt_table *table; struct g_part_gpt_entry *entry; u_char *buf; uint64_t last; int error, index; table = (struct g_part_gpt_table *)basetable; pp = cp->provider; last = (pp->mediasize / pp->sectorsize) - 1; /* Read the PMBR */ buf = g_read_data(cp, 0, pp->sectorsize, &error); if (buf == NULL) return (error); bcopy(buf, table->mbr, MBRSIZE); g_free(buf); /* Read the primary header and table. */ prihdr = gpt_read_hdr(table, cp, GPT_ELT_PRIHDR); if (table->state[GPT_ELT_PRIHDR] == GPT_STATE_OK) { pritbl = gpt_read_tbl(table, cp, GPT_ELT_PRITBL, prihdr); } else { table->state[GPT_ELT_PRITBL] = GPT_STATE_MISSING; pritbl = NULL; } /* Read the secondary header and table. */ sechdr = gpt_read_hdr(table, cp, GPT_ELT_SECHDR); if (table->state[GPT_ELT_SECHDR] == GPT_STATE_OK) { sectbl = gpt_read_tbl(table, cp, GPT_ELT_SECTBL, sechdr); } else { table->state[GPT_ELT_SECTBL] = GPT_STATE_MISSING; sectbl = NULL; } /* Fail if we haven't got any good tables at all. */ if (table->state[GPT_ELT_PRITBL] != GPT_STATE_OK && table->state[GPT_ELT_SECTBL] != GPT_STATE_OK) { printf("GEOM: %s: corrupt or invalid GPT detected.\n", pp->name); printf("GEOM: %s: GPT rejected -- may not be recoverable.\n", pp->name); return (EINVAL); } /* * If both headers are good but they disagree with each other, * then invalidate one. We prefer to keep the primary header, * unless the primary table is corrupt. */ if (table->state[GPT_ELT_PRIHDR] == GPT_STATE_OK && table->state[GPT_ELT_SECHDR] == GPT_STATE_OK && !gpt_matched_hdrs(prihdr, sechdr)) { if (table->state[GPT_ELT_PRITBL] == GPT_STATE_OK) { table->state[GPT_ELT_SECHDR] = GPT_STATE_INVALID; table->state[GPT_ELT_SECTBL] = GPT_STATE_MISSING; g_free(sechdr); sechdr = NULL; } else { table->state[GPT_ELT_PRIHDR] = GPT_STATE_INVALID; table->state[GPT_ELT_PRITBL] = GPT_STATE_MISSING; g_free(prihdr); prihdr = NULL; } } if (table->state[GPT_ELT_PRITBL] != GPT_STATE_OK) { printf("GEOM: %s: the primary GPT table is corrupt or " "invalid.\n", pp->name); printf("GEOM: %s: using the secondary instead -- recovery " "strongly advised.\n", pp->name); table->hdr = sechdr; basetable->gpt_corrupt = 1; if (prihdr != NULL) g_free(prihdr); tbl = sectbl; if (pritbl != NULL) g_free(pritbl); } else { if (table->state[GPT_ELT_SECTBL] != GPT_STATE_OK) { printf("GEOM: %s: the secondary GPT table is corrupt " "or invalid.\n", pp->name); printf("GEOM: %s: using the primary only -- recovery " "suggested.\n", pp->name); basetable->gpt_corrupt = 1; } else if (table->lba[GPT_ELT_SECHDR] != last) { printf( "GEOM: %s: the secondary GPT header is not in " "the last LBA.\n", pp->name); basetable->gpt_corrupt = 1; } table->hdr = prihdr; if (sechdr != NULL) g_free(sechdr); tbl = pritbl; if (sectbl != NULL) g_free(sectbl); } basetable->gpt_first = table->hdr->hdr_lba_start; basetable->gpt_last = table->hdr->hdr_lba_end; basetable->gpt_entries = (table->hdr->hdr_lba_start - 2) * pp->sectorsize / table->hdr->hdr_entsz; for (index = table->hdr->hdr_entries - 1; index >= 0; index--) { if (EQUUID(&tbl[index].ent_type, &gpt_uuid_unused)) continue; entry = (struct g_part_gpt_entry *)g_part_new_entry( basetable, index + 1, tbl[index].ent_lba_start, tbl[index].ent_lba_end); entry->ent = tbl[index]; } g_free(tbl); /* * Under Mac OS X, the MBR mirrors the first 4 GPT partitions * if (and only if) any FAT32 or FAT16 partitions 
have been * created. This happens irrespective of whether Boot Camp is * used/enabled, though it's generally understood to be done * to support legacy Windows under Boot Camp. We refer to this * mirroring simply as Boot Camp. We try to detect Boot Camp * so that we can update the MBR if and when GPT changes have * been made. Note that we do not enable Boot Camp if not * previously enabled because we can't assume that we're on a * Mac alongside Mac OS X. */ table->bootcamp = gpt_is_bootcamp(table, pp->name); return (0); } static int g_part_gpt_recover(struct g_part_table *basetable) { struct g_part_gpt_table *table; struct g_provider *pp; table = (struct g_part_gpt_table *)basetable; pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; gpt_create_pmbr(table, pp); g_gpt_set_defaults(basetable, pp); basetable->gpt_corrupt = 0; return (0); } static int g_part_gpt_setunset(struct g_part_table *basetable, struct g_part_entry *baseentry, const char *attrib, unsigned int set) { struct g_part_gpt_entry *entry; struct g_part_gpt_table *table; struct g_provider *pp; uint8_t *p; uint64_t attr; int i; table = (struct g_part_gpt_table *)basetable; entry = (struct g_part_gpt_entry *)baseentry; if (strcasecmp(attrib, "active") == 0) { if (table->bootcamp) { /* The active flag must be set on a valid entry. */ if (entry == NULL) return (ENXIO); if (baseentry->gpe_index > NDOSPART) return (EINVAL); for (i = 0; i < NDOSPART; i++) { p = &table->mbr[DOSPARTOFF + i * DOSPARTSIZE]; p[0] = (i == baseentry->gpe_index - 1) ? ((set) ? 0x80 : 0) : 0; } } else { /* The PMBR is marked as active without an entry. */ if (entry != NULL) return (ENXIO); for (i = 0; i < NDOSPART; i++) { p = &table->mbr[DOSPARTOFF + i * DOSPARTSIZE]; p[0] = (p[4] == 0xee) ? ((set) ? 0x80 : 0) : 0; } } return (0); } else if (strcasecmp(attrib, "lenovofix") == 0) { /* * Write the 0xee GPT entry to slot #1 (2nd slot) in the pMBR. * This workaround allows Lenovo X220, T420, T520, etc to boot * from GPT Partitions in BIOS mode. */ if (entry != NULL) return (ENXIO); pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; bzero(table->mbr + DOSPARTOFF, DOSPARTSIZE * NDOSPART); gpt_write_mbr_entry(table->mbr, ((set) ? 1 : 0), 0xee, 1, MIN(pp->mediasize / pp->sectorsize - 1, UINT32_MAX)); return (0); } if (entry == NULL) return (ENODEV); attr = 0; if (strcasecmp(attrib, "bootme") == 0) { attr |= GPT_ENT_ATTR_BOOTME; } else if (strcasecmp(attrib, "bootonce") == 0) { attr |= GPT_ENT_ATTR_BOOTONCE; if (set) attr |= GPT_ENT_ATTR_BOOTME; } else if (strcasecmp(attrib, "bootfailed") == 0) { /* * It should only be possible to unset BOOTFAILED, but it might * be useful for test purposes to also be able to set it. 
*/ attr |= GPT_ENT_ATTR_BOOTFAILED; } if (attr == 0) return (EINVAL); if (set) attr = entry->ent.ent_attr | attr; else attr = entry->ent.ent_attr & ~attr; if (attr != entry->ent.ent_attr) { entry->ent.ent_attr = attr; if (!baseentry->gpe_created) baseentry->gpe_modified = 1; } return (0); } static const char * g_part_gpt_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_gpt_entry *entry; struct uuid *type; struct g_part_uuid_alias *uap; entry = (struct g_part_gpt_entry *)baseentry; type = &entry->ent.ent_type; for (uap = &gpt_uuid_alias_match[0]; uap->uuid; uap++) if (EQUUID(type, uap->uuid)) return (g_part_alias_name(uap->alias)); buf[0] = '!'; snprintf_uuid(buf + 1, bufsz - 1, type); return (buf); } static int g_part_gpt_write(struct g_part_table *basetable, struct g_consumer *cp) { unsigned char *buf, *bp; struct g_provider *pp; struct g_part_entry *baseentry; struct g_part_gpt_entry *entry; struct g_part_gpt_table *table; size_t tblsz; uint32_t crc; int error, index; pp = cp->provider; table = (struct g_part_gpt_table *)basetable; tblsz = howmany(table->hdr->hdr_entries * table->hdr->hdr_entsz, pp->sectorsize); /* Reconstruct the MBR from the GPT if under Boot Camp. */ if (table->bootcamp) gpt_update_bootcamp(basetable, pp); /* Write the PMBR */ buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); bcopy(table->mbr, buf, MBRSIZE); error = g_write_data(cp, 0, buf, pp->sectorsize); g_free(buf); if (error) return (error); /* Allocate space for the header and entries. */ buf = g_malloc((tblsz + 1) * pp->sectorsize, M_WAITOK | M_ZERO); memcpy(buf, table->hdr->hdr_sig, sizeof(table->hdr->hdr_sig)); le32enc(buf + 8, table->hdr->hdr_revision); le32enc(buf + 12, table->hdr->hdr_size); le64enc(buf + 40, table->hdr->hdr_lba_start); le64enc(buf + 48, table->hdr->hdr_lba_end); le_uuid_enc(buf + 56, &table->hdr->hdr_uuid); le32enc(buf + 80, table->hdr->hdr_entries); le32enc(buf + 84, table->hdr->hdr_entsz); LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) { if (baseentry->gpe_deleted) continue; entry = (struct g_part_gpt_entry *)baseentry; index = baseentry->gpe_index - 1; bp = buf + pp->sectorsize + table->hdr->hdr_entsz * index; le_uuid_enc(bp, &entry->ent.ent_type); le_uuid_enc(bp + 16, &entry->ent.ent_uuid); le64enc(bp + 32, entry->ent.ent_lba_start); le64enc(bp + 40, entry->ent.ent_lba_end); le64enc(bp + 48, entry->ent.ent_attr); memcpy(bp + 56, entry->ent.ent_name, sizeof(entry->ent.ent_name)); } crc = crc32(buf + pp->sectorsize, table->hdr->hdr_entries * table->hdr->hdr_entsz); le32enc(buf + 88, crc); /* Write primary meta-data. */ le32enc(buf + 16, 0); /* hdr_crc_self. */ le64enc(buf + 24, table->lba[GPT_ELT_PRIHDR]); /* hdr_lba_self. */ le64enc(buf + 32, table->lba[GPT_ELT_SECHDR]); /* hdr_lba_alt. */ le64enc(buf + 72, table->lba[GPT_ELT_PRITBL]); /* hdr_lba_table. */ crc = crc32(buf, table->hdr->hdr_size); le32enc(buf + 16, crc); for (index = 0; index < tblsz; index += MAXPHYS / pp->sectorsize) { error = g_write_data(cp, (table->lba[GPT_ELT_PRITBL] + index) * pp->sectorsize, buf + (index + 1) * pp->sectorsize, (tblsz - index > MAXPHYS / pp->sectorsize) ? MAXPHYS: (tblsz - index) * pp->sectorsize); if (error) goto out; } error = g_write_data(cp, table->lba[GPT_ELT_PRIHDR] * pp->sectorsize, buf, pp->sectorsize); if (error) goto out; /* Write secondary meta-data. */ le32enc(buf + 16, 0); /* hdr_crc_self. */ le64enc(buf + 24, table->lba[GPT_ELT_SECHDR]); /* hdr_lba_self. 
*/ le64enc(buf + 32, table->lba[GPT_ELT_PRIHDR]); /* hdr_lba_alt. */ le64enc(buf + 72, table->lba[GPT_ELT_SECTBL]); /* hdr_lba_table. */ crc = crc32(buf, table->hdr->hdr_size); le32enc(buf + 16, crc); for (index = 0; index < tblsz; index += MAXPHYS / pp->sectorsize) { error = g_write_data(cp, (table->lba[GPT_ELT_SECTBL] + index) * pp->sectorsize, buf + (index + 1) * pp->sectorsize, (tblsz - index > MAXPHYS / pp->sectorsize) ? MAXPHYS: (tblsz - index) * pp->sectorsize); if (error) goto out; } error = g_write_data(cp, table->lba[GPT_ELT_SECHDR] * pp->sectorsize, buf, pp->sectorsize); out: g_free(buf); return (error); } static void g_gpt_set_defaults(struct g_part_table *basetable, struct g_provider *pp) { struct g_part_entry *baseentry; struct g_part_gpt_entry *entry; struct g_part_gpt_table *table; quad_t start, end, min, max; quad_t lba, last; size_t spb, tblsz; table = (struct g_part_gpt_table *)basetable; last = pp->mediasize / pp->sectorsize - 1; tblsz = howmany(basetable->gpt_entries * sizeof(struct gpt_ent), pp->sectorsize); table->lba[GPT_ELT_PRIHDR] = 1; table->lba[GPT_ELT_PRITBL] = 2; table->lba[GPT_ELT_SECHDR] = last; table->lba[GPT_ELT_SECTBL] = last - tblsz; table->state[GPT_ELT_PRIHDR] = GPT_STATE_OK; table->state[GPT_ELT_PRITBL] = GPT_STATE_OK; table->state[GPT_ELT_SECHDR] = GPT_STATE_OK; table->state[GPT_ELT_SECTBL] = GPT_STATE_OK; max = start = 2 + tblsz; min = end = last - tblsz - 1; LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) { if (baseentry->gpe_deleted) continue; entry = (struct g_part_gpt_entry *)baseentry; if (entry->ent.ent_lba_start < min) min = entry->ent.ent_lba_start; if (entry->ent.ent_lba_end > max) max = entry->ent.ent_lba_end; } spb = 4096 / pp->sectorsize; if (spb > 1) { lba = start + ((start % spb) ? spb - start % spb : 0); if (lba <= min) start = lba; lba = end - (end + 1) % spb; if (max <= lba) end = lba; } table->hdr->hdr_lba_start = start; table->hdr->hdr_lba_end = end; basetable->gpt_first = start; basetable->gpt_last = end; } static void g_gpt_printf_utf16(struct sbuf *sb, uint16_t *str, size_t len) { u_int bo; uint32_t ch; uint16_t c; bo = LITTLE_ENDIAN; /* GPT is little-endian */ while (len > 0 && *str != 0) { ch = (bo == BIG_ENDIAN) ? be16toh(*str) : le16toh(*str); str++, len--; if ((ch & 0xf800) == 0xd800) { if (len > 0) { c = (bo == BIG_ENDIAN) ? be16toh(*str) : le16toh(*str); str++, len--; } else c = 0xfffd; if ((ch & 0x400) == 0 && (c & 0xfc00) == 0xdc00) { ch = ((ch & 0x3ff) << 10) + (c & 0x3ff); ch += 0x10000; } else ch = 0xfffd; } else if (ch == 0xfffe) { /* BOM (U+FEFF) swapped. */ bo = (bo == BIG_ENDIAN) ? LITTLE_ENDIAN : BIG_ENDIAN; continue; } else if (ch == 0xfeff) /* BOM (U+FEFF) unswapped. */ continue; /* Write the Unicode character in UTF-8 */ if (ch < 0x80) g_conf_printf_escaped(sb, "%c", ch); else if (ch < 0x800) g_conf_printf_escaped(sb, "%c%c", 0xc0 | (ch >> 6), 0x80 | (ch & 0x3f)); else if (ch < 0x10000) g_conf_printf_escaped(sb, "%c%c%c", 0xe0 | (ch >> 12), 0x80 | ((ch >> 6) & 0x3f), 0x80 | (ch & 0x3f)); else if (ch < 0x200000) g_conf_printf_escaped(sb, "%c%c%c%c", 0xf0 | (ch >> 18), 0x80 | ((ch >> 12) & 0x3f), 0x80 | ((ch >> 6) & 0x3f), 0x80 | (ch & 0x3f)); } } static void g_gpt_utf8_to_utf16(const uint8_t *s8, uint16_t *s16, size_t s16len) { size_t s16idx, s8idx; uint32_t utfchar; unsigned int c, utfbytes; s8idx = s16idx = 0; utfchar = 0; utfbytes = 0; bzero(s16, s16len << 1); while (s8[s8idx] != 0 && s16idx < s16len) { c = s8[s8idx++]; if ((c & 0xc0) != 0x80) { /* Initial characters. 
*/ if (utfbytes != 0) { /* Incomplete encoding of previous char. */ s16[s16idx++] = htole16(0xfffd); } if ((c & 0xf8) == 0xf0) { utfchar = c & 0x07; utfbytes = 3; } else if ((c & 0xf0) == 0xe0) { utfchar = c & 0x0f; utfbytes = 2; } else if ((c & 0xe0) == 0xc0) { utfchar = c & 0x1f; utfbytes = 1; } else { utfchar = c & 0x7f; utfbytes = 0; } } else { /* Followup characters. */ if (utfbytes > 0) { utfchar = (utfchar << 6) + (c & 0x3f); utfbytes--; } else if (utfbytes == 0) utfbytes = ~0; } /* * Write the complete Unicode character as UTF-16 when we * have all the UTF-8 characters collected. */ if (utfbytes == 0) { /* * If we need to write 2 UTF-16 characters, but * we only have room for 1, then we truncate the * string by writing a 0 instead. */ if (utfchar >= 0x10000 && s16idx < s16len - 1) { s16[s16idx++] = htole16(0xd800 | ((utfchar >> 10) - 0x40)); s16[s16idx++] = htole16(0xdc00 | (utfchar & 0x3ff)); } else s16[s16idx++] = (utfchar >= 0x10000) ? 0 : htole16(utfchar); } } /* * If our input string was truncated, append an invalid encoding * character to the output string. */ if (utfbytes != 0 && s16idx < s16len) s16[s16idx++] = htole16(0xfffd); }
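The surrogate-pair arithmetic in g_gpt_utf8_to_utf16() above is compact; here is a minimal, self-contained userland sketch of the same rule (the function name and the sample code point are illustrative, not part of this commit):

#include <stdint.h>
#include <stdio.h>

/* Encode one code point above U+FFFF as a UTF-16 surrogate pair,
 * mirroring the 0xd800/0xdc00 arithmetic used above: no explicit
 * subtraction of 0x10000 is needed because (cp >> 10) - 0x40 folds
 * it into the high surrogate. */
static void
utf32_to_utf16(uint32_t cp, uint16_t out[2])
{
	out[0] = 0xd800 | ((cp >> 10) - 0x40);	/* high surrogate */
	out[1] = 0xdc00 | (cp & 0x3ff);		/* low surrogate */
}

int
main(void)
{
	uint16_t w[2];

	utf32_to_utf16(0x10437, w);		/* U+10437 */
	printf("%04x %04x\n", w[0], w[1]);	/* prints "d801 dc37" */
	return (0);
}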
Index: head/sys/geom/part/g_part_ldm.c =================================================================== --- head/sys/geom/part/g_part_ldm.c (revision 326269) +++ head/sys/geom/part/g_part_ldm.c (revision 326270) @@ -1,1482 +1,1484 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_ldm, "GEOM partitioning class for LDM support"); SYSCTL_DECL(_kern_geom_part); static SYSCTL_NODE(_kern_geom_part, OID_AUTO, ldm, CTLFLAG_RW, 0, "GEOM_PART_LDM Logical Disk Manager"); static u_int ldm_debug = 0; SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, debug, CTLFLAG_RWTUN, &ldm_debug, 0, "Debug level"); /* * This allows access to mirrored LDM volumes. Since we do not * do mirroring here, it is not enabled by default. */ static u_int show_mirrors = 0; SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, show_mirrors, CTLFLAG_RWTUN, &show_mirrors, 0, "Show mirrored volumes"); #define LDM_DEBUG(lvl, fmt, ...) do { \ if (ldm_debug >= (lvl)) { \ printf("GEOM_PART: " fmt "\n", __VA_ARGS__); \ } \ } while (0) #define LDM_DUMP(buf, size) do { \ if (ldm_debug > 1) { \ hexdump(buf, size, NULL, 0); \ } \ } while (0) /* * Below are the internal representations of the LDM structures. * * We do not keep all fields of the on-disk structures, only the most * useful ones. All numbers in the on-disk structures are in big-endian * format. */ /* * The private header is 512 bytes long. There are three copies on each * disk. Offsets and sizes are in sectors. Location of each copy: * - the first offset is relative to the disk start; * - the second and third offsets are relative to the LDM database start. * * On a disk partitioned with GPT, the LDM has no first private header. */ #define LDM_PH_MBRINDEX 0 #define LDM_PH_GPTINDEX 2 static const uint64_t ldm_ph_off[] = {6, 1856, 2047}; #define LDM_VERSION_2K 0x2000b #define LDM_VERSION_VISTA 0x2000c #define LDM_PH_VERSION_OFF 0x00c #define LDM_PH_DISKGUID_OFF 0x030 #define LDM_PH_DGGUID_OFF 0x0b0 #define LDM_PH_DGNAME_OFF 0x0f0 #define LDM_PH_START_OFF 0x11b #define LDM_PH_SIZE_OFF 0x123 #define LDM_PH_DB_OFF 0x12b #define LDM_PH_DBSIZE_OFF 0x133 #define LDM_PH_TH1_OFF 0x13b #define LDM_PH_TH2_OFF 0x143 #define LDM_PH_CONFSIZE_OFF 0x153 #define LDM_PH_LOGSIZE_OFF 0x15b #define LDM_PH_SIGN "PRIVHEAD" struct ldm_privhdr { struct uuid disk_guid; struct uuid dg_guid; u_char dg_name[32]; uint64_t start; /* logical disk start */ uint64_t size; /* logical disk size */ uint64_t db_offset; /* LDM database start */ #define LDM_DB_SIZE 2048 uint64_t db_size; /* LDM database size */ #define LDM_TH_COUNT 2 uint64_t th_offset[LDM_TH_COUNT]; /* TOC header offsets */ uint64_t conf_size; /* configuration size */ uint64_t log_size; /* size of log */ };
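Because every number in these on-disk structures is big-endian, a short standalone sketch of pulling one PRIVHEAD field out of a raw sector may help; it mirrors the be64dec()-based parsing in ldm_privhdr_parse() below, with a hypothetical buffer (0x11b is LDM_PH_START_OFF from above):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Big-endian 64-bit load, equivalent to be64dec() from <sys/endian.h>. */
static uint64_t
be64_load(const unsigned char *p)
{
	uint64_t v;
	int i;

	v = 0;
	for (i = 0; i < 8; i++)
		v = (v << 8) | p[i];
	return (v);
}

int
main(void)
{
	unsigned char sector[512];

	memset(sector, 0, sizeof(sector));
	sector[0x11b + 7] = 63;		/* pretend the disk starts at LBA 63 */
	printf("start: %ju\n", (uintmax_t)be64_load(sector + 0x11b));
	return (0);
}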
/* * The table of contents header is 512 bytes long. * There are two identical copies at offsets from the private header. * Offsets are relative to the LDM database start. */ #define LDM_TH_SIGN "TOCBLOCK" #define LDM_TH_NAME1 "config" #define LDM_TH_NAME2 "log" #define LDM_TH_NAME1_OFF 0x024 #define LDM_TH_CONF_OFF 0x02e #define LDM_TH_CONFSIZE_OFF 0x036 #define LDM_TH_NAME2_OFF 0x046 #define LDM_TH_LOG_OFF 0x050 #define LDM_TH_LOGSIZE_OFF 0x058 struct ldm_tochdr { uint64_t conf_offset; /* configuration offset */ uint64_t log_offset; /* log offset */ }; /* * The LDM database header is 512 bytes long. */ #define LDM_VMDB_SIGN "VMDB" #define LDM_DB_LASTSEQ_OFF 0x004 #define LDM_DB_SIZE_OFF 0x008 #define LDM_DB_STATUS_OFF 0x010 #define LDM_DB_VERSION_OFF 0x012 #define LDM_DB_DGNAME_OFF 0x016 #define LDM_DB_DGGUID_OFF 0x035 struct ldm_vmdbhdr { uint32_t last_seq; /* sequence number of last VBLK */ uint32_t size; /* size of VBLK */ }; /* * The LDM database configuration section contains the VMDB header and * many VBLKs. Each VBLK represents a disk group, disk partition, * component or volume. * * The most interesting for us are volumes; they represent * partitions in the GEOM_PART sense. But a volume VBLK does not * contain all the information needed to create a GEOM provider, so we * have to gather that information from the related VBLKs. This is how * the VBLKs are related: * Volumes <- Components <- Partitions -> Disks * * One volume can contain several components. In this case LDM * mirrors the volume data to each component. * * Also each component can contain several partitions (spanned or * striped volumes). */ struct ldm_component { uint64_t id; /* object id */ uint64_t vol_id; /* parent volume object id */ int count; LIST_HEAD(, ldm_partition) partitions; LIST_ENTRY(ldm_component) entry; }; struct ldm_volume { uint64_t id; /* object id */ uint64_t size; /* volume size */ uint8_t number; /* used for ordering */ uint8_t part_type; /* partition type */ int count; LIST_HEAD(, ldm_component) components; LIST_ENTRY(ldm_volume) entry; }; struct ldm_disk { uint64_t id; /* object id */ struct uuid guid; /* disk guid */ LIST_ENTRY(ldm_disk) entry; }; #if 0 struct ldm_disk_group { uint64_t id; /* object id */ struct uuid guid; /* disk group guid */ u_char name[32]; /* disk group name */ LIST_ENTRY(ldm_disk_group) entry; }; #endif struct ldm_partition { uint64_t id; /* object id */ uint64_t disk_id; /* disk object id */ uint64_t comp_id; /* parent component object id */ uint64_t start; /* offset relative to disk start */ uint64_t offset; /* offset for spanned volumes */ uint64_t size; /* partition size */ LIST_ENTRY(ldm_partition) entry; }; /* * Each VBLK is 128 bytes long and has a standard 16-byte header. * Some of a VBLK's fields are fixed size, but others have variable size. * Variable-size fields are prefixed with a one-byte length marker. * Some fields are strings; they too can be either fixed or variable size. * Strings with fixed size are NULL-terminated, others are not. * All VBLKs share the same first several fields: * Offset Size Description * ---------------+---------------+-------------------------- * 0x00 16 standard VBLK header * 0x10 2 update status * 0x13 1 VBLK type * 0x18 PS object id * 0x18+ PN object name * * o Offset 0x18+ means '0x18 + length of all variable-width fields' * o 'P' in size column means 'prefixed' (variable-width), * 'S' - string, 'N' - number. */ #define LDM_VBLK_SIGN "VBLK" #define LDM_VBLK_SEQ_OFF 0x04 #define LDM_VBLK_GROUP_OFF 0x08 #define LDM_VBLK_INDEX_OFF 0x0c #define LDM_VBLK_COUNT_OFF 0x0e #define LDM_VBLK_TYPE_OFF 0x13 #define LDM_VBLK_OID_OFF 0x18 struct ldm_vblkhdr { uint32_t seq; /* sequence number */ uint32_t group; /* group number */ uint16_t index; /* index in the group */ uint16_t count; /* number of entries in the group */ }; #define LDM_VBLK_T_COMPONENT 0x32 #define LDM_VBLK_T_PARTITION 0x33 #define LDM_VBLK_T_DISK 0x34 #define LDM_VBLK_T_DISKGROUP 0x35 #define LDM_VBLK_T_DISK4 0x44 #define LDM_VBLK_T_DISKGROUP4 0x45 #define LDM_VBLK_T_VOLUME 0x51 struct ldm_vblk { uint8_t type; /* VBLK type */ union { uint64_t id; struct ldm_volume vol; struct ldm_component comp; struct ldm_disk disk; struct ldm_partition part; #if 0 struct ldm_disk_group disk_group; #endif } u; LIST_ENTRY(ldm_vblk) entry; }; /* * Some VBLKs contain a bit more data than can fit into 128 bytes. These * VBLKs are called eXtended VBLKs. Before parsing, the data from these * VBLKs must be assembled into a contiguous memory buffer. We can * recognize an xVBLK by the count field in the standard VBLK header * (count > 1). */ struct ldm_xvblk { uint32_t group; /* xVBLK group number */ uint32_t size; /* the total size of xVBLK */ uint8_t map; /* bitmask of currently saved VBLKs */ u_char *data; /* xVBLK data */ LIST_ENTRY(ldm_xvblk) entry; }; /* The internal representation of the LDM database.
*/ struct ldm_db { struct ldm_privhdr ph; /* private header */ struct ldm_tochdr th; /* TOC header */ struct ldm_vmdbhdr dh; /* VMDB header */ LIST_HEAD(, ldm_volume) volumes; LIST_HEAD(, ldm_disk) disks; LIST_HEAD(, ldm_vblk) vblks; LIST_HEAD(, ldm_xvblk) xvblks; }; static struct uuid gpt_uuid_ms_ldm_metadata = GPT_ENT_TYPE_MS_LDM_METADATA; struct g_part_ldm_table { struct g_part_table base; uint64_t db_offset; int is_gpt; }; struct g_part_ldm_entry { struct g_part_entry base; uint8_t type; }; static int g_part_ldm_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_ldm_bootcode(struct g_part_table *, struct g_part_parms *); static int g_part_ldm_create(struct g_part_table *, struct g_part_parms *); static int g_part_ldm_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_ldm_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_ldm_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_ldm_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_ldm_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ldm_probe(struct g_part_table *, struct g_consumer *); static int g_part_ldm_read(struct g_part_table *, struct g_consumer *); static const char *g_part_ldm_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ldm_write(struct g_part_table *, struct g_consumer *); static kobj_method_t g_part_ldm_methods[] = { KOBJMETHOD(g_part_add, g_part_ldm_add), KOBJMETHOD(g_part_bootcode, g_part_ldm_bootcode), KOBJMETHOD(g_part_create, g_part_ldm_create), KOBJMETHOD(g_part_destroy, g_part_ldm_destroy), KOBJMETHOD(g_part_dumpconf, g_part_ldm_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_ldm_dumpto), KOBJMETHOD(g_part_modify, g_part_ldm_modify), KOBJMETHOD(g_part_name, g_part_ldm_name), KOBJMETHOD(g_part_probe, g_part_ldm_probe), KOBJMETHOD(g_part_read, g_part_ldm_read), KOBJMETHOD(g_part_type, g_part_ldm_type), KOBJMETHOD(g_part_write, g_part_ldm_write), { 0, 0 } }; static struct g_part_scheme g_part_ldm_scheme = { "LDM", g_part_ldm_methods, sizeof(struct g_part_ldm_table), .gps_entrysz = sizeof(struct g_part_ldm_entry) }; G_PART_SCHEME_DECLARE(g_part_ldm); static struct g_part_ldm_alias { u_char typ; int alias; } ldm_alias_match[] = { { DOSPTYP_NTFS, G_PART_ALIAS_MS_NTFS }, { DOSPTYP_FAT32, G_PART_ALIAS_MS_FAT32 }, { DOSPTYP_386BSD, G_PART_ALIAS_FREEBSD }, { DOSPTYP_LDM, G_PART_ALIAS_MS_LDM_DATA }, { DOSPTYP_LINSWP, G_PART_ALIAS_LINUX_SWAP }, { DOSPTYP_LINUX, G_PART_ALIAS_LINUX_DATA }, { DOSPTYP_LINLVM, G_PART_ALIAS_LINUX_LVM }, { DOSPTYP_LINRAID, G_PART_ALIAS_LINUX_RAID }, }; static u_char* ldm_privhdr_read(struct g_consumer *cp, uint64_t off, int *error) { struct g_provider *pp; u_char *buf; pp = cp->provider; buf = g_read_data(cp, off, pp->sectorsize, error); if (buf == NULL) return (NULL); if (memcmp(buf, LDM_PH_SIGN, strlen(LDM_PH_SIGN)) != 0) { LDM_DEBUG(1, "%s: invalid LDM private header signature", pp->name); g_free(buf); buf = NULL; *error = EINVAL; } return (buf); } static int ldm_privhdr_parse(struct g_consumer *cp, struct ldm_privhdr *hdr, const u_char *buf) { uint32_t version; int error; memset(hdr, 0, sizeof(*hdr)); version = be32dec(buf + LDM_PH_VERSION_OFF); if (version != LDM_VERSION_2K && version != LDM_VERSION_VISTA) { LDM_DEBUG(0, "%s: unsupported LDM version %u.%u", cp->provider->name, version >> 16, version & 0xFFFF); return (ENXIO); } error = 
parse_uuid(buf + LDM_PH_DISKGUID_OFF, &hdr->disk_guid); if (error != 0) return (error); error = parse_uuid(buf + LDM_PH_DGGUID_OFF, &hdr->dg_guid); if (error != 0) return (error); strncpy(hdr->dg_name, buf + LDM_PH_DGNAME_OFF, sizeof(hdr->dg_name)); hdr->start = be64dec(buf + LDM_PH_START_OFF); hdr->size = be64dec(buf + LDM_PH_SIZE_OFF); hdr->db_offset = be64dec(buf + LDM_PH_DB_OFF); hdr->db_size = be64dec(buf + LDM_PH_DBSIZE_OFF); hdr->th_offset[0] = be64dec(buf + LDM_PH_TH1_OFF); hdr->th_offset[1] = be64dec(buf + LDM_PH_TH2_OFF); hdr->conf_size = be64dec(buf + LDM_PH_CONFSIZE_OFF); hdr->log_size = be64dec(buf + LDM_PH_LOGSIZE_OFF); return (0); } static int ldm_privhdr_check(struct ldm_db *db, struct g_consumer *cp, int is_gpt) { struct g_consumer *cp2; struct g_provider *pp; struct ldm_privhdr hdr; uint64_t offset, last; int error, found, i; u_char *buf; pp = cp->provider; if (is_gpt) { /* * The last LBA is used in several checks below; for the * GPT case it should be calculated relative to the whole * disk. */ cp2 = LIST_FIRST(&pp->geom->consumer); last = cp2->provider->mediasize / cp2->provider->sectorsize - 1; } else last = pp->mediasize / pp->sectorsize - 1; for (found = 0, i = is_gpt; i < nitems(ldm_ph_off); i++) { offset = ldm_ph_off[i]; /* * In the GPT case the consumer is attached to the LDM metadata * partition and we don't need to add db_offset. */ if (!is_gpt) offset += db->ph.db_offset; if (i == LDM_PH_MBRINDEX) { /* * Prepare for errors and set up a new base offset * to read the backup private headers. Assume that the * LDM database is in the last 1 MByte area. */ db->ph.db_offset = last - LDM_DB_SIZE; } buf = ldm_privhdr_read(cp, offset * pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(1, "%s: failed to read private header " "%d at LBA %ju", pp->name, i, (uintmax_t)offset); continue; } error = ldm_privhdr_parse(cp, &hdr, buf); if (error != 0) { LDM_DEBUG(1, "%s: failed to parse private " "header %d", pp->name, i); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } g_free(buf); if (hdr.start > last || hdr.start + hdr.size - 1 > last || (hdr.start + hdr.size - 1 > hdr.db_offset && !is_gpt) || hdr.db_size != LDM_DB_SIZE || hdr.db_offset + LDM_DB_SIZE - 1 > last || hdr.th_offset[0] >= LDM_DB_SIZE || hdr.th_offset[1] >= LDM_DB_SIZE || hdr.conf_size + hdr.log_size >= LDM_DB_SIZE) { LDM_DEBUG(1, "%s: invalid values in the " "private header %d", pp->name, i); LDM_DEBUG(2, "%s: start: %jd, size: %jd, " "db_offset: %jd, db_size: %jd, th_offset0: %jd, " "th_offset1: %jd, conf_size: %jd, log_size: %jd, " "last: %jd", pp->name, hdr.start, hdr.size, hdr.db_offset, hdr.db_size, hdr.th_offset[0], hdr.th_offset[1], hdr.conf_size, hdr.log_size, last); continue; } if (found != 0 && memcmp(&db->ph, &hdr, sizeof(hdr)) != 0) { LDM_DEBUG(0, "%s: private headers are not equal", pp->name); if (i > 1) { /* * We have different headers in the LDM. * We cannot trust this metadata. */ LDM_DEBUG(0, "%s: refuse LDM metadata", pp->name); return (EINVAL); } /* * We have already read the primary private header * and it differs from this backup one. * Prefer the backup header and save it.
*/ found = 0; } if (found == 0) memcpy(&db->ph, &hdr, sizeof(hdr)); found = 1; } if (found == 0) { LDM_DEBUG(1, "%s: valid LDM private header not found", pp->name); return (ENXIO); } return (0); } static int ldm_gpt_check(struct ldm_db *db, struct g_consumer *cp) { struct g_part_table *gpt; struct g_part_entry *e; struct g_consumer *cp2; int error; cp2 = LIST_NEXT(cp, consumer); g_topology_lock(); gpt = cp->provider->geom->softc; error = 0; LIST_FOREACH(e, &gpt->gpt_entry, gpe_entry) { if (cp->provider == e->gpe_pp) { /* ms-ldm-metadata partition */ if (e->gpe_start != db->ph.db_offset || e->gpe_end != db->ph.db_offset + LDM_DB_SIZE - 1) error++; } else if (cp2->provider == e->gpe_pp) { /* ms-ldm-data partition */ if (e->gpe_start != db->ph.start || e->gpe_end != db->ph.start + db->ph.size - 1) error++; } if (error != 0) { LDM_DEBUG(0, "%s: GPT partition %d boundaries " "do not match with the LDM metadata", e->gpe_pp->name, e->gpe_index); error = ENXIO; break; } } g_topology_unlock(); return (error); } static int ldm_tochdr_check(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct ldm_tochdr hdr; uint64_t offset, conf_size, log_size; int error, found, i; u_char *buf; pp = cp->provider; for (i = 0, found = 0; i < LDM_TH_COUNT; i++) { offset = db->ph.db_offset + db->ph.th_offset[i]; buf = g_read_data(cp, offset * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(1, "%s: failed to read TOC header " "at LBA %ju", pp->name, (uintmax_t)offset); continue; } if (memcmp(buf, LDM_TH_SIGN, strlen(LDM_TH_SIGN)) != 0 || memcmp(buf + LDM_TH_NAME1_OFF, LDM_TH_NAME1, strlen(LDM_TH_NAME1)) != 0 || memcmp(buf + LDM_TH_NAME2_OFF, LDM_TH_NAME2, strlen(LDM_TH_NAME2)) != 0) { LDM_DEBUG(1, "%s: failed to parse TOC header " "at LBA %ju", pp->name, (uintmax_t)offset); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } hdr.conf_offset = be64dec(buf + LDM_TH_CONF_OFF); hdr.log_offset = be64dec(buf + LDM_TH_LOG_OFF); conf_size = be64dec(buf + LDM_TH_CONFSIZE_OFF); log_size = be64dec(buf + LDM_TH_LOGSIZE_OFF); if (conf_size != db->ph.conf_size || hdr.conf_offset + conf_size >= LDM_DB_SIZE || log_size != db->ph.log_size || hdr.log_offset + log_size >= LDM_DB_SIZE) { LDM_DEBUG(1, "%s: invalid values in the " "TOC header at LBA %ju", pp->name, (uintmax_t)offset); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } g_free(buf); if (found == 0) memcpy(&db->th, &hdr, sizeof(hdr)); found = 1; } if (found == 0) { LDM_DEBUG(0, "%s: valid LDM TOC header not found.", pp->name); return (ENXIO); } return (0); } static int ldm_vmdbhdr_check(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct uuid dg_guid; uint64_t offset; uint32_t version; int error; u_char *buf; pp = cp->provider; offset = db->ph.db_offset + db->th.conf_offset; buf = g_read_data(cp, offset * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(0, "%s: failed to read VMDB header at " "LBA %ju", pp->name, (uintmax_t)offset); return (error); } if (memcmp(buf, LDM_VMDB_SIGN, strlen(LDM_VMDB_SIGN)) != 0) { g_free(buf); LDM_DEBUG(0, "%s: failed to parse VMDB header at " "LBA %ju", pp->name, (uintmax_t)offset); return (ENXIO); } /* Check version. 
*/ version = be32dec(buf + LDM_DB_VERSION_OFF); if (version != 0x4000A) { g_free(buf); LDM_DEBUG(0, "%s: unsupported VMDB version %u.%u", pp->name, version >> 16, version & 0xFFFF); return (ENXIO); } /* * Check VMDB update status: * 1 - in a consistent state; * 2 - in a creation phase; * 3 - in a deletion phase; */ if (be16dec(buf + LDM_DB_STATUS_OFF) != 1) { g_free(buf); LDM_DEBUG(0, "%s: VMDB is not in a consistent state", pp->name); return (ENXIO); } db->dh.last_seq = be32dec(buf + LDM_DB_LASTSEQ_OFF); db->dh.size = be32dec(buf + LDM_DB_SIZE_OFF); error = parse_uuid(buf + LDM_DB_DGGUID_OFF, &dg_guid); /* Compare disk group name and guid from VMDB and private headers */ if (error != 0 || db->dh.size == 0 || pp->sectorsize % db->dh.size != 0 || strncmp(buf + LDM_DB_DGNAME_OFF, db->ph.dg_name, 31) != 0 || memcmp(&dg_guid, &db->ph.dg_guid, sizeof(dg_guid)) != 0 || db->dh.size * db->dh.last_seq > db->ph.conf_size * pp->sectorsize) { LDM_DEBUG(0, "%s: invalid values in the VMDB header", pp->name); LDM_DUMP(buf, pp->sectorsize); g_free(buf); return (EINVAL); } g_free(buf); return (0); } static int ldm_xvblk_handle(struct ldm_db *db, struct ldm_vblkhdr *vh, const u_char *p) { struct ldm_xvblk *blk; size_t size; size = db->dh.size - 16; LIST_FOREACH(blk, &db->xvblks, entry) if (blk->group == vh->group) break; if (blk == NULL) { blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO); blk->group = vh->group; blk->size = size * vh->count + 16; blk->data = g_malloc(blk->size, M_WAITOK | M_ZERO); blk->map = 0xFF << vh->count; LIST_INSERT_HEAD(&db->xvblks, blk, entry); } if ((blk->map & (1 << vh->index)) != 0) { /* Block with given index has been already saved. */ return (EINVAL); } /* Copy the data block to the place related to index. */ memcpy(blk->data + size * vh->index + 16, p + 16, size); blk->map |= 1 << vh->index; return (0); } /* Read the variable-width numeric field and return new offset */ static int ldm_vnum_get(const u_char *buf, int offset, uint64_t *result, size_t range) { uint64_t num; uint8_t len; len = buf[offset++]; if (len > sizeof(uint64_t) || len + offset >= range) return (-1); for (num = 0; len > 0; len--) num = (num << 8) | buf[offset++]; *result = num; return (offset); } /* Read the variable-width string and return new offset */ static int ldm_vstr_get(const u_char *buf, int offset, u_char *result, size_t maxlen, size_t range) { uint8_t len; len = buf[offset++]; if (len >= maxlen || len + offset >= range) return (-1); memcpy(result, buf + offset, len); result[len] = '\0'; return (offset + len); } /* Just skip the variable-width variable and return new offset */ static int ldm_vparm_skip(const u_char *buf, int offset, size_t range) { uint8_t len; len = buf[offset++]; if (offset + len >= range) return (-1); return (offset + len); } static int ldm_vblk_handle(struct ldm_db *db, const u_char *p, size_t size) { struct ldm_vblk *blk; struct ldm_volume *volume, *last; const char *errstr; u_char vstr[64]; int error, offset; blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO); blk->type = p[LDM_VBLK_TYPE_OFF]; offset = ldm_vnum_get(p, LDM_VBLK_OID_OFF, &blk->u.id, size); if (offset < 0) { errstr = "object id"; goto fail; } offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) { errstr = "object name"; goto fail; } switch (blk->type) { /* * Component VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS volume state * 0x18+5 PN component children count * 0x1D+16 PN parent's volume object id * 0x2D+1 PN stripe size */ case 
LDM_VBLK_T_COMPONENT: offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "volume state"; goto fail; } offset = ldm_vparm_skip(p, offset + 5, size); if (offset < 0) { errstr = "children count"; goto fail; } offset = ldm_vnum_get(p, offset + 16, &blk->u.comp.vol_id, size); if (offset < 0) { errstr = "volume id"; goto fail; } break; /* * Partition VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+12 8 partition start offset * 0x18+20 8 volume offset * 0x18+28 PN partition size * 0x34+ PN parent's component object id * 0x34+ PN disk's object id */ case LDM_VBLK_T_PARTITION: if (offset + 28 >= size) { errstr = "too small buffer"; goto fail; } blk->u.part.start = be64dec(p + offset + 12); blk->u.part.offset = be64dec(p + offset + 20); offset = ldm_vnum_get(p, offset + 28, &blk->u.part.size, size); if (offset < 0) { errstr = "partition size"; goto fail; } offset = ldm_vnum_get(p, offset, &blk->u.part.comp_id, size); if (offset < 0) { errstr = "component id"; goto fail; } offset = ldm_vnum_get(p, offset, &blk->u.part.disk_id, size); if (offset < 0) { errstr = "disk id"; goto fail; } break; /* * Disk VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS disk GUID */ case LDM_VBLK_T_DISK: errstr = "disk guid"; offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) goto fail; error = parse_uuid(vstr, &blk->u.disk.guid); if (error != 0) goto fail; LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry); break; /* * Disk group VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS disk group GUID */ case LDM_VBLK_T_DISKGROUP: #if 0 strncpy(blk->u.disk_group.name, vstr, sizeof(blk->u.disk_group.name)); offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) { errstr = "disk group guid"; goto fail; } error = parse_uuid(name, &blk->u.disk_group.guid); if (error != 0) { errstr = "disk group guid"; goto fail; } LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry); #endif break; /* * Disk VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ 16 disk GUID */ case LDM_VBLK_T_DISK4: be_uuid_dec(p + offset, &blk->u.disk.guid); LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry); break; /* * Disk group VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ 16 disk GUID */ case LDM_VBLK_T_DISKGROUP4: #if 0 strncpy(blk->u.disk_group.name, vstr, sizeof(blk->u.disk_group.name)); be_uuid_dec(p + offset, &blk->u.disk.guid); LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry); #endif break; /* * Volume VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS volume type * 0x18+ PS unknown * 0x18+ 14(S) volume state * 0x18+16 1 volume number * 0x18+21 PN volume children count * 0x2D+16 PN volume size * 0x3D+4 1 partition type */ case LDM_VBLK_T_VOLUME: offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "volume type"; goto fail; } offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "unknown param"; goto fail; } if (offset + 21 >= size) { errstr = "too small buffer"; goto fail; } blk->u.vol.number = p[offset + 16]; offset = ldm_vparm_skip(p, offset + 21, size); if (offset < 0) { errstr = "children count"; goto fail; } offset = ldm_vnum_get(p, offset + 16, &blk->u.vol.size, size); if (offset < 0) { errstr = "volume size"; goto fail; } if (offset + 4 >= size) { errstr = "too small 
buffer"; goto fail; } blk->u.vol.part_type = p[offset + 4]; /* keep volumes ordered by volume number */ last = NULL; LIST_FOREACH(volume, &db->volumes, entry) { if (volume->number > blk->u.vol.number) break; last = volume; } if (last != NULL) LIST_INSERT_AFTER(last, &blk->u.vol, entry); else LIST_INSERT_HEAD(&db->volumes, &blk->u.vol, entry); break; default: LDM_DEBUG(1, "unknown VBLK type 0x%02x\n", blk->type); LDM_DUMP(p, size); } LIST_INSERT_HEAD(&db->vblks, blk, entry); return (0); fail: LDM_DEBUG(0, "failed to parse '%s' in VBLK of type 0x%02x\n", errstr, blk->type); LDM_DUMP(p, size); g_free(blk); return (EINVAL); } static void ldm_vmdb_free(struct ldm_db *db) { struct ldm_vblk *vblk; struct ldm_xvblk *xvblk; while (!LIST_EMPTY(&db->xvblks)) { xvblk = LIST_FIRST(&db->xvblks); LIST_REMOVE(xvblk, entry); g_free(xvblk->data); g_free(xvblk); } while (!LIST_EMPTY(&db->vblks)) { vblk = LIST_FIRST(&db->vblks); LIST_REMOVE(vblk, entry); g_free(vblk); } } static int ldm_vmdb_parse(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct ldm_vblk *vblk; struct ldm_xvblk *xvblk; struct ldm_volume *volume; struct ldm_component *comp; struct ldm_vblkhdr vh; u_char *buf, *p; size_t size, n, sectors; uint64_t offset; int error; pp = cp->provider; size = howmany(db->dh.last_seq * db->dh.size, pp->sectorsize); size -= 1; /* one sector takes vmdb header */ for (n = 0; n < size; n += MAXPHYS / pp->sectorsize) { offset = db->ph.db_offset + db->th.conf_offset + n + 1; sectors = (size - n) > (MAXPHYS / pp->sectorsize) ? MAXPHYS / pp->sectorsize: size - n; /* read VBLKs */ buf = g_read_data(cp, offset * pp->sectorsize, sectors * pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(0, "%s: failed to read VBLK\n", pp->name); goto fail; } for (p = buf; p < buf + sectors * pp->sectorsize; p += db->dh.size) { if (memcmp(p, LDM_VBLK_SIGN, strlen(LDM_VBLK_SIGN)) != 0) { LDM_DEBUG(0, "%s: no VBLK signature\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } vh.seq = be32dec(p + LDM_VBLK_SEQ_OFF); vh.group = be32dec(p + LDM_VBLK_GROUP_OFF); /* skip empty blocks */ if (vh.seq == 0 || vh.group == 0) continue; vh.index = be16dec(p + LDM_VBLK_INDEX_OFF); vh.count = be16dec(p + LDM_VBLK_COUNT_OFF); if (vh.count == 0 || vh.count > 4 || vh.seq > db->dh.last_seq) { LDM_DEBUG(0, "%s: invalid values " "in the VBLK header\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } if (vh.count > 1) { error = ldm_xvblk_handle(db, &vh, p); if (error != 0) { LDM_DEBUG(0, "%s: xVBLK " "is corrupted\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } continue; } if (be16dec(p + 16) != 0) LDM_DEBUG(1, "%s: VBLK update" " status is %u\n", pp->name, be16dec(p + 16)); error = ldm_vblk_handle(db, p, db->dh.size); if (error != 0) goto fail; } g_free(buf); buf = NULL; } /* Parse xVBLKs */ while (!LIST_EMPTY(&db->xvblks)) { xvblk = LIST_FIRST(&db->xvblks); if (xvblk->map == 0xFF) { error = ldm_vblk_handle(db, xvblk->data, xvblk->size); if (error != 0) goto fail; } else { LDM_DEBUG(0, "%s: incomplete or corrupt " "xVBLK found\n", pp->name); goto fail; } LIST_REMOVE(xvblk, entry); g_free(xvblk->data); g_free(xvblk); } /* construct all VBLKs relations */ LIST_FOREACH(volume, &db->volumes, entry) { LIST_FOREACH(vblk, &db->vblks, entry) if (vblk->type == LDM_VBLK_T_COMPONENT && vblk->u.comp.vol_id == volume->id) { LIST_INSERT_HEAD(&volume->components, &vblk->u.comp, entry); volume->count++; } LIST_FOREACH(comp, &volume->components, entry) LIST_FOREACH(vblk, &db->vblks, entry) if (vblk->type == LDM_VBLK_T_PARTITION && 
vblk->u.part.comp_id == comp->id) { LIST_INSERT_HEAD(&comp->partitions, &vblk->u.part, entry); comp->count++; } } return (0); fail: ldm_vmdb_free(db); g_free(buf); return (ENXIO); } static int g_part_ldm_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_create(struct g_part_table *basetable, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_ldm_table *table; struct g_provider *pp; table = (struct g_part_ldm_table *)basetable; /* * To destroy LDM on a disk partitioned with GPT we would have to * delete the ms-ldm-metadata partition, but we can't do that via * the standard GEOM_PART methods. */ if (table->is_gpt) return (ENOSYS); pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; /* * To destroy LDM we wipe the MBR, the first private header and * the backup private headers. */ basetable->gpt_smhead = (1 << ldm_ph_off[0]) | 1; /* * Don't touch the last backup private header when the LDM database * is not located in the last 1 MByte area. * XXX: can't remove all blocks. */ if (table->db_offset + LDM_DB_SIZE == pp->mediasize / pp->sectorsize) basetable->gpt_smtail = 1; return (0); } static void g_part_ldm_dumpconf(struct g_part_table *basetable, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_ldm_entry *entry; entry = (struct g_part_ldm_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs LDM xt %u", entry->type); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s<rawtype>%u</rawtype>\n", indent, entry->type); } else { /* confxml: scheme information */ } } static int g_part_ldm_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { return (0); } static int g_part_ldm_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { return (ENOSYS); } static const char * g_part_ldm_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "s%d", baseentry->gpe_index); return (buf); }
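The sector-wipe mask set in g_part_ldm_destroy() above is easy to misread; this small standalone sketch decodes the same (1 << ldm_ph_off[0]) | 1 expression, assuming the usual first-copy offset of 6 sectors (illustrative userland code, not part of this commit):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t smhead;
	int lba;

	/* Bit N set means "zero LBA N": the MBR at LBA 0 plus the
	 * first private header at LBA 6 (ldm_ph_off[0]). */
	smhead = (1u << 6) | 1u;
	printf("wipe LBAs:");
	for (lba = 0; lba < 32; lba++)
		if (smhead & (1u << lba))
			printf(" %d", lba);
	printf("\n");	/* prints "wipe LBAs: 0 6" */
	return (0);
}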
static int ldm_gpt_probe(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_ldm_table *table; struct g_part_table *gpt; struct g_part_entry *entry; struct g_consumer *cp2; struct gpt_ent *part; u_char *buf; int error; /* * XXX: We use some knowledge about GEOM_PART_GPT internal * structures, but it is easier than parsing the GPT ourselves. */ g_topology_lock(); gpt = cp->provider->geom->softc; LIST_FOREACH(entry, &gpt->gpt_entry, gpe_entry) { part = (struct gpt_ent *)(entry + 1); /* Search for the ms-ldm-metadata partition */ if (memcmp(&part->ent_type, &gpt_uuid_ms_ldm_metadata, sizeof(struct uuid)) != 0 || entry->gpe_end - entry->gpe_start < LDM_DB_SIZE - 1) continue; /* Create a new consumer and attach it to the metadata partition */ cp2 = g_new_consumer(cp->geom); error = g_attach(cp2, entry->gpe_pp); if (error != 0) { g_destroy_consumer(cp2); g_topology_unlock(); return (ENXIO); } error = g_access(cp2, 1, 0, 0); if (error != 0) { g_detach(cp2); g_destroy_consumer(cp2); g_topology_unlock(); return (ENXIO); } g_topology_unlock(); LDM_DEBUG(2, "%s: LDM metadata partition %s found in the GPT", cp->provider->name, cp2->provider->name); /* Read the LDM private header */ buf = ldm_privhdr_read(cp2, ldm_ph_off[LDM_PH_GPTINDEX] * cp2->provider->sectorsize, &error); if (buf != NULL) { table = (struct g_part_ldm_table *)basetable; table->is_gpt = 1; g_free(buf); return (G_PART_PROBE_PRI_HIGH); } /* The second consumer is no longer needed. */ g_topology_lock(); g_access(cp2, -1, 0, 0); g_detach(cp2); g_destroy_consumer(cp2); break; } g_topology_unlock(); return (ENXIO); } static int g_part_ldm_probe(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; u_char *buf, type[64]; int error, idx; pp = cp->provider; if (pp->sectorsize != 512) return (ENXIO); error = g_getattr("PART::scheme", cp, &type); if (error == 0 && strcmp(type, "GPT") == 0) { if (g_getattr("PART::type", cp, &type) != 0 || strcmp(type, "ms-ldm-data") != 0) return (ENXIO); error = ldm_gpt_probe(basetable, cp); return (error); } if (basetable->gpt_depth != 0) return (ENXIO); /* LDM has a 1M metadata area */ if (pp->mediasize <= 1024 * 1024) return (ENOSPC); /* Check that there's an MBR */ buf = g_read_data(cp, 0, pp->sectorsize, &error); if (buf == NULL) return (error); if (le16dec(buf + DOSMAGICOFFSET) != DOSMAGIC) { g_free(buf); return (ENXIO); } error = ENXIO; /* Check that we have LDM partitions in the MBR */ for (idx = 0; idx < NDOSPART && error != 0; idx++) { if (buf[DOSPARTOFF + idx * DOSPARTSIZE + 4] == DOSPTYP_LDM) error = 0; } g_free(buf); if (error == 0) { LDM_DEBUG(2, "%s: LDM data partitions found in MBR", pp->name); /* Read the LDM private header */ buf = ldm_privhdr_read(cp, ldm_ph_off[LDM_PH_MBRINDEX] * pp->sectorsize, &error); if (buf == NULL) return (error); g_free(buf); return (G_PART_PROBE_PRI_HIGH); } return (error); }
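g_part_ldm_probe() above keys the MBR path off partition type 0x42 (DOSPTYP_LDM); stripped of the GEOM plumbing, the scan is just the loop below. The function name and the hard-coded 446/16 layout constants stand in for DOSPARTOFF and DOSPARTSIZE; this is a sketch, not kernel code:

/* Return 1 if any of the four primary MBR slots is typed as a
 * Windows dynamic-disk (LDM) partition. */
static int
has_ldm_partition(const unsigned char *sec)
{
	int idx;

	for (idx = 0; idx < 4; idx++)
		if (sec[446 + idx * 16 + 4] == 0x42)
			return (1);
	return (0);
}

int
main(void)
{
	unsigned char sec[512] = { 0 };

	sec[446 + 4] = 0x42;			/* mark slot 0 as LDM */
	return (!has_ldm_partition(sec));	/* exits 0 on success */
}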
static int g_part_ldm_read(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_ldm_table *table; struct g_part_ldm_entry *entry; struct g_consumer *cp2; struct ldm_component *comp; struct ldm_partition *part; struct ldm_volume *vol; struct ldm_disk *disk; struct ldm_db db; int error, index, skipped; table = (struct g_part_ldm_table *)basetable; memset(&db, 0, sizeof(db)); cp2 = cp; /* ms-ldm-data */ if (table->is_gpt) cp = LIST_FIRST(&cp->geom->consumer); /* ms-ldm-metadata */ /* Read and parse LDM private headers. */ error = ldm_privhdr_check(&db, cp, table->is_gpt); if (error != 0) goto gpt_cleanup; basetable->gpt_first = table->is_gpt ? 0: db.ph.start; basetable->gpt_last = basetable->gpt_first + db.ph.size - 1; table->db_offset = db.ph.db_offset; /* Make additional checks for the GPT case */ if (table->is_gpt) { error = ldm_gpt_check(&db, cp); if (error != 0) goto gpt_cleanup; /* * Now we reset the database offset to zero, because our * consumer cp is attached to the ms-ldm-metadata partition * and we don't need to add db_offset to read from it. */ db.ph.db_offset = 0; } /* Read and parse LDM TOC headers. */ error = ldm_tochdr_check(&db, cp); if (error != 0) goto gpt_cleanup; /* Read and parse LDM VMDB header. */ error = ldm_vmdbhdr_check(&db, cp); if (error != 0) goto gpt_cleanup; error = ldm_vmdb_parse(&db, cp); /* * For the GPT case we must detach and destroy the * second consumer before returning. */ gpt_cleanup: if (table->is_gpt) { g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); cp = cp2; } if (error != 0) return (error); /* Search for the current disk in the disk list. */ LIST_FOREACH(disk, &db.disks, entry) if (memcmp(&disk->guid, &db.ph.disk_guid, sizeof(struct uuid)) == 0) break; if (disk == NULL) { LDM_DEBUG(1, "%s: no LDM volumes on this disk", cp->provider->name); ldm_vmdb_free(&db); return (ENXIO); } index = 1; LIST_FOREACH(vol, &db.volumes, entry) { LIST_FOREACH(comp, &vol->components, entry) { /* Skip volumes from different disks. */ part = LIST_FIRST(&comp->partitions); if (part->disk_id != disk->id) continue; skipped = 0; /* We don't support spanned and striped volumes. */ if (comp->count > 1 || part->offset != 0) { LDM_DEBUG(1, "%s: LDM volume component " "%ju has %u partitions. Skipped", cp->provider->name, (uintmax_t)comp->id, comp->count); skipped = 1; } /* * Allow mirrored volumes only when they are explicitly * allowed with kern.geom.part.ldm.show_mirrors=1. */ if (vol->count > 1 && show_mirrors == 0) { LDM_DEBUG(1, "%s: LDM volume %ju has %u " "components. Skipped", cp->provider->name, (uintmax_t)vol->id, vol->count); skipped = 1; } entry = (struct g_part_ldm_entry *)g_part_new_entry( basetable, index++, basetable->gpt_first + part->start, basetable->gpt_first + part->start + part->size - 1); /* * Mark a skipped partition as an ms-ldm-data partition. * We do not support them, but it is better to show * that there is something there than to just show * free space.
*/ if (skipped == 0) entry->type = vol->part_type; else entry->type = DOSPTYP_LDM; LDM_DEBUG(1, "%s: new volume id: %ju, start: %ju," " end: %ju, type: 0x%02x\n", cp->provider->name, (uintmax_t)part->id,(uintmax_t)part->start + basetable->gpt_first, (uintmax_t)part->start + part->size + basetable->gpt_first - 1, vol->part_type); } } ldm_vmdb_free(&db); return (error); } static const char * g_part_ldm_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_ldm_entry *entry; int i; entry = (struct g_part_ldm_entry *)baseentry; for (i = 0; i < nitems(ldm_alias_match); i++) { if (ldm_alias_match[i].typ == entry->type) return (g_part_alias_name(ldm_alias_match[i].alias)); } snprintf(buf, bufsz, "!%d", entry->type); return (buf); } static int g_part_ldm_write(struct g_part_table *basetable, struct g_consumer *cp) { return (ENOSYS); } Index: head/sys/geom/part/g_part_mbr.c =================================================================== --- head/sys/geom/part/g_part_mbr.c (revision 326269) +++ head/sys/geom/part/g_part_mbr.c (revision 326270) @@ -1,613 +1,615 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007, 2008 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_mbr, "GEOM partitioning class for MBR support"); SYSCTL_DECL(_kern_geom_part); static SYSCTL_NODE(_kern_geom_part, OID_AUTO, mbr, CTLFLAG_RW, 0, "GEOM_PART_MBR Master Boot Record"); static u_int enforce_chs = 0; SYSCTL_UINT(_kern_geom_part_mbr, OID_AUTO, enforce_chs, CTLFLAG_RWTUN, &enforce_chs, 0, "Enforce alignment to CHS addressing"); #define MBRSIZE 512 struct g_part_mbr_table { struct g_part_table base; u_char mbr[MBRSIZE]; }; struct g_part_mbr_entry { struct g_part_entry base; struct dos_partition ent; }; static int g_part_mbr_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_mbr_bootcode(struct g_part_table *, struct g_part_parms *); static int g_part_mbr_create(struct g_part_table *, struct g_part_parms *); static int g_part_mbr_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_mbr_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_mbr_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_mbr_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_mbr_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_mbr_probe(struct g_part_table *, struct g_consumer *); static int g_part_mbr_read(struct g_part_table *, struct g_consumer *); static int g_part_mbr_setunset(struct g_part_table *, struct g_part_entry *, const char *, unsigned int); static const char *g_part_mbr_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_mbr_write(struct g_part_table *, struct g_consumer *); static int g_part_mbr_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static kobj_method_t g_part_mbr_methods[] = { KOBJMETHOD(g_part_add, g_part_mbr_add), KOBJMETHOD(g_part_bootcode, g_part_mbr_bootcode), KOBJMETHOD(g_part_create, g_part_mbr_create), KOBJMETHOD(g_part_destroy, g_part_mbr_destroy), KOBJMETHOD(g_part_dumpconf, g_part_mbr_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_mbr_dumpto), KOBJMETHOD(g_part_modify, g_part_mbr_modify), KOBJMETHOD(g_part_resize, g_part_mbr_resize), KOBJMETHOD(g_part_name, g_part_mbr_name), KOBJMETHOD(g_part_probe, g_part_mbr_probe), KOBJMETHOD(g_part_read, g_part_mbr_read), KOBJMETHOD(g_part_setunset, g_part_mbr_setunset), KOBJMETHOD(g_part_type, g_part_mbr_type), KOBJMETHOD(g_part_write, g_part_mbr_write), { 0, 0 } }; static struct g_part_scheme g_part_mbr_scheme = { "MBR", g_part_mbr_methods, sizeof(struct g_part_mbr_table), .gps_entrysz = sizeof(struct g_part_mbr_entry), .gps_minent = NDOSPART, .gps_maxent = NDOSPART, .gps_bootcodesz = MBRSIZE, }; G_PART_SCHEME_DECLARE(g_part_mbr); static struct g_part_mbr_alias { u_char typ; int alias; } mbr_alias_match[] = { { DOSPTYP_386BSD, G_PART_ALIAS_FREEBSD }, { DOSPTYP_EXT, G_PART_ALIAS_EBR }, { DOSPTYP_NTFS, G_PART_ALIAS_MS_NTFS }, { DOSPTYP_FAT16, G_PART_ALIAS_MS_FAT16 }, { DOSPTYP_FAT32, G_PART_ALIAS_MS_FAT32 }, { DOSPTYP_EXTLBA, G_PART_ALIAS_EBR }, { DOSPTYP_LDM, G_PART_ALIAS_MS_LDM_DATA }, { DOSPTYP_LINSWP, G_PART_ALIAS_LINUX_SWAP }, { DOSPTYP_LINUX, G_PART_ALIAS_LINUX_DATA }, { DOSPTYP_LINLVM, G_PART_ALIAS_LINUX_LVM }, { DOSPTYP_LINRAID, G_PART_ALIAS_LINUX_RAID }, { DOSPTYP_PPCBOOT, G_PART_ALIAS_PREP_BOOT }, { DOSPTYP_VMFS, 
G_PART_ALIAS_VMFS }, { DOSPTYP_VMKDIAG, G_PART_ALIAS_VMKDIAG }, { DOSPTYP_APPLE_UFS, G_PART_ALIAS_APPLE_UFS }, { DOSPTYP_APPLE_BOOT, G_PART_ALIAS_APPLE_BOOT }, { DOSPTYP_HFS, G_PART_ALIAS_APPLE_HFS }, }; static int mbr_parse_type(const char *type, u_char *dp_typ) { const char *alias; char *endp; long lt; int i; if (type[0] == '!') { lt = strtol(type + 1, &endp, 0); if (type[1] == '\0' || *endp != '\0' || lt <= 0 || lt >= 256) return (EINVAL); *dp_typ = (u_char)lt; return (0); } for (i = 0; i < nitems(mbr_alias_match); i++) { alias = g_part_alias_name(mbr_alias_match[i].alias); if (strcasecmp(type, alias) == 0) { *dp_typ = mbr_alias_match[i].typ; return (0); } } return (EINVAL); } static int mbr_probe_bpb(u_char *bpb) { uint16_t secsz; uint8_t clstsz; #define PO2(x) ((x & (x - 1)) == 0) secsz = le16dec(bpb); if (secsz < 512 || secsz > 4096 || !PO2(secsz)) return (0); clstsz = bpb[2]; if (clstsz < 1 || clstsz > 128 || !PO2(clstsz)) return (0); #undef PO2 return (1); } static void mbr_set_chs(struct g_part_table *table, uint32_t lba, u_char *cylp, u_char *hdp, u_char *secp) { uint32_t cyl, hd, sec; sec = lba % table->gpt_sectors + 1; lba /= table->gpt_sectors; hd = lba % table->gpt_heads; lba /= table->gpt_heads; cyl = lba; if (cyl > 1023) sec = hd = cyl = ~0; *cylp = cyl & 0xff; *hdp = hd & 0xff; *secp = (sec & 0x3f) | ((cyl >> 2) & 0xc0); } static int mbr_align(struct g_part_table *basetable, uint32_t *start, uint32_t *size) { uint32_t sectors; if (enforce_chs == 0) return (0); sectors = basetable->gpt_sectors; if (*size < sectors) return (EINVAL); if (start != NULL && (*start % sectors)) { *size += (*start % sectors) - sectors; *start -= (*start % sectors) - sectors; } if (*size % sectors) *size -= (*size % sectors); if (*size < sectors) return (EINVAL); return (0); } static int g_part_mbr_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_mbr_entry *entry; uint32_t start, size; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_mbr_entry *)baseentry; start = gpp->gpp_start; size = gpp->gpp_size; if (mbr_align(basetable, &start, &size) != 0) return (EINVAL); if (baseentry->gpe_deleted) bzero(&entry->ent, sizeof(entry->ent)); KASSERT(baseentry->gpe_start <= start, ("%s", __func__)); KASSERT(baseentry->gpe_end >= start + size - 1, ("%s", __func__)); baseentry->gpe_start = start; baseentry->gpe_end = start + size - 1; entry->ent.dp_start = start; entry->ent.dp_size = size; mbr_set_chs(basetable, baseentry->gpe_start, &entry->ent.dp_scyl, &entry->ent.dp_shd, &entry->ent.dp_ssect); mbr_set_chs(basetable, baseentry->gpe_end, &entry->ent.dp_ecyl, &entry->ent.dp_ehd, &entry->ent.dp_esect); return (mbr_parse_type(gpp->gpp_type, &entry->ent.dp_typ)); } static int g_part_mbr_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_mbr_table *table; uint32_t dsn; if (gpp->gpp_codesize != MBRSIZE) return (ENODEV); table = (struct g_part_mbr_table *)basetable; dsn = *(uint32_t *)(table->mbr + DOSDSNOFF); bcopy(gpp->gpp_codeptr, table->mbr, DOSPARTOFF); if (dsn != 0) *(uint32_t *)(table->mbr + DOSDSNOFF) = dsn; return (0); } static int g_part_mbr_create(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_provider *pp; struct g_part_mbr_table *table; pp = gpp->gpp_provider; if (pp->sectorsize < MBRSIZE) return (ENOSPC); basetable->gpt_first = basetable->gpt_sectors; basetable->gpt_last = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX) - 1; table = (struct g_part_mbr_table 
*)basetable;
	le16enc(table->mbr + DOSMAGICOFFSET, DOSMAGIC);
	return (0);
}

static int
g_part_mbr_destroy(struct g_part_table *basetable, struct g_part_parms *gpp)
{

	/* Wipe the first sector to clear the partitioning. */
	basetable->gpt_smhead |= 1;
	return (0);
}

static void
g_part_mbr_dumpconf(struct g_part_table *basetable,
    struct g_part_entry *baseentry, struct sbuf *sb, const char *indent)
{
	struct g_part_mbr_entry *entry;
	struct g_part_mbr_table *table;
	uint32_t dsn;

	table = (struct g_part_mbr_table *)basetable;
	entry = (struct g_part_mbr_entry *)baseentry;
	if (indent == NULL) {
		/* conftxt: libdisk compatibility */
		sbuf_printf(sb, " xs MBR xt %u", entry->ent.dp_typ);
	} else if (entry != NULL) {
		/* confxml: partition entry information */
		sbuf_printf(sb, "%s<rawtype>%u</rawtype>\n", indent,
		    entry->ent.dp_typ);
		if (entry->ent.dp_flag & 0x80)
			sbuf_printf(sb, "%s<attrib>active</attrib>\n", indent);
		dsn = le32dec(table->mbr + DOSDSNOFF);
		sbuf_printf(sb, "%s<efimedia>HD(%d,MBR,%#08x,%#jx,%#jx)",
		    indent, entry->base.gpe_index, dsn,
		    (intmax_t)entry->base.gpe_start,
		    (intmax_t)(entry->base.gpe_end - entry->base.gpe_start + 1));
		sbuf_printf(sb, "</efimedia>\n");
	} else {
		/* confxml: scheme information */
	}
}

static int
g_part_mbr_dumpto(struct g_part_table *table, struct g_part_entry *baseentry)
{
	struct g_part_mbr_entry *entry;

	/* Allow dumping to a FreeBSD partition or Linux swap partition only. */
	entry = (struct g_part_mbr_entry *)baseentry;
	return ((entry->ent.dp_typ == DOSPTYP_386BSD ||
	    entry->ent.dp_typ == DOSPTYP_LINSWP) ? 1 : 0);
}

static int
g_part_mbr_modify(struct g_part_table *basetable,
    struct g_part_entry *baseentry, struct g_part_parms *gpp)
{
	struct g_part_mbr_entry *entry;

	if (gpp->gpp_parms & G_PART_PARM_LABEL)
		return (EINVAL);
	entry = (struct g_part_mbr_entry *)baseentry;
	if (gpp->gpp_parms & G_PART_PARM_TYPE)
		return (mbr_parse_type(gpp->gpp_type, &entry->ent.dp_typ));
	return (0);
}

static int
g_part_mbr_resize(struct g_part_table *basetable,
    struct g_part_entry *baseentry, struct g_part_parms *gpp)
{
	struct g_part_mbr_entry *entry;
	struct g_provider *pp;
	uint32_t size;

	if (baseentry == NULL) {
		pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider;
		basetable->gpt_last = MIN(pp->mediasize / pp->sectorsize,
		    UINT32_MAX) - 1;
		return (0);
	}
	size = gpp->gpp_size;
	if (mbr_align(basetable, NULL, &size) != 0)
		return (EINVAL);
	/* XXX: prevent unexpected shrinking. */
	pp = baseentry->gpe_pp;
	if ((g_debugflags & 0x10) == 0 && size < gpp->gpp_size &&
	    pp->mediasize / pp->sectorsize > size)
		return (EBUSY);
	entry = (struct g_part_mbr_entry *)baseentry;
	baseentry->gpe_end = baseentry->gpe_start + size - 1;
	entry->ent.dp_size = size;
	mbr_set_chs(basetable, baseentry->gpe_end, &entry->ent.dp_ecyl,
	    &entry->ent.dp_ehd, &entry->ent.dp_esect);
	return (0);
}

static const char *
g_part_mbr_name(struct g_part_table *table, struct g_part_entry *baseentry,
    char *buf, size_t bufsz)
{

	snprintf(buf, bufsz, "s%d", baseentry->gpe_index);
	return (buf);
}

static int
g_part_mbr_probe(struct g_part_table *table, struct g_consumer *cp)
{
	char psn[8];
	struct g_provider *pp;
	u_char *buf, *p;
	int error, index, res, sum;
	uint16_t magic;

	pp = cp->provider;

	/* Sanity-check the provider. */
	if (pp->sectorsize < MBRSIZE || pp->mediasize < pp->sectorsize)
		return (ENOSPC);
	if (pp->sectorsize > 4096)
		return (ENXIO);

	/* We don't nest under an MBR (see EBR instead). */
	error = g_getattr("PART::scheme", cp, &psn);
	if (error == 0 && strcmp(psn, g_part_mbr_scheme.name) == 0)
		return (ELOOP);

	/* Check that there's a MBR.
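 *
 * In outline: read sector 0, require the 0xaa55 boot signature at offset
 * 510, and require each of the four 16-byte slots starting at offset 446
 * to carry a boot flag of 0x00 or 0x80; only an all-zero slot area falls
 * through to the mbr_probe_bpb() heuristic. A rough userland rendition of
 * that acceptance test (hypothetical helper; the hard-coded offsets stand
 * in for DOSMAGICOFFSET, DOSPARTOFF and DOSPARTSIZE):
 *
 *	static int
 *	looks_like_mbr(const unsigned char *sec)	// 512-byte sector 0
 *	{
 *		int i;
 *
 *		if (sec[510] != 0x55 || sec[511] != 0xaa)
 *			return (0);		// no boot signature
 *		for (i = 0; i < 4; i++)		// slot i at 446 + 16 * i
 *			if (sec[446 + 16 * i] != 0x00 &&
 *			    sec[446 + 16 * i] != 0x80)
 *				return (0);	// invalid boot flag
 *		return (1);
 *	}
 *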
*/ buf = g_read_data(cp, 0L, pp->sectorsize, &error); if (buf == NULL) return (error); /* We goto out on mismatch. */ res = ENXIO; magic = le16dec(buf + DOSMAGICOFFSET); if (magic != DOSMAGIC) goto out; for (index = 0; index < NDOSPART; index++) { p = buf + DOSPARTOFF + index * DOSPARTSIZE; if (p[0] != 0 && p[0] != 0x80) goto out; } /* * If the partition table does not consist of all zeroes, * assume we have a MBR. If it's all zeroes, we could have * a boot sector. For example, a boot sector that doesn't * have boot code -- common on non-i386 hardware. In that * case we check if we have a possible BPB. If so, then we * assume we have a boot sector instead. */ sum = 0; for (index = 0; index < NDOSPART * DOSPARTSIZE; index++) sum += buf[DOSPARTOFF + index]; if (sum != 0 || !mbr_probe_bpb(buf + 0x0b)) res = G_PART_PROBE_PRI_NORM; out: g_free(buf); return (res); } static int g_part_mbr_read(struct g_part_table *basetable, struct g_consumer *cp) { struct dos_partition ent; struct g_provider *pp; struct g_part_mbr_table *table; struct g_part_mbr_entry *entry; u_char *buf, *p; off_t chs, msize, first; u_int sectors, heads; int error, index; pp = cp->provider; table = (struct g_part_mbr_table *)basetable; first = basetable->gpt_sectors; msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); buf = g_read_data(cp, 0L, pp->sectorsize, &error); if (buf == NULL) return (error); bcopy(buf, table->mbr, sizeof(table->mbr)); for (index = NDOSPART - 1; index >= 0; index--) { p = buf + DOSPARTOFF + index * DOSPARTSIZE; ent.dp_flag = p[0]; ent.dp_shd = p[1]; ent.dp_ssect = p[2]; ent.dp_scyl = p[3]; ent.dp_typ = p[4]; ent.dp_ehd = p[5]; ent.dp_esect = p[6]; ent.dp_ecyl = p[7]; ent.dp_start = le32dec(p + 8); ent.dp_size = le32dec(p + 12); if (ent.dp_typ == 0 || ent.dp_typ == DOSPTYP_PMBR) continue; if (ent.dp_start == 0 || ent.dp_size == 0) continue; sectors = ent.dp_esect & 0x3f; if (sectors > basetable->gpt_sectors && !basetable->gpt_fixgeom) { g_part_geometry_heads(msize, sectors, &chs, &heads); if (chs != 0) { basetable->gpt_sectors = sectors; basetable->gpt_heads = heads; } } if (ent.dp_start < first) first = ent.dp_start; entry = (struct g_part_mbr_entry *)g_part_new_entry(basetable, index + 1, ent.dp_start, ent.dp_start + ent.dp_size - 1); entry->ent = ent; } basetable->gpt_entries = NDOSPART; basetable->gpt_first = basetable->gpt_sectors; basetable->gpt_last = msize - 1; if (first < basetable->gpt_first) basetable->gpt_first = 1; g_free(buf); return (0); } static int g_part_mbr_setunset(struct g_part_table *table, struct g_part_entry *baseentry, const char *attrib, unsigned int set) { struct g_part_entry *iter; struct g_part_mbr_entry *entry; int changed; if (baseentry == NULL) return (ENODEV); if (strcasecmp(attrib, "active") != 0) return (EINVAL); /* Only one entry can have the active attribute. 
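 *
 * Setting the flag on one slot therefore clears it on every other slot.
 * The invariant, reduced to a hypothetical one-shot loop (dp_flag 0x80
 * is the DOS "active"/bootable bit):
 *
 *	// make slot `act` the only active one of the four
 *	for (i = 0; i < NDOSPART; i++)
 *		ent[i].dp_flag = (i == act) ? 0x80 : 0x00;
 *
 * The loop below does the same job incrementally, so only slots whose
 * flag actually changes are marked modified and rewritten.
 *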
*/ LIST_FOREACH(iter, &table->gpt_entry, gpe_entry) { if (iter->gpe_deleted) continue; changed = 0; entry = (struct g_part_mbr_entry *)iter; if (iter == baseentry) { if (set && (entry->ent.dp_flag & 0x80) == 0) { entry->ent.dp_flag |= 0x80; changed = 1; } else if (!set && (entry->ent.dp_flag & 0x80)) { entry->ent.dp_flag &= ~0x80; changed = 1; } } else { if (set && (entry->ent.dp_flag & 0x80)) { entry->ent.dp_flag &= ~0x80; changed = 1; } } if (changed && !iter->gpe_created) iter->gpe_modified = 1; } return (0); } static const char * g_part_mbr_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_mbr_entry *entry; int i; entry = (struct g_part_mbr_entry *)baseentry; for (i = 0; i < nitems(mbr_alias_match); i++) { if (mbr_alias_match[i].typ == entry->ent.dp_typ) return (g_part_alias_name(mbr_alias_match[i].alias)); } snprintf(buf, bufsz, "!%d", entry->ent.dp_typ); return (buf); } static int g_part_mbr_write(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_entry *baseentry; struct g_part_mbr_entry *entry; struct g_part_mbr_table *table; u_char *p; int error, index; table = (struct g_part_mbr_table *)basetable; baseentry = LIST_FIRST(&basetable->gpt_entry); for (index = 1; index <= basetable->gpt_entries; index++) { p = table->mbr + DOSPARTOFF + (index - 1) * DOSPARTSIZE; entry = (baseentry != NULL && index == baseentry->gpe_index) ? (struct g_part_mbr_entry *)baseentry : NULL; if (entry != NULL && !baseentry->gpe_deleted) { p[0] = entry->ent.dp_flag; p[1] = entry->ent.dp_shd; p[2] = entry->ent.dp_ssect; p[3] = entry->ent.dp_scyl; p[4] = entry->ent.dp_typ; p[5] = entry->ent.dp_ehd; p[6] = entry->ent.dp_esect; p[7] = entry->ent.dp_ecyl; le32enc(p + 8, entry->ent.dp_start); le32enc(p + 12, entry->ent.dp_size); } else bzero(p, DOSPARTSIZE); if (entry != NULL) baseentry = LIST_NEXT(baseentry, gpe_entry); } error = g_write_data(cp, 0, table->mbr, cp->provider->sectorsize); return (error); } Index: head/sys/geom/part/g_part_vtoc8.c =================================================================== --- head/sys/geom/part/g_part_vtoc8.c (revision 326269) +++ head/sys/geom/part/g_part_vtoc8.c (revision 326270) @@ -1,599 +1,601 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_vtoc8, "GEOM partitioning class for SMI VTOC8 disk labels"); struct g_part_vtoc8_table { struct g_part_table base; struct vtoc8 vtoc; uint32_t secpercyl; }; static int g_part_vtoc8_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_vtoc8_create(struct g_part_table *, struct g_part_parms *); static int g_part_vtoc8_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_vtoc8_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_vtoc8_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_vtoc8_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_vtoc8_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_vtoc8_probe(struct g_part_table *, struct g_consumer *); static int g_part_vtoc8_read(struct g_part_table *, struct g_consumer *); static const char *g_part_vtoc8_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_vtoc8_write(struct g_part_table *, struct g_consumer *); static int g_part_vtoc8_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static kobj_method_t g_part_vtoc8_methods[] = { KOBJMETHOD(g_part_add, g_part_vtoc8_add), KOBJMETHOD(g_part_create, g_part_vtoc8_create), KOBJMETHOD(g_part_destroy, g_part_vtoc8_destroy), KOBJMETHOD(g_part_dumpconf, g_part_vtoc8_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_vtoc8_dumpto), KOBJMETHOD(g_part_modify, g_part_vtoc8_modify), KOBJMETHOD(g_part_resize, g_part_vtoc8_resize), KOBJMETHOD(g_part_name, g_part_vtoc8_name), KOBJMETHOD(g_part_probe, g_part_vtoc8_probe), KOBJMETHOD(g_part_read, g_part_vtoc8_read), KOBJMETHOD(g_part_type, g_part_vtoc8_type), KOBJMETHOD(g_part_write, g_part_vtoc8_write), { 0, 0 } }; static struct g_part_scheme g_part_vtoc8_scheme = { "VTOC8", g_part_vtoc8_methods, sizeof(struct g_part_vtoc8_table), .gps_entrysz = sizeof(struct g_part_entry), .gps_minent = VTOC8_NPARTS, .gps_maxent = VTOC8_NPARTS, }; G_PART_SCHEME_DECLARE(g_part_vtoc8); static int vtoc8_parse_type(const char *type, uint16_t *tag) { const char *alias; char *endp; long lt; if (type[0] == '!') { lt = strtol(type + 1, &endp, 0); if (type[1] == '\0' || *endp != '\0' || lt <= 0 || lt >= 65536) return (EINVAL); *tag = (uint16_t)lt; return (0); } alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_NANDFS); if (!strcasecmp(type, alias)) { *tag = VTOC_TAG_FREEBSD_NANDFS; return (0); } alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_SWAP); if (!strcasecmp(type, alias)) { *tag = VTOC_TAG_FREEBSD_SWAP; return (0); } alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_UFS); if (!strcasecmp(type, alias)) { *tag = VTOC_TAG_FREEBSD_UFS; return (0); } alias = 
g_part_alias_name(G_PART_ALIAS_FREEBSD_VINUM); if (!strcasecmp(type, alias)) { *tag = VTOC_TAG_FREEBSD_VINUM; return (0); } alias = g_part_alias_name(G_PART_ALIAS_FREEBSD_ZFS); if (!strcasecmp(type, alias)) { *tag = VTOC_TAG_FREEBSD_ZFS; return (0); } return (EINVAL); } static int vtoc8_align(struct g_part_vtoc8_table *table, uint64_t *start, uint64_t *size) { if (*size < table->secpercyl) return (EINVAL); if (start != NULL && (*start % table->secpercyl)) { *size += (*start % table->secpercyl) - table->secpercyl; *start -= (*start % table->secpercyl) - table->secpercyl; } if (*size % table->secpercyl) *size -= (*size % table->secpercyl); if (*size < table->secpercyl) return (EINVAL); return (0); } static int g_part_vtoc8_add(struct g_part_table *basetable, struct g_part_entry *entry, struct g_part_parms *gpp) { struct g_part_vtoc8_table *table; int error, index; uint64_t start, size; uint16_t tag; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); error = vtoc8_parse_type(gpp->gpp_type, &tag); if (error) return (error); table = (struct g_part_vtoc8_table *)basetable; index = entry->gpe_index - 1; start = gpp->gpp_start; size = gpp->gpp_size; if (vtoc8_align(table, &start, &size) != 0) return (EINVAL); KASSERT(entry->gpe_start <= start, (__func__)); KASSERT(entry->gpe_end >= start + size - 1, (__func__)); entry->gpe_start = start; entry->gpe_end = start + size - 1; be16enc(&table->vtoc.part[index].tag, tag); be16enc(&table->vtoc.part[index].flag, 0); be32enc(&table->vtoc.timestamp[index], 0); be32enc(&table->vtoc.map[index].cyl, start / table->secpercyl); be32enc(&table->vtoc.map[index].nblks, size); return (0); } static int g_part_vtoc8_create(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_provider *pp; struct g_part_entry *entry; struct g_part_vtoc8_table *table; uint64_t msize; uint32_t acyls, ncyls, pcyls; pp = gpp->gpp_provider; if (pp->sectorsize < sizeof(struct vtoc8)) return (ENOSPC); if (pp->sectorsize > sizeof(struct vtoc8)) return (ENXIO); table = (struct g_part_vtoc8_table *)basetable; msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); table->secpercyl = basetable->gpt_sectors * basetable->gpt_heads; pcyls = msize / table->secpercyl; acyls = 2; ncyls = pcyls - acyls; msize = ncyls * table->secpercyl; sprintf(table->vtoc.ascii, "FreeBSD%lldM cyl %u alt %u hd %u sec %u", (long long)(msize / 2048), ncyls, acyls, basetable->gpt_heads, basetable->gpt_sectors); be32enc(&table->vtoc.version, VTOC_VERSION); be16enc(&table->vtoc.nparts, VTOC8_NPARTS); be32enc(&table->vtoc.sanity, VTOC_SANITY); be16enc(&table->vtoc.rpm, 3600); be16enc(&table->vtoc.physcyls, pcyls); be16enc(&table->vtoc.ncyls, ncyls); be16enc(&table->vtoc.altcyls, acyls); be16enc(&table->vtoc.nheads, basetable->gpt_heads); be16enc(&table->vtoc.nsecs, basetable->gpt_sectors); be16enc(&table->vtoc.magic, VTOC_MAGIC); basetable->gpt_first = 0; basetable->gpt_last = msize - 1; basetable->gpt_isleaf = 1; entry = g_part_new_entry(basetable, VTOC_RAW_PART + 1, basetable->gpt_first, basetable->gpt_last); entry->gpe_internal = 1; be16enc(&table->vtoc.part[VTOC_RAW_PART].tag, VTOC_TAG_BACKUP); be32enc(&table->vtoc.map[VTOC_RAW_PART].nblks, msize); return (0); } static int g_part_vtoc8_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { /* Wipe the first sector to clear the partitioning. 
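 *
 * The entire VTOC8 label, magic and checksum included, lives in that one
 * sector, so zeroing it is sufficient. The checksum convention (checked
 * by g_part_vtoc8_probe() and regenerated by g_part_vtoc8_write() below)
 * is that the 512-byte label XORs to zero when read as 256 big-endian
 * 16-bit words. A standalone sketch of the verification side, with a
 * hypothetical helper name:
 *
 *	static int
 *	vtoc8_label_ok(const unsigned char *buf)	// 512-byte label
 *	{
 *		unsigned int sum = 0;
 *		int ofs;
 *
 *		for (ofs = 0; ofs < 512; ofs += 2)
 *			sum ^= (buf[ofs] << 8) | buf[ofs + 1];
 *		return (sum == 0);	// cksum field cancels the rest
 *	}
 *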
*/
	basetable->gpt_smhead |= 1;
	return (0);
}

static void
g_part_vtoc8_dumpconf(struct g_part_table *basetable,
    struct g_part_entry *entry, struct sbuf *sb, const char *indent)
{
	struct g_part_vtoc8_table *table;

	table = (struct g_part_vtoc8_table *)basetable;
	if (indent == NULL) {
		/* conftxt: libdisk compatibility */
		sbuf_printf(sb, " xs SUN sc %u hd %u alt %u",
		    be16dec(&table->vtoc.nsecs), be16dec(&table->vtoc.nheads),
		    be16dec(&table->vtoc.altcyls));
	} else if (entry != NULL) {
		/* confxml: partition entry information */
		sbuf_printf(sb, "%s<rawtype>%u</rawtype>\n", indent,
		    be16dec(&table->vtoc.part[entry->gpe_index - 1].tag));
	} else {
		/* confxml: scheme information */
	}
}

static int
g_part_vtoc8_dumpto(struct g_part_table *basetable,
    struct g_part_entry *entry)
{
	struct g_part_vtoc8_table *table;
	uint16_t tag;

	/*
	 * Allow dumping to a swap partition or a partition that
	 * has no type.
	 */
	table = (struct g_part_vtoc8_table *)basetable;
	tag = be16dec(&table->vtoc.part[entry->gpe_index - 1].tag);
	return ((tag == 0 || tag == VTOC_TAG_FREEBSD_SWAP ||
	    tag == VTOC_TAG_SWAP) ? 1 : 0);
}

static int
g_part_vtoc8_modify(struct g_part_table *basetable,
    struct g_part_entry *entry, struct g_part_parms *gpp)
{
	struct g_part_vtoc8_table *table;
	int error;
	uint16_t tag;

	if (gpp->gpp_parms & G_PART_PARM_LABEL)
		return (EINVAL);
	table = (struct g_part_vtoc8_table *)basetable;
	if (gpp->gpp_parms & G_PART_PARM_TYPE) {
		error = vtoc8_parse_type(gpp->gpp_type, &tag);
		if (error)
			return (error);
		be16enc(&table->vtoc.part[entry->gpe_index - 1].tag, tag);
	}
	return (0);
}

static int
vtoc8_set_rawsize(struct g_part_table *basetable, struct g_provider *pp)
{
	struct g_part_vtoc8_table *table;
	struct g_part_entry *baseentry;
	off_t msize;
	uint32_t acyls, ncyls, pcyls;

	table = (struct g_part_vtoc8_table *)basetable;
	msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX);
	pcyls = msize / table->secpercyl;
	if (pcyls > UINT16_MAX)
		return (ERANGE);
	acyls = be16dec(&table->vtoc.altcyls);
	ncyls = pcyls - acyls;
	msize = ncyls * table->secpercyl;
	basetable->gpt_last = msize - 1;

	bzero(table->vtoc.ascii, sizeof(table->vtoc.ascii));
	sprintf(table->vtoc.ascii, "FreeBSD%lldM cyl %u alt %u hd %u sec %u",
	    (long long)(msize / 2048), ncyls, acyls, basetable->gpt_heads,
	    basetable->gpt_sectors);
	be16enc(&table->vtoc.physcyls, pcyls);
	be16enc(&table->vtoc.ncyls, ncyls);
	be32enc(&table->vtoc.map[VTOC_RAW_PART].nblks, msize);
	if (be32dec(&table->vtoc.sanity) == VTOC_SANITY)
		be16enc(&table->vtoc.part[VTOC_RAW_PART].tag, VTOC_TAG_BACKUP);
	LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) {
		if (baseentry->gpe_index == VTOC_RAW_PART + 1) {
			baseentry->gpe_end = basetable->gpt_last;
			return (0);
		}
	}
	return (ENXIO);
}

static int
g_part_vtoc8_resize(struct g_part_table *basetable,
    struct g_part_entry *entry, struct g_part_parms *gpp)
{
	struct g_part_vtoc8_table *table;
	struct g_provider *pp;
	uint64_t size;

	if (entry == NULL) {
		pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider;
		return (vtoc8_set_rawsize(basetable, pp));
	}
	table = (struct g_part_vtoc8_table *)basetable;
	size = gpp->gpp_size;
	if (vtoc8_align(table, NULL, &size) != 0)
		return (EINVAL);
	/* XXX: prevent unexpected shrinking.
*/ pp = entry->gpe_pp; if ((g_debugflags & 0x10) == 0 && size < gpp->gpp_size && pp->mediasize / pp->sectorsize > size) return (EBUSY); entry->gpe_end = entry->gpe_start + size - 1; be32enc(&table->vtoc.map[entry->gpe_index - 1].nblks, size); return (0); } static const char * g_part_vtoc8_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "%c", 'a' + baseentry->gpe_index - 1); return (buf); } static int g_part_vtoc8_probe(struct g_part_table *table, struct g_consumer *cp) { struct g_provider *pp; u_char *buf; int error, ofs, res; uint16_t cksum, magic; pp = cp->provider; /* Sanity-check the provider. */ if (pp->sectorsize != sizeof(struct vtoc8)) return (ENOSPC); /* Check that there's a disklabel. */ buf = g_read_data(cp, 0, pp->sectorsize, &error); if (buf == NULL) return (error); res = ENXIO; /* Assume mismatch */ /* Check the magic */ magic = be16dec(buf + offsetof(struct vtoc8, magic)); if (magic != VTOC_MAGIC) goto out; /* Check the sum */ cksum = 0; for (ofs = 0; ofs < sizeof(struct vtoc8); ofs += 2) cksum ^= be16dec(buf + ofs); if (cksum != 0) goto out; res = G_PART_PROBE_PRI_NORM; out: g_free(buf); return (res); } static int g_part_vtoc8_read(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; struct g_part_vtoc8_table *table; struct g_part_entry *entry; u_char *buf; off_t chs, msize; uint64_t offset, size; u_int cyls, heads, sectors; int error, index, withtags; uint16_t tag; pp = cp->provider; buf = g_read_data(cp, 0, pp->sectorsize, &error); if (buf == NULL) return (error); table = (struct g_part_vtoc8_table *)basetable; bcopy(buf, &table->vtoc, sizeof(table->vtoc)); g_free(buf); msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); sectors = be16dec(&table->vtoc.nsecs); if (sectors < 1) goto invalid_label; if (sectors != basetable->gpt_sectors && !basetable->gpt_fixgeom) { g_part_geometry_heads(msize, sectors, &chs, &heads); if (chs != 0) { basetable->gpt_sectors = sectors; basetable->gpt_heads = heads; } } heads = be16dec(&table->vtoc.nheads); if (heads < 1) goto invalid_label; if (heads != basetable->gpt_heads && !basetable->gpt_fixgeom) basetable->gpt_heads = heads; /* * Except for ATA disks > 32GB, Solaris uses the native geometry * as reported by the target for the labels while da(4) typically * uses a synthetic one so we don't complain too loudly if these * geometries don't match. */ if (bootverbose && (sectors != basetable->gpt_sectors || heads != basetable->gpt_heads)) printf("GEOM: %s: geometry does not match VTOC8 label " "(label: %uh,%us GEOM: %uh,%us).\n", pp->name, heads, sectors, basetable->gpt_heads, basetable->gpt_sectors); table->secpercyl = heads * sectors; cyls = be16dec(&table->vtoc.ncyls); chs = cyls * table->secpercyl; if (chs < 1 || chs > msize) goto invalid_label; basetable->gpt_first = 0; basetable->gpt_last = chs - 1; basetable->gpt_isleaf = 1; withtags = (be32dec(&table->vtoc.sanity) == VTOC_SANITY) ? 
1 : 0; if (!withtags) { printf("GEOM: %s: adding VTOC8 information.\n", pp->name); be32enc(&table->vtoc.version, VTOC_VERSION); bzero(&table->vtoc.volume, VTOC_VOLUME_LEN); be16enc(&table->vtoc.nparts, VTOC8_NPARTS); bzero(&table->vtoc.part, sizeof(table->vtoc.part)); be32enc(&table->vtoc.sanity, VTOC_SANITY); } basetable->gpt_entries = be16dec(&table->vtoc.nparts); if (basetable->gpt_entries < g_part_vtoc8_scheme.gps_minent || basetable->gpt_entries > g_part_vtoc8_scheme.gps_maxent) goto invalid_label; for (index = basetable->gpt_entries - 1; index >= 0; index--) { offset = be32dec(&table->vtoc.map[index].cyl) * table->secpercyl; size = be32dec(&table->vtoc.map[index].nblks); if (size == 0) continue; if (withtags) tag = be16dec(&table->vtoc.part[index].tag); else tag = (index == VTOC_RAW_PART) ? VTOC_TAG_BACKUP : VTOC_TAG_UNASSIGNED; if (index == VTOC_RAW_PART && tag != VTOC_TAG_BACKUP) continue; if (index != VTOC_RAW_PART && tag == VTOC_TAG_BACKUP) continue; entry = g_part_new_entry(basetable, index + 1, offset, offset + size - 1); if (tag == VTOC_TAG_BACKUP) entry->gpe_internal = 1; if (!withtags) be16enc(&table->vtoc.part[index].tag, tag); } return (0); invalid_label: printf("GEOM: %s: invalid VTOC8 label.\n", pp->name); return (EINVAL); } static const char * g_part_vtoc8_type(struct g_part_table *basetable, struct g_part_entry *entry, char *buf, size_t bufsz) { struct g_part_vtoc8_table *table; uint16_t tag; table = (struct g_part_vtoc8_table *)basetable; tag = be16dec(&table->vtoc.part[entry->gpe_index - 1].tag); if (tag == VTOC_TAG_FREEBSD_NANDFS) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_NANDFS)); if (tag == VTOC_TAG_FREEBSD_SWAP) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_SWAP)); if (tag == VTOC_TAG_FREEBSD_UFS) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_UFS)); if (tag == VTOC_TAG_FREEBSD_VINUM) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_VINUM)); if (tag == VTOC_TAG_FREEBSD_ZFS) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_ZFS)); snprintf(buf, bufsz, "!%d", tag); return (buf); } static int g_part_vtoc8_write(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; struct g_part_entry *entry; struct g_part_vtoc8_table *table; uint16_t sum; u_char *p; int error, index, match, offset; pp = cp->provider; table = (struct g_part_vtoc8_table *)basetable; entry = LIST_FIRST(&basetable->gpt_entry); for (index = 0; index < basetable->gpt_entries; index++) { match = (entry != NULL && index == entry->gpe_index - 1) ? 1 : 0; if (match) { if (entry->gpe_deleted) { be16enc(&table->vtoc.part[index].tag, 0); be16enc(&table->vtoc.part[index].flag, 0); be32enc(&table->vtoc.map[index].cyl, 0); be32enc(&table->vtoc.map[index].nblks, 0); } entry = LIST_NEXT(entry, gpe_entry); } } /* Calculate checksum. */ sum = 0; p = (void *)&table->vtoc; for (offset = 0; offset < sizeof(table->vtoc) - 2; offset += 2) sum ^= be16dec(p + offset); be16enc(&table->vtoc.cksum, sum); error = g_write_data(cp, 0, p, pp->sectorsize); return (error); } Index: head/sys/geom/raid/g_raid.c =================================================================== --- head/sys/geom/raid/g_raid.c (revision 326269) +++ head/sys/geom/raid/g_raid.c (revision 326270) @@ -1,2575 +1,2577 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_raid_md_if.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff"); int g_raid_enable = 1; SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RWTUN, &g_raid_enable, 0, "Enable on-disk metadata taste"); u_int g_raid_aggressive_spare = 0; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RWTUN, &g_raid_aggressive_spare, 0, "Use disks without metadata as spare"); u_int g_raid_debug = 0; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid_debug, 0, "Debug level"); int g_raid_read_err_thresh = 10; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RWTUN, &g_raid_read_err_thresh, 0, "Number of read errors equated to disk failure"); u_int g_raid_start_timeout = 30; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RWTUN, &g_raid_start_timeout, 0, "Time to wait for all array components"); static u_int g_raid_clean_time = 5; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RWTUN, &g_raid_clean_time, 0, "Mark volume as clean when idling"); static u_int g_raid_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_raid_name_format = 0; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RWTUN, &g_raid_name_format, 0, "Providers name format."); static u_int g_raid_idle_threshold = 1000000; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RWTUN, &g_raid_idle_threshold, 1000000, "Time in microseconds to consider a volume idle."); #define MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do { \ G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ rv = msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) LIST_HEAD(, g_raid_md_class) g_raid_md_classes = LIST_HEAD_INITIALIZER(g_raid_md_classes); LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes = LIST_HEAD_INITIALIZER(g_raid_tr_classes); LIST_HEAD(, g_raid_volume) g_raid_volumes = LIST_HEAD_INITIALIZER(g_raid_volumes); static 
eventhandler_tag g_raid_post_sync = NULL; static int g_raid_started = 0; static int g_raid_shutdown = 0; static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid_taste; static void g_raid_init(struct g_class *mp); static void g_raid_fini(struct g_class *mp); struct g_class g_raid_class = { .name = G_RAID_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid_ctl, .taste = g_raid_taste, .destroy_geom = g_raid_destroy_geom, .init = g_raid_init, .fini = g_raid_fini }; static void g_raid_destroy_provider(struct g_raid_volume *vol); static int g_raid_update_disk(struct g_raid_disk *disk, u_int event); static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event); static int g_raid_update_volume(struct g_raid_volume *vol, u_int event); static int g_raid_update_node(struct g_raid_softc *sc, u_int event); static void g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_raid_start(struct bio *bp); static void g_raid_start_request(struct bio *bp); static void g_raid_disk_done(struct bio *bp); static void g_raid_poll(struct g_raid_softc *sc); static const char * g_raid_node_event2str(int event) { switch (event) { case G_RAID_NODE_E_WAKE: return ("WAKE"); case G_RAID_NODE_E_START: return ("START"); default: return ("INVALID"); } } const char * g_raid_disk_state2str(int state) { switch (state) { case G_RAID_DISK_S_NONE: return ("NONE"); case G_RAID_DISK_S_OFFLINE: return ("OFFLINE"); case G_RAID_DISK_S_DISABLED: return ("DISABLED"); case G_RAID_DISK_S_FAILED: return ("FAILED"); case G_RAID_DISK_S_STALE_FAILED: return ("STALE_FAILED"); case G_RAID_DISK_S_SPARE: return ("SPARE"); case G_RAID_DISK_S_STALE: return ("STALE"); case G_RAID_DISK_S_ACTIVE: return ("ACTIVE"); default: return ("INVALID"); } } static const char * g_raid_disk_event2str(int event) { switch (event) { case G_RAID_DISK_E_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } const char * g_raid_subdisk_state2str(int state) { switch (state) { case G_RAID_SUBDISK_S_NONE: return ("NONE"); case G_RAID_SUBDISK_S_FAILED: return ("FAILED"); case G_RAID_SUBDISK_S_NEW: return ("NEW"); case G_RAID_SUBDISK_S_REBUILD: return ("REBUILD"); case G_RAID_SUBDISK_S_UNINITIALIZED: return ("UNINITIALIZED"); case G_RAID_SUBDISK_S_STALE: return ("STALE"); case G_RAID_SUBDISK_S_RESYNC: return ("RESYNC"); case G_RAID_SUBDISK_S_ACTIVE: return ("ACTIVE"); default: return ("INVALID"); } } static const char * g_raid_subdisk_event2str(int event) { switch (event) { case G_RAID_SUBDISK_E_NEW: return ("NEW"); case G_RAID_SUBDISK_E_FAILED: return ("FAILED"); case G_RAID_SUBDISK_E_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } const char * g_raid_volume_state2str(int state) { switch (state) { case G_RAID_VOLUME_S_STARTING: return ("STARTING"); case G_RAID_VOLUME_S_BROKEN: return ("BROKEN"); case G_RAID_VOLUME_S_DEGRADED: return ("DEGRADED"); case G_RAID_VOLUME_S_SUBOPTIMAL: return ("SUBOPTIMAL"); case G_RAID_VOLUME_S_OPTIMAL: return ("OPTIMAL"); case G_RAID_VOLUME_S_UNSUPPORTED: return ("UNSUPPORTED"); case G_RAID_VOLUME_S_STOPPED: return ("STOPPED"); default: return ("INVALID"); } } static const char * g_raid_volume_event2str(int event) { switch (event) { case G_RAID_VOLUME_E_UP: return ("UP"); case G_RAID_VOLUME_E_DOWN: return ("DOWN"); case G_RAID_VOLUME_E_START: return ("START"); case G_RAID_VOLUME_E_STARTMD: return ("STARTMD"); default: return ("INVALID"); } } const char * 
g_raid_volume_level2str(int level, int qual) { switch (level) { case G_RAID_VOLUME_RL_RAID0: return ("RAID0"); case G_RAID_VOLUME_RL_RAID1: return ("RAID1"); case G_RAID_VOLUME_RL_RAID3: if (qual == G_RAID_VOLUME_RLQ_R3P0) return ("RAID3-P0"); if (qual == G_RAID_VOLUME_RLQ_R3PN) return ("RAID3-PN"); return ("RAID3"); case G_RAID_VOLUME_RL_RAID4: if (qual == G_RAID_VOLUME_RLQ_R4P0) return ("RAID4-P0"); if (qual == G_RAID_VOLUME_RLQ_R4PN) return ("RAID4-PN"); return ("RAID4"); case G_RAID_VOLUME_RL_RAID5: if (qual == G_RAID_VOLUME_RLQ_R5RA) return ("RAID5-RA"); if (qual == G_RAID_VOLUME_RLQ_R5RS) return ("RAID5-RS"); if (qual == G_RAID_VOLUME_RLQ_R5LA) return ("RAID5-LA"); if (qual == G_RAID_VOLUME_RLQ_R5LS) return ("RAID5-LS"); return ("RAID5"); case G_RAID_VOLUME_RL_RAID6: if (qual == G_RAID_VOLUME_RLQ_R6RA) return ("RAID6-RA"); if (qual == G_RAID_VOLUME_RLQ_R6RS) return ("RAID6-RS"); if (qual == G_RAID_VOLUME_RLQ_R6LA) return ("RAID6-LA"); if (qual == G_RAID_VOLUME_RLQ_R6LS) return ("RAID6-LS"); return ("RAID6"); case G_RAID_VOLUME_RL_RAIDMDF: if (qual == G_RAID_VOLUME_RLQ_RMDFRA) return ("RAIDMDF-RA"); if (qual == G_RAID_VOLUME_RLQ_RMDFRS) return ("RAIDMDF-RS"); if (qual == G_RAID_VOLUME_RLQ_RMDFLA) return ("RAIDMDF-LA"); if (qual == G_RAID_VOLUME_RLQ_RMDFLS) return ("RAIDMDF-LS"); return ("RAIDMDF"); case G_RAID_VOLUME_RL_RAID1E: if (qual == G_RAID_VOLUME_RLQ_R1EA) return ("RAID1E-A"); if (qual == G_RAID_VOLUME_RLQ_R1EO) return ("RAID1E-O"); return ("RAID1E"); case G_RAID_VOLUME_RL_SINGLE: return ("SINGLE"); case G_RAID_VOLUME_RL_CONCAT: return ("CONCAT"); case G_RAID_VOLUME_RL_RAID5E: if (qual == G_RAID_VOLUME_RLQ_R5ERA) return ("RAID5E-RA"); if (qual == G_RAID_VOLUME_RLQ_R5ERS) return ("RAID5E-RS"); if (qual == G_RAID_VOLUME_RLQ_R5ELA) return ("RAID5E-LA"); if (qual == G_RAID_VOLUME_RLQ_R5ELS) return ("RAID5E-LS"); return ("RAID5E"); case G_RAID_VOLUME_RL_RAID5EE: if (qual == G_RAID_VOLUME_RLQ_R5EERA) return ("RAID5EE-RA"); if (qual == G_RAID_VOLUME_RLQ_R5EERS) return ("RAID5EE-RS"); if (qual == G_RAID_VOLUME_RLQ_R5EELA) return ("RAID5EE-LA"); if (qual == G_RAID_VOLUME_RLQ_R5EELS) return ("RAID5EE-LS"); return ("RAID5EE"); case G_RAID_VOLUME_RL_RAID5R: if (qual == G_RAID_VOLUME_RLQ_R5RRA) return ("RAID5R-RA"); if (qual == G_RAID_VOLUME_RLQ_R5RRS) return ("RAID5R-RS"); if (qual == G_RAID_VOLUME_RLQ_R5RLA) return ("RAID5R-LA"); if (qual == G_RAID_VOLUME_RLQ_R5RLS) return ("RAID5R-LS"); return ("RAID5E"); default: return ("UNKNOWN"); } } int g_raid_volume_str2level(const char *str, int *level, int *qual) { *level = G_RAID_VOLUME_RL_UNKNOWN; *qual = G_RAID_VOLUME_RLQ_NONE; if (strcasecmp(str, "RAID0") == 0) *level = G_RAID_VOLUME_RL_RAID0; else if (strcasecmp(str, "RAID1") == 0) *level = G_RAID_VOLUME_RL_RAID1; else if (strcasecmp(str, "RAID3-P0") == 0) { *level = G_RAID_VOLUME_RL_RAID3; *qual = G_RAID_VOLUME_RLQ_R3P0; } else if (strcasecmp(str, "RAID3-PN") == 0 || strcasecmp(str, "RAID3") == 0) { *level = G_RAID_VOLUME_RL_RAID3; *qual = G_RAID_VOLUME_RLQ_R3PN; } else if (strcasecmp(str, "RAID4-P0") == 0) { *level = G_RAID_VOLUME_RL_RAID4; *qual = G_RAID_VOLUME_RLQ_R4P0; } else if (strcasecmp(str, "RAID4-PN") == 0 || strcasecmp(str, "RAID4") == 0) { *level = G_RAID_VOLUME_RL_RAID4; *qual = G_RAID_VOLUME_RLQ_R4PN; } else if (strcasecmp(str, "RAID5-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5RA; } else if (strcasecmp(str, "RAID5-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5RS; } else if (strcasecmp(str, "RAID5") == 0 || 
strcasecmp(str, "RAID5-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5LA; } else if (strcasecmp(str, "RAID5-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5LS; } else if (strcasecmp(str, "RAID6-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6RA; } else if (strcasecmp(str, "RAID6-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6RS; } else if (strcasecmp(str, "RAID6") == 0 || strcasecmp(str, "RAID6-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6LA; } else if (strcasecmp(str, "RAID6-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6LS; } else if (strcasecmp(str, "RAIDMDF-RA") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFRA; } else if (strcasecmp(str, "RAIDMDF-RS") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFRS; } else if (strcasecmp(str, "RAIDMDF") == 0 || strcasecmp(str, "RAIDMDF-LA") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFLA; } else if (strcasecmp(str, "RAIDMDF-LS") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFLS; } else if (strcasecmp(str, "RAID10") == 0 || strcasecmp(str, "RAID1E") == 0 || strcasecmp(str, "RAID1E-A") == 0) { *level = G_RAID_VOLUME_RL_RAID1E; *qual = G_RAID_VOLUME_RLQ_R1EA; } else if (strcasecmp(str, "RAID1E-O") == 0) { *level = G_RAID_VOLUME_RL_RAID1E; *qual = G_RAID_VOLUME_RLQ_R1EO; } else if (strcasecmp(str, "SINGLE") == 0) *level = G_RAID_VOLUME_RL_SINGLE; else if (strcasecmp(str, "CONCAT") == 0) *level = G_RAID_VOLUME_RL_CONCAT; else if (strcasecmp(str, "RAID5E-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ERA; } else if (strcasecmp(str, "RAID5E-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ERS; } else if (strcasecmp(str, "RAID5E") == 0 || strcasecmp(str, "RAID5E-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ELA; } else if (strcasecmp(str, "RAID5E-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ELS; } else if (strcasecmp(str, "RAID5EE-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EERA; } else if (strcasecmp(str, "RAID5EE-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EERS; } else if (strcasecmp(str, "RAID5EE") == 0 || strcasecmp(str, "RAID5EE-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EELA; } else if (strcasecmp(str, "RAID5EE-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EELS; } else if (strcasecmp(str, "RAID5R-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RRA; } else if (strcasecmp(str, "RAID5R-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RRS; } else if (strcasecmp(str, "RAID5R") == 0 || strcasecmp(str, "RAID5R-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RLA; } else if (strcasecmp(str, "RAID5R-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RLS; } else return (-1); return (0); } const char * g_raid_get_diskname(struct g_raid_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_consumer->provider->name); } void g_raid_get_disk_info(struct g_raid_disk *disk) { struct g_consumer *cp = disk->d_consumer; int error, len; /* Read kernel dumping information. 
*/ disk->d_kd.offset = 0; disk->d_kd.length = OFF_MAX; len = sizeof(disk->d_kd); error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); if (error) disk->d_kd.di.dumper = NULL; if (disk->d_kd.di.dumper == NULL) G_RAID_DEBUG1(2, disk->d_softc, "Dumping not supported by %s: %d.", cp->provider->name, error); /* Read BIO_DELETE support. */ error = g_getattr("GEOM::candelete", cp, &disk->d_candelete); if (error) disk->d_candelete = 0; if (!disk->d_candelete) G_RAID_DEBUG1(2, disk->d_softc, "BIO_DELETE not supported by %s: %d.", cp->provider->name, error); } void g_raid_report_disk_state(struct g_raid_disk *disk) { struct g_raid_subdisk *sd; int len, state; uint32_t s; if (disk->d_consumer == NULL) return; if (disk->d_state == G_RAID_DISK_S_DISABLED) { s = G_STATE_ACTIVE; /* XXX */ } else if (disk->d_state == G_RAID_DISK_S_FAILED || disk->d_state == G_RAID_DISK_S_STALE_FAILED) { s = G_STATE_FAILED; } else { state = G_RAID_SUBDISK_S_ACTIVE; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { if (sd->sd_state < state) state = sd->sd_state; } if (state == G_RAID_SUBDISK_S_FAILED) s = G_STATE_FAILED; else if (state == G_RAID_SUBDISK_S_NEW || state == G_RAID_SUBDISK_S_REBUILD) s = G_STATE_REBUILD; else if (state == G_RAID_SUBDISK_S_STALE || state == G_RAID_SUBDISK_S_RESYNC) s = G_STATE_RESYNC; else s = G_STATE_ACTIVE; } len = sizeof(s); g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s); G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.", g_raid_get_diskname(disk), s); } void g_raid_change_disk_state(struct g_raid_disk *disk, int state) { G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.", g_raid_get_diskname(disk), g_raid_disk_state2str(disk->d_state), g_raid_disk_state2str(state)); disk->d_state = state; g_raid_report_disk_state(disk); } void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state) { G_RAID_DEBUG1(0, sd->sd_softc, "Subdisk %s:%d-%s state changed from %s to %s.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]", g_raid_subdisk_state2str(sd->sd_state), g_raid_subdisk_state2str(state)); sd->sd_state = state; if (sd->sd_disk) g_raid_report_disk_state(sd->sd_disk); } void g_raid_change_volume_state(struct g_raid_volume *vol, int state) { G_RAID_DEBUG1(0, vol->v_softc, "Volume %s state changed from %s to %s.", vol->v_name, g_raid_volume_state2str(vol->v_state), g_raid_volume_state2str(state)); vol->v_state = state; } /* * --- Events handling functions --- * Events in geom_raid are used to maintain subdisks and volumes status * from one thread to simplify locking. */ static void g_raid_event_free(struct g_raid_event *ep) { free(ep, M_RAID); } int g_raid_event_send(void *arg, int event, int flags) { struct g_raid_softc *sc; struct g_raid_event *ep; int error; if ((flags & G_RAID_EVENT_VOLUME) != 0) { sc = ((struct g_raid_volume *)arg)->v_softc; } else if ((flags & G_RAID_EVENT_DISK) != 0) { sc = ((struct g_raid_disk *)arg)->d_softc; } else if ((flags & G_RAID_EVENT_SUBDISK) != 0) { sc = ((struct g_raid_subdisk *)arg)->sd_softc; } else { sc = arg; } ep = malloc(sizeof(*ep), M_RAID, sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT); if (ep == NULL) return (ENOMEM); ep->e_tgt = arg; ep->e_event = event; ep->e_flags = flags; ep->e_error = 0; G_RAID_DEBUG1(4, sc, "Sending event %p. 
Waking up %p.", ep, sc); mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); if ((flags & G_RAID_EVENT_WAIT) == 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) { mtx_lock(&sc->sc_queue_mtx); MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_raid_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static void g_raid_event_cancel(struct g_raid_softc *sc, void *tgt) { struct g_raid_event *ep, *tmpep; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if (ep->e_tgt != tgt) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) g_raid_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_queue_mtx); } static int g_raid_event_check(struct g_raid_softc *sc, void *tgt) { struct g_raid_event *ep; int res = 0; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(ep, &sc->sc_events, e_next) { if (ep->e_tgt != tgt) continue; res = 1; break; } mtx_unlock(&sc->sc_queue_mtx); return (res); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_raid_ndisks(struct g_raid_softc *sc, int state) { struct g_raid_disk *disk; u_int n; sx_assert(&sc->sc_lock, SX_LOCKED); n = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == state || state == -1) n++; } return (n); } /* * Return the number of subdisks in given state. * If state is equal to -1, count all connected disks. */ u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state) { struct g_raid_subdisk *subdisk; struct g_raid_softc *sc; u_int i, n ; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); n = 0; for (i = 0; i < vol->v_disks_count; i++) { subdisk = &vol->v_subdisks[i]; if ((state == -1 && subdisk->sd_state != G_RAID_SUBDISK_S_NONE) || subdisk->sd_state == state) n++; } return (n); } /* * Return the first subdisk in given state. * If state is equal to -1, then the first connected disks. 
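 *
 * The -1 wildcard matches any state except G_RAID_SUBDISK_S_NONE, the
 * same convention g_raid_nsubdisks() uses above. Reduced to its core,
 * the per-subdisk predicate is (illustrative, not a driver function):
 *
 *	static int
 *	sd_matches(int sd_state, int want)
 *	{
 *		if (want == -1)
 *			return (sd_state != G_RAID_SUBDISK_S_NONE);
 *		return (sd_state == want);
 *	}
 *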
*/ struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol, int state) { struct g_raid_subdisk *sd; struct g_raid_softc *sc; u_int i; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if ((state == -1 && sd->sd_state != G_RAID_SUBDISK_S_NONE) || sd->sd_state == state) return (sd); } return (NULL); } struct g_consumer * g_raid_open_consumer(struct g_raid_softc *sc, const char *name) { struct g_consumer *cp; struct g_provider *pp; g_topology_assert(); if (strncmp(name, "/dev/", 5) == 0) name += 5; pp = g_provider_by_name(name); if (pp == NULL) return (NULL); cp = g_new_consumer(sc->sc_geom); cp->flags |= G_CF_DIRECT_RECEIVE; if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); return (NULL); } if (g_access(cp, 1, 1, 1) != 0) { g_detach(cp); g_destroy_consumer(cp); return (NULL); } return (cp); } static u_int g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } u_int g_raid_nopens(struct g_raid_softc *sc) { struct g_raid_volume *vol; u_int opens; opens = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_provider_open != 0) opens++; } return (opens); } static int g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_RAID_DEBUG1(2, sc, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_raid_nrequests(sc, cp) > 0) { G_RAID_DEBUG1(2, sc, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_raid_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert_not(); g_topology_lock(); cp->private = NULL; if (g_raid_consumer_is_busy(sc, cp)) goto out; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. 
*/ g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL); goto out; } G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); out: g_topology_unlock(); } static void g_raid_orphan(struct g_consumer *cp) { struct g_raid_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED, G_RAID_EVENT_DISK); } static void g_raid_clean(struct g_raid_volume *vol, int acw) { struct g_raid_softc *sc; int timeout; sc = vol->v_softc; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) // return; if (!vol->v_dirty) return; if (vol->v_writes > 0) return; if (acw > 0 || (acw == -1 && vol->v_provider != NULL && vol->v_provider->acw > 0)) { timeout = g_raid_clean_time - (time_uptime - vol->v_last_write); if (!g_raid_shutdown && timeout > 0) return; } vol->v_dirty = 0; G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.", vol->v_name); g_raid_write_metadata(sc, vol, NULL, NULL); } static void g_raid_dirty(struct g_raid_volume *vol) { struct g_raid_softc *sc; sc = vol->v_softc; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) // return; vol->v_dirty = 1; G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.", vol->v_name); g_raid_write_metadata(sc, vol, NULL, NULL); } void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; int i; vol = tr->tro_volume; sc = vol->v_softc; /* * Allocate all bios before sending any request, so we can return * ENOMEM in nice and clean way. */ bioq_init(&queue); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) continue; cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_kerneldump_common_done(struct bio *bp) { bp->bio_flags |= BIO_DONE; } int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct bio bp; vol = tr->tro_volume; sc = vol->v_softc; g_reset_bio(&bp); bp.bio_cmd = BIO_WRITE; bp.bio_done = g_raid_tr_kerneldump_common_done; bp.bio_attribute = NULL; bp.bio_offset = offset; bp.bio_length = length; bp.bio_data = virtual; bp.bio_to = vol->v_provider; g_raid_start(&bp); while (!(bp.bio_flags & BIO_DONE)) { G_RAID_DEBUG1(4, sc, "Poll..."); g_raid_poll(sc); DELAY(10); } return (bp.bio_error != 0 ? 
EIO : 0); } static int g_raid_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_volume *vol; int error; vol = (struct g_raid_volume *)arg; G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.", (long long unsigned)offset, (long long unsigned)length); error = G_RAID_TR_KERNELDUMP(vol->v_tr, virtual, physical, offset, length); return (error); } static void g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp) { struct g_kerneldump *gkd; struct g_provider *pp; struct g_raid_volume *vol; gkd = (struct g_kerneldump*)bp->bio_data; pp = bp->bio_to; vol = pp->private; g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)", pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); gkd->di.dumper = g_raid_dump; gkd->di.priv = vol; gkd->di.blocksize = vol->v_sectorsize; gkd->di.maxiosize = DFLTPHYS; gkd->di.mediaoffset = gkd->offset; if ((gkd->offset + gkd->length) > vol->v_mediasize) gkd->length = vol->v_mediasize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void g_raid_candelete(struct g_raid_softc *sc, struct bio *bp) { struct g_provider *pp; struct g_raid_volume *vol; struct g_raid_subdisk *sd; int *val; int i; val = (int *)bp->bio_data; pp = bp->bio_to; vol = pp->private; *val = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) continue; if (sd->sd_disk->d_candelete) { *val = 1; break; } } g_io_deliver(bp, 0); } static void g_raid_start(struct bio *bp) { struct g_raid_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_raid_start() should not be called at all. */ // KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING, // ("Provider's error should be set (error=%d)(mirror=%s).", // bp->bio_to->error, bp->bio_to->name)); G_RAID_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: break; case BIO_GETATTR: if (!strcmp(bp->bio_attribute, "GEOM::candelete")) g_raid_candelete(sc, bp); else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) g_raid_kerneldump(sc, bp); else g_io_deliver(bp, EOPNOTSUPP); return; default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (!dumping) { G_RAID_DEBUG1(4, sc, "Waking up %p.", sc); wakeup(sc); } } static int g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len) { /* * 5 cases: * (1) bp entirely below NO * (2) bp entirely above NO * (3) bp start below, but end in range YES * (4) bp entirely within YES * (5) bp starts within, ends above YES * * lock range 10-19 (offset 10 length 10) * (1) 1-5: first if kicks it out * (2) 30-35: second if kicks it out * (3) 5-15: passes both ifs * (4) 12-14: passes both ifs * (5) 19-20: passes both */ off_t lend = lstart + len - 1; off_t bstart = bp->bio_offset; off_t bend = bp->bio_offset + bp->bio_length - 1; if (bend < lstart) return (0); if (lend < bstart) return (0); return (1); } static int g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp) { struct g_raid_lock *lp; sx_assert(&vol->v_softc->sc_lock, SX_LOCKED); LIST_FOREACH(lp, &vol->v_locks, l_next) { if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length)) return (1); } return (0); } static void g_raid_start_request(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = bp->bio_to->geom->softc; sx_assert(&sc->sc_lock, 
SX_LOCKED); vol = bp->bio_to->private; /* * Check to see if this item is in a locked range. If so, * queue it to our locked queue and return. We'll requeue * it when the range is unlocked. Internal I/O for the * rebuild/rescan/recovery process is excluded from this * check so we can actually do the recovery. */ if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) && g_raid_is_in_locked_range(vol, bp)) { G_RAID_LOGREQ(3, bp, "Defer request."); bioq_insert_tail(&vol->v_locked, bp); return; } /* * If we're actually going to do the write/delete, then * update the idle stats for the volume. */ if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { if (!vol->v_dirty) g_raid_dirty(vol); vol->v_writes++; } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. Then tell * the transformation layer to start the I/O. */ bioq_insert_tail(&vol->v_inflight, bp); G_RAID_LOGREQ(4, bp, "Request started"); G_RAID_TR_IOSTART(vol->v_tr, bp); } static void g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp) { off_t off, len; struct bio *nbp; struct g_raid_lock *lp; vol->v_pending_lock = 0; LIST_FOREACH(lp, &vol->v_locks, l_next) { if (lp->l_pending) { off = lp->l_offset; len = lp->l_length; lp->l_pending = 0; TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) { if (g_raid_bio_overlaps(nbp, off, len)) lp->l_pending++; } if (lp->l_pending) { vol->v_pending_lock = 1; G_RAID_DEBUG1(4, vol->v_softc, "Deferred lock(%jd, %jd) has %d pending", (intmax_t)off, (intmax_t)(off + len), lp->l_pending); continue; } G_RAID_DEBUG1(4, vol->v_softc, "Deferred lock of %jd to %jd completed", (intmax_t)off, (intmax_t)(off + len)); G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); } } } void g_raid_iodone(struct bio *bp, int error) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = bp->bio_to->geom->softc; sx_assert(&sc->sc_lock, SX_LOCKED); vol = bp->bio_to->private; G_RAID_LOGREQ(3, bp, "Request done: %d.", error); /* Update stats if we done write/delete. */ if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { vol->v_writes--; vol->v_last_write = time_uptime; } bioq_remove(&vol->v_inflight, bp); if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp)) g_raid_finish_with_locked_ranges(vol, bp); getmicrouptime(&vol->v_last_done); g_io_deliver(bp, error); } int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, struct bio *ignore, void *argp) { struct g_raid_softc *sc; struct g_raid_lock *lp; struct bio *bp; sc = vol->v_softc; lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO); LIST_INSERT_HEAD(&vol->v_locks, lp, l_next); lp->l_offset = off; lp->l_length = len; lp->l_callback_arg = argp; lp->l_pending = 0; TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) { if (bp != ignore && g_raid_bio_overlaps(bp, off, len)) lp->l_pending++; } /* * If there are any writes that are pending, we return EBUSY. All * callers will have to wait until all pending writes clear. 
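 *
 * "Pending" counts the in-flight bios that intersect the range, using
 * g_raid_bio_overlaps() above: two closed intervals overlap unless one
 * ends before the other starts. The same test on bare offsets, as a
 * hypothetical helper:
 *
 *	static int
 *	ranges_overlap(off_t astart, off_t alen, off_t bstart, off_t blen)
 *	{
 *		if (astart + alen - 1 < bstart)	// a entirely below b
 *			return (0);
 *		if (bstart + blen - 1 < astart)	// b entirely below a
 *			return (0);
 *		return (1);			// boundary cases overlap
 *	}
 *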
*/ if (lp->l_pending > 0) { vol->v_pending_lock = 1; G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend", (intmax_t)off, (intmax_t)(off+len), lp->l_pending); return (EBUSY); } G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd", (intmax_t)off, (intmax_t)(off+len)); G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); return (0); } int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len) { struct g_raid_lock *lp; struct g_raid_softc *sc; struct bio *bp; sc = vol->v_softc; LIST_FOREACH(lp, &vol->v_locks, l_next) { if (lp->l_offset == off && lp->l_length == len) { LIST_REMOVE(lp, l_next); /* XXX * Right now we just put them all back on the queue * and hope for the best; any ranges that are still * locked will simply land back on this list when * the worker thread runs. * XXX */ G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd", (intmax_t)lp->l_offset, (intmax_t)(lp->l_offset+lp->l_length)); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&vol->v_locked)) != NULL) bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); free(lp, M_RAID); return (0); } } return (EINVAL); } void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp) { struct g_consumer *cp; struct g_raid_disk *disk, *tdisk; bp->bio_caller1 = sd; /* * Make sure that the disk is present. Generally it is the task of * the transformation layers not to send requests to absent disks, but * it is better to be safe and report the situation than be sorry. */ if (sd->sd_disk == NULL) { G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!"); nodisk: bp->bio_from = NULL; bp->bio_to = NULL; bp->bio_error = ENXIO; g_raid_disk_done(bp); return; } disk = sd->sd_disk; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_FAILED) { G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a " "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); goto nodisk; } cp = disk->d_consumer; bp->bio_from = cp; bp->bio_to = cp->provider; cp->index++; /* Update the average disk load.
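* This is an exponentially weighted moving average: each pass keeps 7/8 of the previous d_load and blends in the consumer's current outstanding request count scaled by G_RAID_SUBDISK_LOAD_SCALE (256). A worked example with hypothetical numbers: starting from d_load == 0 with index == 2, the first update yields (2 * 256 + 0 * 7) / 8 == 64.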
*/ TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) { if (tdisk->d_consumer == NULL) tdisk->d_load = 0; else tdisk->d_load = (tdisk->d_consumer->index * G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8; } disk->d_last_offset = bp->bio_offset + bp->bio_length; if (dumping) { G_RAID_LOGREQ(3, bp, "Sending dumping request."); if (bp->bio_cmd == BIO_WRITE) { bp->bio_error = g_raid_subdisk_kerneldump(sd, bp->bio_data, 0, bp->bio_offset, bp->bio_length); } else bp->bio_error = EOPNOTSUPP; g_raid_disk_done(bp); } else { bp->bio_done = g_raid_disk_done; bp->bio_offset += sd->sd_offset; G_RAID_LOGREQ(3, bp, "Sending request."); g_io_request(bp, cp); } } int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, void *virtual, vm_offset_t physical, off_t offset, size_t length) { if (sd->sd_disk == NULL) return (ENXIO); if (sd->sd_disk->d_kd.di.dumper == NULL) return (EOPNOTSUPP); return (dump_write(&sd->sd_disk->d_kd.di, virtual, physical, sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset, length)); } static void g_raid_disk_done(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; sd = bp->bio_caller1; sc = sd->sd_softc; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (!dumping) wakeup(sc); } static void g_raid_disk_done_request(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_disk *disk; struct g_raid_subdisk *sd; struct g_raid_volume *vol; g_topology_assert_not(); G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error); sd = bp->bio_caller1; sc = sd->sd_softc; vol = sd->sd_volume; if (bp->bio_from != NULL) { bp->bio_from->index--; disk = bp->bio_from->private; if (disk == NULL) g_raid_kill_consumer(sc, bp->bio_from); } bp->bio_offset -= sd->sd_offset; G_RAID_TR_IODONE(vol->v_tr, sd, bp); } static void g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep) { if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0) ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event); else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0) ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event); else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0) ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event); else ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event); if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_raid_event_free(ep); } else { ep->e_flags |= G_RAID_EVENT_DONE; G_RAID_DEBUG1(4, sc, "Waking up %p.", ep); mtx_lock(&sc->sc_queue_mtx); wakeup(ep); mtx_unlock(&sc->sc_queue_mtx); } } /* * Worker thread. */ static void g_raid_worker(void *arg) { struct g_raid_softc *sc; struct g_raid_event *ep; struct g_raid_volume *vol; struct bio *bp; struct timeval now, t; int timeout, rv; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { mtx_lock(&sc->sc_queue_mtx); /* * First take a look at events. * This is important to handle events before any I/O requests. 
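* (For instance, a queued G_RAID_VOLUME_E_DOWN event should tear down the volume's provider before any bios sitting behind it in sc_queue are dispatched against state that is about to change.)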
*/ bp = NULL; vol = NULL; rv = 0; ep = TAILQ_FIRST(&sc->sc_events); if (ep != NULL) TAILQ_REMOVE(&sc->sc_events, ep, e_next); else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) ; else { getmicrouptime(&now); t = now; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (bioq_first(&vol->v_inflight) == NULL && vol->v_tr && timevalcmp(&vol->v_last_done, &t, < )) t = vol->v_last_done; } timevalsub(&t, &now); timeout = g_raid_idle_threshold + t.tv_sec * 1000000 + t.tv_usec; if (timeout > 0) { /* * Two steps to avoid overflows at HZ=1000 * and idle timeouts > 2.1s. Some rounding * errors can occur, but they are < 1tick, * which is deemed to be close enough for * this purpose. */ int micpertic = 1000000 / hz; timeout = (timeout + micpertic - 1) / micpertic; sx_xunlock(&sc->sc_lock); MSLEEP(rv, sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "-", timeout); sx_xlock(&sc->sc_lock); goto process; } else rv = EWOULDBLOCK; } mtx_unlock(&sc->sc_queue_mtx); process: if (ep != NULL) { g_raid_handle_event(sc, ep); } else if (bp != NULL) { if (bp->bio_to != NULL && bp->bio_to->geom == sc->sc_geom) g_raid_start_request(bp); else g_raid_disk_done_request(bp); } else if (rv == EWOULDBLOCK) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { g_raid_clean(vol, -1); if (bioq_first(&vol->v_inflight) == NULL && vol->v_tr) { t.tv_sec = g_raid_idle_threshold / 1000000; t.tv_usec = g_raid_idle_threshold % 1000000; timevaladd(&t, &vol->v_last_done); getmicrouptime(&now); if (timevalcmp(&t, &now, <= )) { G_RAID_TR_IDLE(vol->v_tr); vol->v_last_done = now; } } } } if (sc->sc_stopping == G_RAID_DESTROY_HARD) g_raid_destroy_node(sc, 1); /* May not return. */ } } static void g_raid_poll(struct g_raid_softc *sc) { struct g_raid_event *ep; struct bio *bp; sx_xlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = TAILQ_FIRST(&sc->sc_events); if (ep != NULL) { TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_queue_mtx); g_raid_handle_event(sc, ep); goto out; } bp = bioq_takefirst(&sc->sc_queue); if (bp != NULL) { mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from == NULL || bp->bio_from->geom != sc->sc_geom) g_raid_start_request(bp); else g_raid_disk_done_request(bp); } out: sx_xunlock(&sc->sc_lock); } static void g_raid_launch_provider(struct g_raid_volume *vol) { struct g_raid_disk *disk; struct g_raid_subdisk *sd; struct g_raid_softc *sc; struct g_provider *pp; char name[G_RAID_MAX_VOLUMENAME]; off_t off; int i; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); /* Try to name provider with volume name. */ snprintf(name, sizeof(name), "raid/%s", vol->v_name); if (g_raid_name_format == 0 || vol->v_name[0] == 0 || g_provider_by_name(name) != NULL) { /* Otherwise use sequential volume number. 
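* (Hypothetical example: with g_raid_name_format left at zero, or when a label is empty or would collide with an existing provider name, a volume labeled "data" with v_global_id == 0 shows up as "raid/r0" rather than "raid/data".)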
*/ snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id); } pp = g_new_providerf(sc->sc_geom, "%s", name); pp->flags |= G_PF_DIRECT_RECEIVE; if (vol->v_tr->tro_class->trc_accept_unmapped) { pp->flags |= G_PF_ACCEPT_UNMAPPED; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) continue; if ((sd->sd_disk->d_consumer->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0) pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } pp->private = vol; pp->mediasize = vol->v_mediasize; pp->sectorsize = vol->v_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE || vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) { if ((disk = vol->v_subdisks[0].sd_disk) != NULL && disk->d_consumer != NULL && disk->d_consumer->provider != NULL) { pp->stripesize = disk->d_consumer->provider->stripesize; off = disk->d_consumer->provider->stripeoffset; pp->stripeoffset = off + vol->v_subdisks[0].sd_offset; if (off > 0) pp->stripeoffset %= off; } if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) { pp->stripesize *= (vol->v_disks_count - 1); pp->stripeoffset *= (vol->v_disks_count - 1); } } else pp->stripesize = vol->v_strip_size; vol->v_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.", pp->name, vol->v_name); } static void g_raid_destroy_provider(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_provider *pp; struct bio *bp, *tmp; g_topology_assert_not(); sc = vol->v_softc; pp = vol->v_provider; KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name)); g_topology_lock(); g_error_provider(pp, ENXIO); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) { if (bp->bio_to != pp) continue; bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.", pp->name, vol->v_name); g_wither_provider(pp, ENXIO); g_topology_unlock(); vol->v_provider = NULL; } /* * Update device state. */ static int g_raid_update_volume(struct g_raid_volume *vol, u_int event) { struct g_raid_softc *sc; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for volume %s.", g_raid_volume_event2str(event), vol->v_name); switch (event) { case G_RAID_VOLUME_E_DOWN: if (vol->v_provider != NULL) g_raid_destroy_provider(vol); break; case G_RAID_VOLUME_E_UP: if (vol->v_provider == NULL) g_raid_launch_provider(vol); break; case G_RAID_VOLUME_E_START: if (vol->v_tr) G_RAID_TR_START(vol->v_tr); return (0); default: if (sc->sc_md) G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event); return (0); } /* Manage root mount release. */ if (vol->v_starting) { vol->v_starting = 0; G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount); root_mount_rel(vol->v_rootmount); vol->v_rootmount = NULL; } if (vol->v_stopping && vol->v_provider_open == 0) g_raid_destroy_volume(vol); return (0); } /* * Update subdisk state. */ static int g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = sd->sd_softc; vol = sd->sd_volume; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.", g_raid_subdisk_event2str(event), vol->v_name, sd->sd_pos, sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); if (vol->v_tr) G_RAID_TR_EVENT(vol->v_tr, sd, event); return (0); } /* * Update disk state. */ static int g_raid_update_disk(struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for disk %s.", g_raid_disk_event2str(event), g_raid_get_diskname(disk)); if (sc->sc_md) G_RAID_MD_EVENT(sc->sc_md, disk, event); return (0); } /* * Node event. */ static int g_raid_update_node(struct g_raid_softc *sc, u_int event) { sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for the array.", g_raid_node_event2str(event)); if (event == G_RAID_NODE_E_WAKE) return (0); if (sc->sc_md) G_RAID_MD_EVENT(sc->sc_md, NULL, event); return (0); } static int g_raid_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_raid_volume *vol; struct g_raid_softc *sc; int dcw, opens, error = 0; g_topology_assert(); sc = pp->geom->softc; vol = pp->private; KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name)); G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); dcw = pp->acw + acw; g_topology_unlock(); sx_xlock(&sc->sc_lock); /* Deny new opens while dying. */ if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) { error = ENXIO; goto out; } /* Deny write opens for read-only volumes. */ if (vol->v_read_only && acw > 0) { error = EROFS; goto out; } if (dcw == 0) g_raid_clean(vol, dcw); vol->v_provider_open += acr + acw + ace; /* Handle delayed node destruction. */ if (sc->sc_stopping == G_RAID_DESTROY_DELAYED && vol->v_provider_open == 0) { /* Count open volumes. */ opens = g_raid_nopens(sc); if (opens == 0) { sc->sc_stopping = G_RAID_DESTROY_HARD; /* Wake up worker to make it selfdestruct. */ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } } /* Handle open volume destruction. 
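* (That is, a volume marked stopping while it was still open is reclaimed here on its last close, once v_provider_open drops back to zero.)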
*/ if (vol->v_stopping && vol->v_provider_open == 0) g_raid_destroy_volume(vol); out: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } struct g_raid_softc * g_raid_create_node(struct g_class *mp, const char *name, struct g_raid_md_object *md) { struct g_raid_softc *sc; struct g_geom *gp; int error; g_topology_assert(); G_RAID_DEBUG(1, "Creating array %s.", name); gp = g_new_geomf(mp, "%s", name); sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO); gp->start = g_raid_start; gp->orphan = g_raid_orphan; gp->access = g_raid_access; gp->dumpconf = g_raid_dumpconf; sc->sc_md = md; sc->sc_geom = gp; sc->sc_flags = 0; TAILQ_INIT(&sc->sc_volumes); TAILQ_INIT(&sc->sc_disks); sx_init(&sc->sc_lock, "graid:lock"); mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF); TAILQ_INIT(&sc->sc_events); bioq_init(&sc->sc_queue); gp->softc = sc; error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0, "g_raid %s", name); if (error != 0) { G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc, M_RAID); return (NULL); } G_RAID_DEBUG1(0, sc, "Array %s created.", name); return (sc); } struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id) { struct g_raid_volume *vol, *vol1; int i; G_RAID_DEBUG1(1, sc, "Creating volume %s.", name); vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO); vol->v_softc = sc; strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME); vol->v_state = G_RAID_VOLUME_S_STARTING; vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN; vol->v_rotate_parity = 1; bioq_init(&vol->v_inflight); bioq_init(&vol->v_locked); LIST_INIT(&vol->v_locks); for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { vol->v_subdisks[i].sd_softc = sc; vol->v_subdisks[i].sd_volume = vol; vol->v_subdisks[i].sd_pos = i; vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE; } /* Find free ID for this volume. */ g_topology_lock(); vol1 = vol; if (id >= 0) { LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { if (vol1->v_global_id == id) break; } } if (vol1 != NULL) { for (id = 0; ; id++) { LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { if (vol1->v_global_id == id) break; } if (vol1 == NULL) break; } } vol->v_global_id = id; LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next); g_topology_unlock(); /* Delay root mounting. 
*/ vol->v_rootmount = root_mount_hold("GRAID"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount); vol->v_starting = 1; TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next); return (vol); } struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc) { struct g_raid_disk *disk; G_RAID_DEBUG1(1, sc, "Creating disk."); disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO); disk->d_softc = sc; disk->d_state = G_RAID_DISK_S_NONE; TAILQ_INIT(&disk->d_subdisks); TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next); return (disk); } int g_raid_start_volume(struct g_raid_volume *vol) { struct g_raid_tr_class *class; struct g_raid_tr_object *obj; int status; G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name); LIST_FOREACH(class, &g_raid_tr_classes, trc_list) { if (!class->trc_enable) continue; G_RAID_DEBUG1(2, vol->v_softc, "Tasting volume %s for %s transformation.", vol->v_name, class->name); obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->tro_class = class; obj->tro_volume = vol; status = G_RAID_TR_TASTE(obj, vol); if (status != G_RAID_TR_TASTE_FAIL) break; kobj_delete((kobj_t)obj, M_RAID); } if (class == NULL) { G_RAID_DEBUG1(0, vol->v_softc, "No transformation module found for %s.", vol->v_name); vol->v_tr = NULL; g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED); g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); return (-1); } G_RAID_DEBUG1(2, vol->v_softc, "Transformation module %s chosen for %s.", class->name, vol->v_name); vol->v_tr = obj; return (0); } int g_raid_destroy_node(struct g_raid_softc *sc, int worker) { struct g_raid_volume *vol, *tmpv; struct g_raid_disk *disk, *tmpd; int error = 0; sc->sc_stopping = G_RAID_DESTROY_HARD; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) { if (g_raid_destroy_volume(vol)) error = EBUSY; } if (error) return (error); TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) { if (g_raid_destroy_disk(disk)) error = EBUSY; } if (error) return (error); if (sc->sc_md) { G_RAID_MD_FREE(sc->sc_md); kobj_delete((kobj_t)sc->sc_md, M_RAID); sc->sc_md = NULL; } if (sc->sc_geom != NULL) { G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name); g_topology_lock(); sc->sc_geom->softc = NULL; g_wither_geom(sc->sc_geom, ENXIO); g_topology_unlock(); sc->sc_geom = NULL; } else G_RAID_DEBUG(1, "Array destroyed."); if (worker) { g_raid_event_cancel(sc, sc); mtx_destroy(&sc->sc_queue_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); wakeup(&sc->sc_stopping); free(sc, M_RAID); curthread->td_pflags &= ~TDP_GEOM; G_RAID_DEBUG(1, "Thread exiting."); kproc_exit(0); } else { /* Wake up worker to make it selfdestruct. 
*/ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } return (0); } int g_raid_destroy_volume(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_disk *disk; int i; sc = vol->v_softc; G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name); vol->v_stopping = 1; if (vol->v_state != G_RAID_VOLUME_S_STOPPED) { if (vol->v_tr) { G_RAID_TR_STOP(vol->v_tr); return (EBUSY); } else vol->v_state = G_RAID_VOLUME_S_STOPPED; } if (g_raid_event_check(sc, vol) != 0) return (EBUSY); if (vol->v_provider != NULL) return (EBUSY); if (vol->v_provider_open != 0) return (EBUSY); if (vol->v_tr) { G_RAID_TR_FREE(vol->v_tr); kobj_delete((kobj_t)vol->v_tr, M_RAID); vol->v_tr = NULL; } if (vol->v_rootmount) root_mount_rel(vol->v_rootmount); g_topology_lock(); LIST_REMOVE(vol, v_global_next); g_topology_unlock(); TAILQ_REMOVE(&sc->sc_volumes, vol, v_next); for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { g_raid_event_cancel(sc, &vol->v_subdisks[i]); disk = vol->v_subdisks[i].sd_disk; if (disk == NULL) continue; TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next); } G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name); if (sc->sc_md) G_RAID_MD_FREE_VOLUME(sc->sc_md, vol); g_raid_event_cancel(sc, vol); free(vol, M_RAID); if (sc->sc_stopping == G_RAID_DESTROY_HARD) { /* Wake up worker to let it selfdestruct. */ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } return (0); } int g_raid_destroy_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmp; sc = disk->d_softc; G_RAID_DEBUG1(2, sc, "Destroying disk."); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next); sd->sd_disk = NULL; } TAILQ_REMOVE(&sc->sc_disks, disk, d_next); if (sc->sc_md) G_RAID_MD_FREE_DISK(sc->sc_md, disk); g_raid_event_cancel(sc, disk); free(disk, M_RAID); return (0); } int g_raid_destroy(struct g_raid_softc *sc, int how) { int error, opens; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); /* Count open volumes. */ opens = g_raid_nopens(sc); /* React to volumes that are still open. */ if (opens > 0) { switch (how) { case G_RAID_DESTROY_SOFT: G_RAID_DEBUG1(1, sc, "%d volumes are still open.", opens); sx_xunlock(&sc->sc_lock); return (EBUSY); case G_RAID_DESTROY_DELAYED: G_RAID_DEBUG1(1, sc, "Array will be destroyed on last close."); sc->sc_stopping = G_RAID_DESTROY_DELAYED; sx_xunlock(&sc->sc_lock); return (EBUSY); case G_RAID_DESTROY_HARD: G_RAID_DEBUG1(1, sc, "%d volumes are still open.", opens); } } /* Mark node for destruction. */ sc->sc_stopping = G_RAID_DESTROY_HARD; /* Wake up worker to let it selfdestruct. */ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); /* Sleep until node destroyed. */ error = sx_sleep(&sc->sc_stopping, &sc->sc_lock, PRIBIO | PDROP, "r:destroy", hz * 3); return (error == EWOULDBLOCK ?
EBUSY : 0); } static void g_raid_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp, *geom; struct g_raid_md_class *class; struct g_raid_md_object *obj; int status; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); if (!g_raid_enable) return (NULL); G_RAID_DEBUG(2, "Tasting provider %s.", pp->name); geom = NULL; status = G_RAID_MD_TASTE_FAIL; gp = g_new_geomf(mp, "raid:taste"); /* * This orphan function should never be called. */ gp->orphan = g_raid_taste_orphan; cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_RECEIVE; g_attach(cp, pp); if (g_access(cp, 1, 0, 0) != 0) goto ofail; LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { if (!class->mdc_enable) continue; G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.", pp->name, class->name); obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->mdo_class = class; status = G_RAID_MD_TASTE(obj, mp, cp, &geom); if (status != G_RAID_MD_TASTE_NEW) kobj_delete((kobj_t)obj, M_RAID); if (status != G_RAID_MD_TASTE_FAIL) break; } if (status == G_RAID_MD_TASTE_FAIL) (void)g_access(cp, -1, 0, 0); ofail: g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name); return (geom); } int g_raid_create_node_format(const char *format, struct gctl_req *req, struct g_geom **gp) { struct g_raid_md_class *class; struct g_raid_md_object *obj; int status; G_RAID_DEBUG(2, "Creating array for %s metadata.", format); LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { if (strcasecmp(class->name, format) == 0) break; } if (class == NULL) { G_RAID_DEBUG(1, "No support for %s metadata.", format); return (G_RAID_MD_TASTE_FAIL); } obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->mdo_class = class; status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp); if (status != G_RAID_MD_TASTE_NEW) kobj_delete((kobj_t)obj, M_RAID); return (status); } static int g_raid_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_raid_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT); g_topology_lock(); return (error); } void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { if (sc->sc_stopping == G_RAID_DESTROY_HARD) return; if (sc->sc_md) G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk); } void g_raid_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { if (disk == NULL) disk = sd->sd_disk; if (disk == NULL) { G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!"); return; } if (disk->d_state != G_RAID_DISK_S_ACTIVE) { G_RAID_DEBUG1(0, sc, "Warning!
Fail request to a disk in a " "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); return; } if (sc->sc_md) G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk); } static void g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; int i, s; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { vol = pp->private; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s<descr>%s %s volume</descr>\n", indent, sc->sc_md->mdo_class->name, g_raid_volume_level2str(vol->v_raid_level, vol->v_raid_level_qualifier)); sbuf_printf(sb, "%s<Label>%s</Label>\n", indent, vol->v_name); sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent, g_raid_volume_level2str(vol->v_raid_level, vol->v_raid_level_qualifier)); sbuf_printf(sb, "%s<Transformation>%s</Transformation>\n", indent, vol->v_tr ? vol->v_tr->tro_class->name : "NONE"); sbuf_printf(sb, "%s<Components>%u</Components>\n", indent, vol->v_disks_count); sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent, vol->v_strip_size); sbuf_printf(sb, "%s<State>%s</State>\n", indent, g_raid_volume_state2str(vol->v_state)); sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent, vol->v_dirty ? "Yes" : "No"); sbuf_printf(sb, "%s<SubdisksPresent>", indent); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk != NULL && sd->sd_disk->d_consumer != NULL) { sbuf_printf(sb, "%s ", g_raid_get_diskname(sd->sd_disk)); } else { sbuf_printf(sb, "NONE "); } sbuf_printf(sb, "(%s", g_raid_subdisk_state2str(sd->sd_state)); if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { sbuf_printf(sb, " %d%%", (int)(sd->sd_rebuild_pos * 100 / sd->sd_size)); } sbuf_printf(sb, ")"); if (i + 1 < vol->v_disks_count) sbuf_printf(sb, ", "); } sbuf_printf(sb, "</SubdisksPresent>\n"); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else if (cp != NULL) { disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s<State>%s", indent, g_raid_disk_state2str(disk->d_state)); if (!TAILQ_EMPTY(&disk->d_subdisks)) { sbuf_printf(sb, " ("); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { sbuf_printf(sb, "%s", g_raid_subdisk_state2str(sd->sd_state)); if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { sbuf_printf(sb, " %d%%", (int)(sd->sd_rebuild_pos * 100 / sd->sd_size)); } if (TAILQ_NEXT(sd, sd_next)) sbuf_printf(sb, ", "); } sbuf_printf(sb, ")"); } sbuf_printf(sb, "</State>\n"); sbuf_printf(sb, "%s<Subdisks>", indent); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { sbuf_printf(sb, "r%d(%s):%d@%ju", sd->sd_volume->v_global_id, sd->sd_volume->v_name, sd->sd_pos, sd->sd_offset); if (TAILQ_NEXT(sd, sd_next)) sbuf_printf(sb, ", "); } sbuf_printf(sb, "</Subdisks>\n"); sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent, disk->d_read_errs); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); if (sc->sc_md) { sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent, sc->sc_md->mdo_class->name); } if (!TAILQ_EMPTY(&sc->sc_volumes)) { s = 0xff; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_state < s) s = vol->v_state; } sbuf_printf(sb, "%s<State>%s</State>\n", indent, g_raid_volume_state2str(s)); } sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_raid_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_raid_softc *sc; struct g_raid_volume *vol; mp = arg; g_topology_lock(); g_raid_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); TAILQ_FOREACH(vol,
&sc->sc_volumes, v_next) g_raid_clean(vol, -1); g_cancel_event(sc); g_raid_destroy(sc, G_RAID_DESTROY_DELAYED); g_topology_lock(); } g_topology_unlock(); } static void g_raid_init(struct g_class *mp) { g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_raid_post_sync == NULL) G_RAID_DEBUG(0, "Warning! Cannot register shutdown event."); g_raid_started = 1; } static void g_raid_fini(struct g_class *mp) { if (g_raid_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync); g_raid_started = 0; } int g_raid_md_modevent(module_t mod, int type, void *arg) { struct g_raid_md_class *class, *c, *nc; int error; error = 0; class = arg; switch (type) { case MOD_LOAD: c = LIST_FIRST(&g_raid_md_classes); if (c == NULL || c->mdc_priority > class->mdc_priority) LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list); else { while ((nc = LIST_NEXT(c, mdc_list)) != NULL && nc->mdc_priority < class->mdc_priority) c = nc; LIST_INSERT_AFTER(c, class, mdc_list); } if (g_raid_started) g_retaste(&g_raid_class); break; case MOD_UNLOAD: LIST_REMOVE(class, mdc_list); break; default: error = EOPNOTSUPP; break; } return (error); } int g_raid_tr_modevent(module_t mod, int type, void *arg) { struct g_raid_tr_class *class, *c, *nc; int error; error = 0; class = arg; switch (type) { case MOD_LOAD: c = LIST_FIRST(&g_raid_tr_classes); if (c == NULL || c->trc_priority > class->trc_priority) LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list); else { while ((nc = LIST_NEXT(c, trc_list)) != NULL && nc->trc_priority < class->trc_priority) c = nc; LIST_INSERT_AFTER(c, class, trc_list); } break; case MOD_UNLOAD: LIST_REMOVE(class, trc_list); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Use a local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid) * to reduce module priority, allowing submodules to register themselves first. */ static moduledata_t g_raid_mod = { "g_raid", g_modevent, &g_raid_class }; DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD); MODULE_VERSION(geom_raid, 0); Index: head/sys/geom/raid/g_raid.h =================================================================== --- head/sys/geom/raid/g_raid.h (revision 326269) +++ head/sys/geom/raid/g_raid.h (revision 326270) @@ -1,469 +1,471 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_RAID_H_ #define _G_RAID_H_ #include <sys/param.h> #include <sys/kobj.h> #include <sys/bio.h> #include <sys/time.h> #ifdef _KERNEL #include <sys/sysctl.h> #endif #define G_RAID_CLASS_NAME "RAID" #define G_RAID_MAGIC "GEOM::RAID" #define G_RAID_VERSION 0 struct g_raid_md_object; struct g_raid_tr_object; #define G_RAID_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL #define G_RAID_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL #define G_RAID_DEVICE_FLAG_MASK (G_RAID_DEVICE_FLAG_NOAUTOSYNC | \ G_RAID_DEVICE_FLAG_NOFAILSYNC) #ifdef _KERNEL extern u_int g_raid_aggressive_spare; extern u_int g_raid_debug; extern int g_raid_enable; extern int g_raid_read_err_thresh; extern u_int g_raid_start_timeout; extern struct g_class g_raid_class; #define G_RAID_DEBUG(lvl, fmt, ...) do { \ if (g_raid_debug >= (lvl)) { \ if (g_raid_debug > 0) { \ printf("GEOM_RAID[%u]: " fmt "\n", \ lvl, ## __VA_ARGS__); \ } else { \ printf("GEOM_RAID: " fmt "\n", \ ## __VA_ARGS__); \ } \ } \ } while (0) #define G_RAID_DEBUG1(lvl, sc, fmt, ...) do { \ if (g_raid_debug >= (lvl)) { \ if (g_raid_debug > 0) { \ printf("GEOM_RAID[%u]: %s: " fmt "\n", \ lvl, (sc)->sc_name, ## __VA_ARGS__); \ } else { \ printf("GEOM_RAID: %s: " fmt "\n", \ (sc)->sc_name, ## __VA_ARGS__); \ } \ } \ } while (0) #define G_RAID_LOGREQ(lvl, bp, fmt, ...) do { \ if (g_raid_debug >= (lvl)) { \ if (g_raid_debug > 0) { \ printf("GEOM_RAID[%u]: " fmt " ", \ lvl, ## __VA_ARGS__); \ } else \ printf("GEOM_RAID: " fmt " ", ## __VA_ARGS__); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) /* * Flags we use to distinguish I/O initiated by the TR layer to maintain * the volume's characteristics, fix subdisks, extra copies of data, etc. * * G_RAID_BIO_FLAG_SYNC I/O to update an extra copy of the data * for RAID volumes that maintain extra data * and need to rebuild that data. * G_RAID_BIO_FLAG_REMAP I/O done to try to provoke a subdisk into * doing some desirable action such as bad * block remapping after we detect a bad part * of the disk. * G_RAID_BIO_FLAG_LOCKED I/O holds a range lock that should be released. * * and the following meta item: * G_RAID_BIO_FLAG_SPECIAL Any of the I/O flags that need to make it * through the range locking which would * otherwise defer the I/O until after that * range is unlocked. */ #define G_RAID_BIO_FLAG_SYNC 0x01 #define G_RAID_BIO_FLAG_REMAP 0x02 #define G_RAID_BIO_FLAG_SPECIAL \ (G_RAID_BIO_FLAG_SYNC|G_RAID_BIO_FLAG_REMAP) #define G_RAID_BIO_FLAG_LOCKED 0x80 struct g_raid_lock { off_t l_offset; off_t l_length; void *l_callback_arg; int l_pending; LIST_ENTRY(g_raid_lock) l_next; }; #define G_RAID_EVENT_WAIT 0x01 #define G_RAID_EVENT_VOLUME 0x02 #define G_RAID_EVENT_SUBDISK 0x04 #define G_RAID_EVENT_DISK 0x08 #define G_RAID_EVENT_DONE 0x10 struct g_raid_event { void *e_tgt; int e_event; int e_flags; int e_error; TAILQ_ENTRY(g_raid_event) e_next; }; #define G_RAID_DISK_S_NONE 0x00 /* State is unknown. */ #define G_RAID_DISK_S_OFFLINE 0x01 /* Missing disk placeholder. */ #define G_RAID_DISK_S_DISABLED 0x02 /* Disabled.
*/ #define G_RAID_DISK_S_FAILED 0x03 /* Failed. */ #define G_RAID_DISK_S_STALE_FAILED 0x04 /* Old failed. */ #define G_RAID_DISK_S_SPARE 0x05 /* Hot-spare. */ #define G_RAID_DISK_S_STALE 0x06 /* Old disk, unused now. */ #define G_RAID_DISK_S_ACTIVE 0x07 /* Operational. */ #define G_RAID_DISK_E_DISCONNECTED 0x01 struct g_raid_disk { struct g_raid_softc *d_softc; /* Back-pointer to softc. */ struct g_consumer *d_consumer; /* GEOM disk consumer. */ void *d_md_data; /* Disk's metadata storage. */ struct g_kerneldump d_kd; /* Kernel dumping method/args. */ int d_candelete; /* BIO_DELETE supported. */ uint64_t d_flags; /* Additional flags. */ u_int d_state; /* Disk state. */ u_int d_load; /* Disk average load. */ off_t d_last_offset; /* Last head offset. */ int d_read_errs; /* Count of the read errors */ TAILQ_HEAD(, g_raid_subdisk) d_subdisks; /* List of subdisks. */ TAILQ_ENTRY(g_raid_disk) d_next; /* Next disk in the node. */ }; #define G_RAID_SUBDISK_S_NONE 0x00 /* Absent. */ #define G_RAID_SUBDISK_S_FAILED 0x01 /* Failed. */ #define G_RAID_SUBDISK_S_NEW 0x02 /* Blank. */ #define G_RAID_SUBDISK_S_REBUILD 0x03 /* Blank + rebuild. */ #define G_RAID_SUBDISK_S_UNINITIALIZED 0x04 /* Disk of the new volume. */ #define G_RAID_SUBDISK_S_STALE 0x05 /* Dirty. */ #define G_RAID_SUBDISK_S_RESYNC 0x06 /* Dirty + check/repair. */ #define G_RAID_SUBDISK_S_ACTIVE 0x07 /* Usable. */ #define G_RAID_SUBDISK_E_NEW 0x01 /* A new subdisk has arrived */ #define G_RAID_SUBDISK_E_FAILED 0x02 /* A subdisk failed, but remains in volume */ #define G_RAID_SUBDISK_E_DISCONNECTED 0x03 /* A subdisk removed from volume. */ #define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80 /* translation private events */ #define G_RAID_SUBDISK_POS(sd) \ ((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0) #define G_RAID_SUBDISK_TRACK_SIZE (1 * 1024 * 1024) #define G_RAID_SUBDISK_LOAD(sd) \ ((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0) #define G_RAID_SUBDISK_LOAD_SCALE 256 struct g_raid_subdisk { struct g_raid_softc *sd_softc; /* Back-pointer to softc. */ struct g_raid_disk *sd_disk; /* Where this subdisk lives. */ struct g_raid_volume *sd_volume; /* Volume, sd is a part of. */ off_t sd_offset; /* Offset on the disk. */ off_t sd_size; /* Size on the disk. */ u_int sd_pos; /* Position in volume. */ u_int sd_state; /* Subdisk state. */ off_t sd_rebuild_pos; /* Rebuild position. */ int sd_recovery; /* Count of recovery reqs. */ TAILQ_ENTRY(g_raid_subdisk) sd_next; /* Next subdisk on disk. 
*/ }; #define G_RAID_MAX_SUBDISKS 16 #define G_RAID_MAX_VOLUMENAME 32 #define G_RAID_VOLUME_S_STARTING 0x00 #define G_RAID_VOLUME_S_BROKEN 0x01 #define G_RAID_VOLUME_S_DEGRADED 0x02 #define G_RAID_VOLUME_S_SUBOPTIMAL 0x03 #define G_RAID_VOLUME_S_OPTIMAL 0x04 #define G_RAID_VOLUME_S_UNSUPPORTED 0x05 #define G_RAID_VOLUME_S_STOPPED 0x06 #define G_RAID_VOLUME_S_ALIVE(s) \ ((s) == G_RAID_VOLUME_S_DEGRADED || \ (s) == G_RAID_VOLUME_S_SUBOPTIMAL || \ (s) == G_RAID_VOLUME_S_OPTIMAL) #define G_RAID_VOLUME_E_DOWN 0x00 #define G_RAID_VOLUME_E_UP 0x01 #define G_RAID_VOLUME_E_START 0x10 #define G_RAID_VOLUME_E_STARTMD 0x11 #define G_RAID_VOLUME_RL_RAID0 0x00 #define G_RAID_VOLUME_RL_RAID1 0x01 #define G_RAID_VOLUME_RL_RAID3 0x03 #define G_RAID_VOLUME_RL_RAID4 0x04 #define G_RAID_VOLUME_RL_RAID5 0x05 #define G_RAID_VOLUME_RL_RAID6 0x06 #define G_RAID_VOLUME_RL_RAIDMDF 0x07 #define G_RAID_VOLUME_RL_RAID1E 0x11 #define G_RAID_VOLUME_RL_SINGLE 0x0f #define G_RAID_VOLUME_RL_CONCAT 0x1f #define G_RAID_VOLUME_RL_RAID5E 0x15 #define G_RAID_VOLUME_RL_RAID5EE 0x25 #define G_RAID_VOLUME_RL_RAID5R 0x35 #define G_RAID_VOLUME_RL_UNKNOWN 0xff #define G_RAID_VOLUME_RLQ_NONE 0x00 #define G_RAID_VOLUME_RLQ_R1SM 0x00 #define G_RAID_VOLUME_RLQ_R1MM 0x01 #define G_RAID_VOLUME_RLQ_R3P0 0x00 #define G_RAID_VOLUME_RLQ_R3PN 0x01 #define G_RAID_VOLUME_RLQ_R4P0 0x00 #define G_RAID_VOLUME_RLQ_R4PN 0x01 #define G_RAID_VOLUME_RLQ_R5RA 0x00 #define G_RAID_VOLUME_RLQ_R5RS 0x01 #define G_RAID_VOLUME_RLQ_R5LA 0x02 #define G_RAID_VOLUME_RLQ_R5LS 0x03 #define G_RAID_VOLUME_RLQ_R6RA 0x00 #define G_RAID_VOLUME_RLQ_R6RS 0x01 #define G_RAID_VOLUME_RLQ_R6LA 0x02 #define G_RAID_VOLUME_RLQ_R6LS 0x03 #define G_RAID_VOLUME_RLQ_RMDFRA 0x00 #define G_RAID_VOLUME_RLQ_RMDFRS 0x01 #define G_RAID_VOLUME_RLQ_RMDFLA 0x02 #define G_RAID_VOLUME_RLQ_RMDFLS 0x03 #define G_RAID_VOLUME_RLQ_R1EA 0x00 #define G_RAID_VOLUME_RLQ_R1EO 0x01 #define G_RAID_VOLUME_RLQ_R5ERA 0x00 #define G_RAID_VOLUME_RLQ_R5ERS 0x01 #define G_RAID_VOLUME_RLQ_R5ELA 0x02 #define G_RAID_VOLUME_RLQ_R5ELS 0x03 #define G_RAID_VOLUME_RLQ_R5EERA 0x00 #define G_RAID_VOLUME_RLQ_R5EERS 0x01 #define G_RAID_VOLUME_RLQ_R5EELA 0x02 #define G_RAID_VOLUME_RLQ_R5EELS 0x03 #define G_RAID_VOLUME_RLQ_R5RRA 0x00 #define G_RAID_VOLUME_RLQ_R5RRS 0x01 #define G_RAID_VOLUME_RLQ_R5RLA 0x02 #define G_RAID_VOLUME_RLQ_R5RLS 0x03 #define G_RAID_VOLUME_RLQ_UNKNOWN 0xff struct g_raid_volume; struct g_raid_volume { struct g_raid_softc *v_softc; /* Back-pointer to softc. */ struct g_provider *v_provider; /* GEOM provider. */ struct g_raid_subdisk v_subdisks[G_RAID_MAX_SUBDISKS]; /* Subdisks of this volume. */ void *v_md_data; /* Volume's metadata storage. */ struct g_raid_tr_object *v_tr; /* Transformation object. */ char v_name[G_RAID_MAX_VOLUMENAME]; /* Volume name. */ u_int v_state; /* Volume state. */ u_int v_raid_level; /* Array RAID level. */ u_int v_raid_level_qualifier; /* RAID level det. */ u_int v_disks_count; /* Number of disks in array. */ u_int v_mdf_pdisks; /* Number of parity disks in RAIDMDF array. */ uint16_t v_mdf_polynomial; /* Polynomial for RAIDMDF. */ uint8_t v_mdf_method; /* Generation method for RAIDMDF. */ u_int v_strip_size; /* Array strip size. */ u_int v_rotate_parity; /* Rotate RAID5R parity after numer of stripes. */ u_int v_sectorsize; /* Volume sector size. */ off_t v_mediasize; /* Volume media size. */ struct bio_queue_head v_inflight; /* In-flight write requests. */ struct bio_queue_head v_locked; /* Blocked I/O requests. */ LIST_HEAD(, g_raid_lock) v_locks; /* List of locked regions. 
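* Entries are added by g_raid_lock_range() and removed by g_raid_unlock_range(); regular bios that overlap a locked region wait on v_locked meanwhile.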
*/ int v_pending_lock; /* writes to locked region */ int v_dirty; /* Volume is DIRTY. */ struct timeval v_last_done; /* Time of the last I/O. */ time_t v_last_write; /* Time of the last write. */ u_int v_writes; /* Number of active writes. */ struct root_hold_token *v_rootmount; /* Root mount delay token. */ int v_starting; /* Volume is starting */ int v_stopping; /* Volume is stopping */ int v_provider_open; /* Number of opens. */ int v_global_id; /* Global volume ID (rX). */ int v_read_only; /* Volume is read-only. */ TAILQ_ENTRY(g_raid_volume) v_next; /* List of volumes entry. */ LIST_ENTRY(g_raid_volume) v_global_next; /* Global list entry. */ }; #define G_RAID_NODE_E_WAKE 0x00 #define G_RAID_NODE_E_START 0x01 struct g_raid_softc { struct g_raid_md_object *sc_md; /* Metadata object. */ struct g_geom *sc_geom; /* GEOM class instance. */ uint64_t sc_flags; /* Additional flags. */ TAILQ_HEAD(, g_raid_volume) sc_volumes; /* List of volumes. */ TAILQ_HEAD(, g_raid_disk) sc_disks; /* List of disks. */ struct sx sc_lock; /* Main node lock. */ struct proc *sc_worker; /* Worker process. */ struct mtx sc_queue_mtx; /* Worker queues lock. */ TAILQ_HEAD(, g_raid_event) sc_events; /* Worker events queue. */ struct bio_queue_head sc_queue; /* Worker I/O queue. */ int sc_stopping; /* Node is stopping */ }; #define sc_name sc_geom->name SYSCTL_DECL(_kern_geom_raid); /* * KOBJ parent class of metadata processing modules. */ struct g_raid_md_class { KOBJ_CLASS_FIELDS; int mdc_enable; int mdc_priority; LIST_ENTRY(g_raid_md_class) mdc_list; }; /* * KOBJ instance of metadata processing module. */ struct g_raid_md_object { KOBJ_FIELDS; struct g_raid_md_class *mdo_class; struct g_raid_softc *mdo_softc; /* Back-pointer to softc. */ }; int g_raid_md_modevent(module_t, int, void *); #define G_RAID_MD_DECLARE(name, label) \ static moduledata_t g_raid_md_##name##_mod = { \ "g_raid_md_" __XSTRING(name), \ g_raid_md_modevent, \ &g_raid_md_##name##_class \ }; \ DECLARE_MODULE(g_raid_md_##name, g_raid_md_##name##_mod, \ SI_SUB_DRIVERS, SI_ORDER_SECOND); \ MODULE_DEPEND(g_raid_md_##name, geom_raid, 0, 0, 0); \ SYSCTL_NODE(_kern_geom_raid, OID_AUTO, name, CTLFLAG_RD, \ NULL, label " metadata module"); \ SYSCTL_INT(_kern_geom_raid_##name, OID_AUTO, enable, \ CTLFLAG_RWTUN, &g_raid_md_##name##_class.mdc_enable, 0, \ "Enable " label " metadata format taste") /* * KOBJ parent class of data transformation modules. */ struct g_raid_tr_class { KOBJ_CLASS_FIELDS; int trc_enable; int trc_priority; int trc_accept_unmapped; LIST_ENTRY(g_raid_tr_class) trc_list; }; /* * KOBJ instance of data transformation module. */ struct g_raid_tr_object { KOBJ_FIELDS; struct g_raid_tr_class *tro_class; struct g_raid_volume *tro_volume; /* Back-pointer to volume. 
*/ }; int g_raid_tr_modevent(module_t, int, void *); #define G_RAID_TR_DECLARE(name, label) \ static moduledata_t g_raid_tr_##name##_mod = { \ "g_raid_tr_" __XSTRING(name), \ g_raid_tr_modevent, \ &g_raid_tr_##name##_class \ }; \ DECLARE_MODULE(g_raid_tr_##name, g_raid_tr_##name##_mod, \ SI_SUB_DRIVERS, SI_ORDER_FIRST); \ MODULE_DEPEND(g_raid_tr_##name, geom_raid, 0, 0, 0); \ SYSCTL_NODE(_kern_geom_raid, OID_AUTO, name, CTLFLAG_RD, \ NULL, label " transformation module"); \ SYSCTL_INT(_kern_geom_raid_##name, OID_AUTO, enable, \ CTLFLAG_RWTUN, &g_raid_tr_##name##_class.trc_enable, 0, \ "Enable " label " transformation module taste") const char * g_raid_volume_level2str(int level, int qual); int g_raid_volume_str2level(const char *str, int *level, int *qual); const char * g_raid_volume_state2str(int state); const char * g_raid_subdisk_state2str(int state); const char * g_raid_disk_state2str(int state); struct g_raid_softc * g_raid_create_node(struct g_class *mp, const char *name, struct g_raid_md_object *md); int g_raid_create_node_format(const char *format, struct gctl_req *req, struct g_geom **gp); struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id); struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc); const char * g_raid_get_diskname(struct g_raid_disk *disk); void g_raid_get_disk_info(struct g_raid_disk *disk); int g_raid_start_volume(struct g_raid_volume *vol); int g_raid_destroy_node(struct g_raid_softc *sc, int worker); int g_raid_destroy_volume(struct g_raid_volume *vol); int g_raid_destroy_disk(struct g_raid_disk *disk); void g_raid_iodone(struct bio *bp, int error); void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp); int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, void *virtual, vm_offset_t physical, off_t offset, size_t length); struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc, const char *name); void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp); void g_raid_report_disk_state(struct g_raid_disk *disk); void g_raid_change_disk_state(struct g_raid_disk *disk, int state); void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state); void g_raid_change_volume_state(struct g_raid_volume *vol, int state); void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, struct g_raid_subdisk *sd, struct g_raid_disk *disk); void g_raid_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk); void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp); int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length); u_int g_raid_ndisks(struct g_raid_softc *sc, int state); u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state); u_int g_raid_nopens(struct g_raid_softc *sc); struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol, int state); #define G_RAID_DESTROY_SOFT 0 #define G_RAID_DESTROY_DELAYED 1 #define G_RAID_DESTROY_HARD 2 int g_raid_destroy(struct g_raid_softc *sc, int how); int g_raid_event_send(void *arg, int event, int flags); int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, struct bio *ignore, void *argp); int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len); g_ctl_req_t g_raid_ctl; #endif /* _KERNEL */ #endif /* !_G_RAID_H_ */ Index: head/sys/geom/raid/g_raid_ctl.c =================================================================== --- 
head/sys/geom/raid/g_raid_ctl.c (revision 326269) +++ head/sys/geom/raid/g_raid_ctl.c (revision 326270) @@ -1,249 +1,251 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_raid_md_if.h" static struct g_raid_softc * g_raid_find_node(struct g_class *mp, const char *name) { struct g_raid_softc *sc; struct g_geom *gp; struct g_provider *pp; struct g_raid_volume *vol; /* Look for geom with specified name. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (strcasecmp(sc->sc_name, name) == 0) return (sc); } /* Look for provider with specified name. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; LIST_FOREACH(pp, &gp->provider, provider) { if (strcmp(pp->name, name) == 0) return (sc); if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, name) == 0) return (sc); } } /* Look for volume with specified name. 
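* (So, for a hypothetical node named "Intel-123", the node name itself, its provider "raid/r0" given with or without the "raid/" prefix, or a volume label all resolve to the same softc.)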
*/ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, name) == 0) return (sc); } } return (NULL); } static void g_raid_ctl_label(struct gctl_req *req, struct g_class *mp) { struct g_geom *geom; struct g_raid_softc *sc; const char *format; int *nargs; int crstatus, ctlstatus; char buf[64]; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return; } format = gctl_get_asciiparam(req, "arg0"); if (format == NULL) { gctl_error(req, "No format received."); return; } crstatus = g_raid_create_node_format(format, req, &geom); if (crstatus == G_RAID_MD_TASTE_FAIL) { gctl_error(req, "Failed to create array with format '%s'.", format); return; } sc = (struct g_raid_softc *)geom->softc; g_topology_unlock(); sx_xlock(&sc->sc_lock); ctlstatus = G_RAID_MD_CTL(sc->sc_md, req); if (ctlstatus < 0) { gctl_error(req, "Command failed: %d.", ctlstatus); if (crstatus == G_RAID_MD_TASTE_NEW) g_raid_destroy_node(sc, 0); } else { if (crstatus == G_RAID_MD_TASTE_NEW) snprintf(buf, sizeof(buf), "%s created\n", sc->sc_name); else snprintf(buf, sizeof(buf), "%s reused\n", sc->sc_name); gctl_set_param_err(req, "output", buf, strlen(buf) + 1); } sx_xunlock(&sc->sc_lock); g_topology_lock(); } static void g_raid_ctl_stop(struct gctl_req *req, struct g_class *mp) { struct g_raid_softc *sc; const char *nodename; int *nargs, *force; int error, how; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } nodename = gctl_get_asciiparam(req, "arg0"); if (nodename == NULL) { gctl_error(req, "No array name received."); return; } sc = g_raid_find_node(mp, nodename); if (sc == NULL) { gctl_error(req, "Array '%s' not found.", nodename); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force) how = G_RAID_DESTROY_HARD; else how = G_RAID_DESTROY_SOFT; g_topology_unlock(); sx_xlock(&sc->sc_lock); error = g_raid_destroy(sc, how); if (error != 0) gctl_error(req, "Array is busy."); g_topology_lock(); } static void g_raid_ctl_other(struct gctl_req *req, struct g_class *mp) { struct g_raid_softc *sc; const char *nodename; int *nargs; int ctlstatus; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Invalid number of arguments."); return; } nodename = gctl_get_asciiparam(req, "arg0"); if (nodename == NULL) { gctl_error(req, "No array name received."); return; } sc = g_raid_find_node(mp, nodename); if (sc == NULL) { gctl_error(req, "Array '%s' not found.", nodename); return; } g_topology_unlock(); sx_xlock(&sc->sc_lock); if (sc->sc_md != NULL) { ctlstatus = G_RAID_MD_CTL(sc->sc_md, req); if (ctlstatus < 0) gctl_error(req, "Command failed: %d.", ctlstatus); } sx_xunlock(&sc->sc_lock); g_topology_lock(); } void g_raid_ctl(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_RAID_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; 
} if (strcmp(verb, "label") == 0) g_raid_ctl_label(req, mp); else if (strcmp(verb, "stop") == 0) g_raid_ctl_stop(req, mp); else g_raid_ctl_other(req, mp); } Index: head/sys/geom/raid/md_ddf.c =================================================================== --- head/sys/geom/raid/md_ddf.c (revision 326269) +++ head/sys/geom/raid/md_ddf.c (revision 326270) @@ -1,3095 +1,3097 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "geom/raid/md_ddf.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_DDF, "md_ddf_data", "GEOM_RAID DDF metadata"); #define DDF_MAX_DISKS_HARD 128 #define DDF_MAX_DISKS 16 #define DDF_MAX_VDISKS 7 #define DDF_MAX_PARTITIONS 1 #define DECADE (3600*24*(365*10+2)) /* 10 years in seconds. */ struct ddf_meta { u_int sectorsize; u_int bigendian; struct ddf_header *hdr; struct ddf_cd_record *cdr; struct ddf_pd_record *pdr; struct ddf_vd_record *vdr; void *cr; struct ddf_pdd_record *pdd; struct ddf_bbm_log *bbm; }; struct ddf_vol_meta { u_int sectorsize; u_int bigendian; struct ddf_header *hdr; struct ddf_cd_record *cdr; struct ddf_vd_entry *vde; struct ddf_vdc_record *vdc; struct ddf_vdc_record *bvdc[DDF_MAX_DISKS_HARD]; }; struct g_raid_md_ddf_perdisk { struct ddf_meta pd_meta; }; struct g_raid_md_ddf_pervolume { struct ddf_vol_meta pv_meta; int pv_started; struct callout pv_start_co; /* STARTING state timer. */ }; struct g_raid_md_ddf_object { struct g_raid_md_object mdio_base; u_int mdio_bigendian; struct ddf_meta mdio_meta; int mdio_starting; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_started; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
*/ }; static g_raid_md_create_req_t g_raid_md_create_req_ddf; static g_raid_md_taste_t g_raid_md_taste_ddf; static g_raid_md_event_t g_raid_md_event_ddf; static g_raid_md_volume_event_t g_raid_md_volume_event_ddf; static g_raid_md_ctl_t g_raid_md_ctl_ddf; static g_raid_md_write_t g_raid_md_write_ddf; static g_raid_md_fail_disk_t g_raid_md_fail_disk_ddf; static g_raid_md_free_disk_t g_raid_md_free_disk_ddf; static g_raid_md_free_volume_t g_raid_md_free_volume_ddf; static g_raid_md_free_t g_raid_md_free_ddf; static kobj_method_t g_raid_md_ddf_methods[] = { KOBJMETHOD(g_raid_md_create_req, g_raid_md_create_req_ddf), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_ddf), KOBJMETHOD(g_raid_md_event, g_raid_md_event_ddf), KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_ddf), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_ddf), KOBJMETHOD(g_raid_md_write, g_raid_md_write_ddf), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_ddf), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_ddf), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_ddf), KOBJMETHOD(g_raid_md_free, g_raid_md_free_ddf), { 0, 0 } }; static struct g_raid_md_class g_raid_md_ddf_class = { "DDF", g_raid_md_ddf_methods, sizeof(struct g_raid_md_ddf_object), .mdc_enable = 1, .mdc_priority = 100 }; #define GET8(m, f) ((m)->f) #define GET16(m, f) ((m)->bigendian ? be16dec(&(m)->f) : le16dec(&(m)->f)) #define GET32(m, f) ((m)->bigendian ? be32dec(&(m)->f) : le32dec(&(m)->f)) #define GET64(m, f) ((m)->bigendian ? be64dec(&(m)->f) : le64dec(&(m)->f)) #define GET8D(m, f) (f) #define GET16D(m, f) ((m)->bigendian ? be16dec(&f) : le16dec(&f)) #define GET32D(m, f) ((m)->bigendian ? be32dec(&f) : le32dec(&f)) #define GET64D(m, f) ((m)->bigendian ? be64dec(&f) : le64dec(&f)) #define GET8P(m, f) (*(f)) #define GET16P(m, f) ((m)->bigendian ? be16dec(f) : le16dec(f)) #define GET32P(m, f) ((m)->bigendian ? be32dec(f) : le32dec(f)) #define GET64P(m, f) ((m)->bigendian ? 
be64dec(f) : le64dec(f)) #define SET8P(m, f, v) \ (*(f) = (v)) #define SET16P(m, f, v) \ do { \ if ((m)->bigendian) \ be16enc((f), (v)); \ else \ le16enc((f), (v)); \ } while (0) #define SET32P(m, f, v) \ do { \ if ((m)->bigendian) \ be32enc((f), (v)); \ else \ le32enc((f), (v)); \ } while (0) #define SET64P(m, f, v) \ do { \ if ((m)->bigendian) \ be64enc((f), (v)); \ else \ le64enc((f), (v)); \ } while (0) #define SET8(m, f, v) SET8P((m), &((m)->f), (v)) #define SET16(m, f, v) SET16P((m), &((m)->f), (v)) #define SET32(m, f, v) SET32P((m), &((m)->f), (v)) #define SET64(m, f, v) SET64P((m), &((m)->f), (v)) #define SET8D(m, f, v) SET8P((m), &(f), (v)) #define SET16D(m, f, v) SET16P((m), &(f), (v)) #define SET32D(m, f, v) SET32P((m), &(f), (v)) #define SET64D(m, f, v) SET64P((m), &(f), (v)) #define GETCRNUM(m) (GET32((m), hdr->cr_length) / \ GET16((m), hdr->Configuration_Record_Length)) #define GETVDCPTR(m, n) ((struct ddf_vdc_record *)((uint8_t *)(m)->cr + \ (n) * GET16((m), hdr->Configuration_Record_Length) * \ (m)->sectorsize)) #define GETSAPTR(m, n) ((struct ddf_sa_record *)((uint8_t *)(m)->cr + \ (n) * GET16((m), hdr->Configuration_Record_Length) * \ (m)->sectorsize)) static int isff(uint8_t *buf, int size) { int i; for (i = 0; i < size; i++) if (buf[i] != 0xff) return (0); return (1); } static void print_guid(uint8_t *buf) { int i, ascii; ascii = 1; for (i = 0; i < 24; i++) { if (buf[i] != 0 && (buf[i] < ' ' || buf[i] > 127)) { ascii = 0; break; } } if (ascii) { printf("'%.24s'", buf); } else { for (i = 0; i < 24; i++) printf("%02x", buf[i]); } } static void g_raid_md_ddf_print(struct ddf_meta *meta) { struct ddf_vdc_record *vdc; struct ddf_vuc_record *vuc; struct ddf_sa_record *sa; uint64_t *val2; uint32_t val; int i, j, k, num, num2; if (g_raid_debug < 1) return; printf("********* DDF Metadata *********\n"); printf("**** Header ****\n"); printf("DDF_Header_GUID "); print_guid(meta->hdr->DDF_Header_GUID); printf("\n"); printf("DDF_rev %8.8s\n", (char *)&meta->hdr->DDF_rev[0]); printf("Sequence_Number 0x%08x\n", GET32(meta, hdr->Sequence_Number)); printf("TimeStamp 0x%08x\n", GET32(meta, hdr->TimeStamp)); printf("Open_Flag 0x%02x\n", GET16(meta, hdr->Open_Flag)); printf("Foreign_Flag 0x%02x\n", GET16(meta, hdr->Foreign_Flag)); printf("Diskgrouping 0x%02x\n", GET16(meta, hdr->Diskgrouping)); printf("Primary_Header_LBA %ju\n", GET64(meta, hdr->Primary_Header_LBA)); printf("Secondary_Header_LBA %ju\n", GET64(meta, hdr->Secondary_Header_LBA)); printf("WorkSpace_Length %u\n", GET32(meta, hdr->WorkSpace_Length)); printf("WorkSpace_LBA %ju\n", GET64(meta, hdr->WorkSpace_LBA)); printf("Max_PD_Entries %u\n", GET16(meta, hdr->Max_PD_Entries)); printf("Max_VD_Entries %u\n", GET16(meta, hdr->Max_VD_Entries)); printf("Max_Partitions %u\n", GET16(meta, hdr->Max_Partitions)); printf("Configuration_Record_Length %u\n", GET16(meta, hdr->Configuration_Record_Length)); printf("Max_Primary_Element_Entries %u\n", GET16(meta, hdr->Max_Primary_Element_Entries)); printf("Controller Data %u:%u\n", GET32(meta, hdr->cd_section), GET32(meta, hdr->cd_length)); printf("Physical Disk %u:%u\n", GET32(meta, hdr->pdr_section), GET32(meta, hdr->pdr_length)); printf("Virtual Disk %u:%u\n", GET32(meta, hdr->vdr_section), GET32(meta, hdr->vdr_length)); printf("Configuration Recs %u:%u\n", GET32(meta, hdr->cr_section), GET32(meta, hdr->cr_length)); printf("Physical Disk Recs %u:%u\n", GET32(meta, hdr->pdd_section), GET32(meta, hdr->pdd_length)); printf("BBM Log %u:%u\n", GET32(meta, hdr->bbmlog_section), GET32(meta, 
hdr->bbmlog_length)); printf("Diagnostic Space %u:%u\n", GET32(meta, hdr->Diagnostic_Space), GET32(meta, hdr->Diagnostic_Space_Length)); printf("Vendor_Specific_Logs %u:%u\n", GET32(meta, hdr->Vendor_Specific_Logs), GET32(meta, hdr->Vendor_Specific_Logs_Length)); printf("**** Controller Data ****\n"); printf("Controller_GUID "); print_guid(meta->cdr->Controller_GUID); printf("\n"); printf("Controller_Type 0x%04x%04x 0x%04x%04x\n", GET16(meta, cdr->Controller_Type.Vendor_ID), GET16(meta, cdr->Controller_Type.Device_ID), GET16(meta, cdr->Controller_Type.SubVendor_ID), GET16(meta, cdr->Controller_Type.SubDevice_ID)); printf("Product_ID '%.16s'\n", (char *)&meta->cdr->Product_ID[0]); printf("**** Physical Disk Records ****\n"); printf("Populated_PDEs %u\n", GET16(meta, pdr->Populated_PDEs)); printf("Max_PDE_Supported %u\n", GET16(meta, pdr->Max_PDE_Supported)); for (j = 0; j < GET16(meta, pdr->Populated_PDEs); j++) { if (isff(meta->pdr->entry[j].PD_GUID, 24)) continue; if (GET32(meta, pdr->entry[j].PD_Reference) == 0xffffffff) continue; printf("PD_GUID "); print_guid(meta->pdr->entry[j].PD_GUID); printf("\n"); printf("PD_Reference 0x%08x\n", GET32(meta, pdr->entry[j].PD_Reference)); printf("PD_Type 0x%04x\n", GET16(meta, pdr->entry[j].PD_Type)); printf("PD_State 0x%04x\n", GET16(meta, pdr->entry[j].PD_State)); printf("Configured_Size %ju\n", GET64(meta, pdr->entry[j].Configured_Size)); printf("Block_Size %u\n", GET16(meta, pdr->entry[j].Block_Size)); } printf("**** Virtual Disk Records ****\n"); printf("Populated_VDEs %u\n", GET16(meta, vdr->Populated_VDEs)); printf("Max_VDE_Supported %u\n", GET16(meta, vdr->Max_VDE_Supported)); for (j = 0; j < GET16(meta, vdr->Populated_VDEs); j++) { if (isff(meta->vdr->entry[j].VD_GUID, 24)) continue; printf("VD_GUID "); print_guid(meta->vdr->entry[j].VD_GUID); printf("\n"); printf("VD_Number 0x%04x\n", GET16(meta, vdr->entry[j].VD_Number)); printf("VD_Type 0x%04x\n", GET16(meta, vdr->entry[j].VD_Type)); printf("VD_State 0x%02x\n", GET8(meta, vdr->entry[j].VD_State)); printf("Init_State 0x%02x\n", GET8(meta, vdr->entry[j].Init_State)); printf("Drive_Failures_Remaining %u\n", GET8(meta, vdr->entry[j].Drive_Failures_Remaining)); printf("VD_Name '%.16s'\n", (char *)&meta->vdr->entry[j].VD_Name); } printf("**** Configuration Records ****\n"); num = GETCRNUM(meta); for (j = 0; j < num; j++) { vdc = GETVDCPTR(meta, j); val = GET32D(meta, vdc->Signature); switch (val) { case DDF_VDCR_SIGNATURE: printf("** Virtual Disk Configuration **\n"); printf("VD_GUID "); print_guid(vdc->VD_GUID); printf("\n"); printf("Timestamp 0x%08x\n", GET32D(meta, vdc->Timestamp)); printf("Sequence_Number 0x%08x\n", GET32D(meta, vdc->Sequence_Number)); printf("Primary_Element_Count %u\n", GET16D(meta, vdc->Primary_Element_Count)); printf("Stripe_Size %u\n", GET8D(meta, vdc->Stripe_Size)); printf("Primary_RAID_Level 0x%02x\n", GET8D(meta, vdc->Primary_RAID_Level)); printf("RLQ 0x%02x\n", GET8D(meta, vdc->RLQ)); printf("Secondary_Element_Count %u\n", GET8D(meta, vdc->Secondary_Element_Count)); printf("Secondary_Element_Seq %u\n", GET8D(meta, vdc->Secondary_Element_Seq)); printf("Secondary_RAID_Level 0x%02x\n", GET8D(meta, vdc->Secondary_RAID_Level)); printf("Block_Count %ju\n", GET64D(meta, vdc->Block_Count)); printf("VD_Size %ju\n", GET64D(meta, vdc->VD_Size)); printf("Block_Size %u\n", GET16D(meta, vdc->Block_Size)); printf("Rotate_Parity_count %u\n", GET8D(meta, vdc->Rotate_Parity_count)); printf("Associated_Spare_Disks"); for (i = 0; i < 8; i++) { if (GET32D(meta, 
vdc->Associated_Spares[i]) != 0xffffffff) printf(" 0x%08x", GET32D(meta, vdc->Associated_Spares[i])); } printf("\n"); printf("Cache_Flags %016jx\n", GET64D(meta, vdc->Cache_Flags)); printf("BG_Rate %u\n", GET8D(meta, vdc->BG_Rate)); printf("MDF_Parity_Disks %u\n", GET8D(meta, vdc->MDF_Parity_Disks)); printf("MDF_Parity_Generator_Polynomial 0x%04x\n", GET16D(meta, vdc->MDF_Parity_Generator_Polynomial)); printf("MDF_Constant_Generation_Method 0x%02x\n", GET8D(meta, vdc->MDF_Constant_Generation_Method)); printf("Physical_Disks "); num2 = GET16D(meta, vdc->Primary_Element_Count); val2 = (uint64_t *)&(vdc->Physical_Disk_Sequence[GET16(meta, hdr->Max_Primary_Element_Entries)]); for (i = 0; i < num2; i++) printf(" 0x%08x @ %ju", GET32D(meta, vdc->Physical_Disk_Sequence[i]), GET64P(meta, val2 + i)); printf("\n"); break; case DDF_VUCR_SIGNATURE: printf("** Vendor Unique Configuration **\n"); vuc = (struct ddf_vuc_record *)vdc; printf("VD_GUID "); print_guid(vuc->VD_GUID); printf("\n"); break; case DDF_SA_SIGNATURE: printf("** Spare Assignment Configuration **\n"); sa = (struct ddf_sa_record *)vdc; printf("Timestamp 0x%08x\n", GET32D(meta, sa->Timestamp)); printf("Spare_Type 0x%02x\n", GET8D(meta, sa->Spare_Type)); printf("Populated_SAEs %u\n", GET16D(meta, sa->Populated_SAEs)); printf("MAX_SAE_Supported %u\n", GET16D(meta, sa->MAX_SAE_Supported)); for (i = 0; i < GET16D(meta, sa->Populated_SAEs); i++) { if (isff(sa->entry[i].VD_GUID, 24)) continue; printf("VD_GUID "); for (k = 0; k < 24; k++) printf("%02x", sa->entry[i].VD_GUID[k]); printf("\n"); printf("Secondary_Element %u\n", GET16D(meta, sa->entry[i].Secondary_Element)); } break; case 0x00000000: case 0xFFFFFFFF: break; default: printf("Unknown configuration signature %08x\n", val); break; } } printf("**** Physical Disk Data ****\n"); printf("PD_GUID "); print_guid(meta->pdd->PD_GUID); printf("\n"); printf("PD_Reference 0x%08x\n", GET32(meta, pdd->PD_Reference)); printf("Forced_Ref_Flag 0x%02x\n", GET8(meta, pdd->Forced_Ref_Flag)); printf("Forced_PD_GUID_Flag 0x%02x\n", GET8(meta, pdd->Forced_PD_GUID_Flag)); } static int ddf_meta_find_pd(struct ddf_meta *meta, uint8_t *GUID, uint32_t PD_Reference) { int i; for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) { if (GUID != NULL) { if (memcmp(meta->pdr->entry[i].PD_GUID, GUID, 24) == 0) return (i); } else if (PD_Reference != 0xffffffff) { if (GET32(meta, pdr->entry[i].PD_Reference) == PD_Reference) return (i); } else if (isff(meta->pdr->entry[i].PD_GUID, 24)) return (i); } if (GUID == NULL && PD_Reference == 0xffffffff) { if (i >= GET16(meta, pdr->Max_PDE_Supported)) return (-1); SET16(meta, pdr->Populated_PDEs, i + 1); return (i); } return (-1); } static int ddf_meta_find_vd(struct ddf_meta *meta, uint8_t *GUID) { int i; for (i = 0; i < GET16(meta, vdr->Populated_VDEs); i++) { if (GUID != NULL) { if (memcmp(meta->vdr->entry[i].VD_GUID, GUID, 24) == 0) return (i); } else if (isff(meta->vdr->entry[i].VD_GUID, 24)) return (i); } if (GUID == NULL) { if (i >= GET16(meta, vdr->Max_VDE_Supported)) return (-1); SET16(meta, vdr->Populated_VDEs, i + 1); return (i); } return (-1); } static struct ddf_vdc_record * ddf_meta_find_vdc(struct ddf_meta *meta, uint8_t *GUID) { struct ddf_vdc_record *vdc; int i, num; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GUID != NULL) { if (GET32D(meta, vdc->Signature) == DDF_VDCR_SIGNATURE && memcmp(vdc->VD_GUID, GUID, 24) == 0) return (vdc); } else if (GET32D(meta, vdc->Signature) == 0xffffffff || GET32D(meta, vdc->Signature) == 0) 
return (vdc); } return (NULL); } static int ddf_meta_count_vdc(struct ddf_meta *meta, uint8_t *GUID) { struct ddf_vdc_record *vdc; int i, num, cnt; cnt = 0; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; if (GUID == NULL || memcmp(vdc->VD_GUID, GUID, 24) == 0) cnt++; } return (cnt); } static int ddf_meta_find_disk(struct ddf_vol_meta *vmeta, uint32_t PD_Reference, int *bvdp, int *posp) { int i, bvd, pos; i = 0; for (bvd = 0; bvd < GET8(vmeta, vdc->Secondary_Element_Count); bvd++) { if (vmeta->bvdc[bvd] == NULL) { i += GET16(vmeta, vdc->Primary_Element_Count); // XXX continue; } for (pos = 0; pos < GET16(vmeta, bvdc[bvd]->Primary_Element_Count); pos++, i++) { if (GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]) == PD_Reference) { if (bvdp != NULL) *bvdp = bvd; if (posp != NULL) *posp = pos; return (i); } } } return (-1); } static struct ddf_sa_record * ddf_meta_find_sa(struct ddf_meta *meta, int create) { struct ddf_sa_record *sa; int i, num; num = GETCRNUM(meta); for (i = 0; i < num; i++) { sa = GETSAPTR(meta, i); if (GET32D(meta, sa->Signature) == DDF_SA_SIGNATURE) return (sa); } if (create) { for (i = 0; i < num; i++) { sa = GETSAPTR(meta, i); if (GET32D(meta, sa->Signature) == 0xffffffff || GET32D(meta, sa->Signature) == 0) return (sa); } } return (NULL); } static void ddf_meta_create(struct g_raid_disk *disk, struct ddf_meta *sample) { struct timespec ts; struct clocktime ct; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_object *mdi; struct ddf_meta *meta; struct ddf_pd_entry *pde; off_t anchorlba; u_int ss, pos, size; int len, error; char serial_buffer[24]; if (sample->hdr == NULL) sample = NULL; mdi = (struct g_raid_md_ddf_object *)disk->d_softc->sc_md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; meta = &pd->pd_meta; ss = disk->d_consumer->provider->sectorsize; anchorlba = disk->d_consumer->provider->mediasize / ss - 1; meta->sectorsize = ss; meta->bigendian = sample ? 
sample->bigendian : mdi->mdio_bigendian; getnanotime(&ts); clock_ts_to_ct(&ts, &ct); /* Header */ meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memset(meta->hdr, 0xff, ss); if (sample) { memcpy(meta->hdr, sample->hdr, sizeof(struct ddf_header)); if (ss != sample->sectorsize) { SET32(meta, hdr->WorkSpace_Length, howmany(GET32(sample, hdr->WorkSpace_Length) * sample->sectorsize, ss)); SET16(meta, hdr->Configuration_Record_Length, howmany(GET16(sample, hdr->Configuration_Record_Length) * sample->sectorsize, ss)); SET32(meta, hdr->cd_length, howmany(GET32(sample, hdr->cd_length) * sample->sectorsize, ss)); SET32(meta, hdr->pdr_length, howmany(GET32(sample, hdr->pdr_length) * sample->sectorsize, ss)); SET32(meta, hdr->vdr_length, howmany(GET32(sample, hdr->vdr_length) * sample->sectorsize, ss)); SET32(meta, hdr->cr_length, howmany(GET32(sample, hdr->cr_length) * sample->sectorsize, ss)); SET32(meta, hdr->pdd_length, howmany(GET32(sample, hdr->pdd_length) * sample->sectorsize, ss)); SET32(meta, hdr->bbmlog_length, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); SET32(meta, hdr->Diagnostic_Space, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); SET32(meta, hdr->Vendor_Specific_Logs, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); } } else { SET32(meta, hdr->Signature, DDF_HEADER_SIGNATURE); snprintf(meta->hdr->DDF_Header_GUID, 25, "FreeBSD %08x%08x", (u_int)(ts.tv_sec - DECADE), arc4random()); memcpy(meta->hdr->DDF_rev, "02.00.00", 8); SET32(meta, hdr->TimeStamp, (ts.tv_sec - DECADE)); SET32(meta, hdr->WorkSpace_Length, 16 * 1024 * 1024 / ss); SET16(meta, hdr->Max_PD_Entries, DDF_MAX_DISKS - 1); SET16(meta, hdr->Max_VD_Entries, DDF_MAX_VDISKS); SET16(meta, hdr->Max_Partitions, DDF_MAX_PARTITIONS); SET16(meta, hdr->Max_Primary_Element_Entries, DDF_MAX_DISKS); SET16(meta, hdr->Configuration_Record_Length, howmany(sizeof(struct ddf_vdc_record) + (4 + 8) * GET16(meta, hdr->Max_Primary_Element_Entries), ss)); SET32(meta, hdr->cd_length, howmany(sizeof(struct ddf_cd_record), ss)); SET32(meta, hdr->pdr_length, howmany(sizeof(struct ddf_pd_record) + sizeof(struct ddf_pd_entry) * GET16(meta, hdr->Max_PD_Entries), ss)); SET32(meta, hdr->vdr_length, howmany(sizeof(struct ddf_vd_record) + sizeof(struct ddf_vd_entry) * GET16(meta, hdr->Max_VD_Entries), ss)); SET32(meta, hdr->cr_length, GET16(meta, hdr->Configuration_Record_Length) * (GET16(meta, hdr->Max_Partitions) + 1)); SET32(meta, hdr->pdd_length, howmany(sizeof(struct ddf_pdd_record), ss)); SET32(meta, hdr->bbmlog_length, 0); SET32(meta, hdr->Diagnostic_Space_Length, 0); SET32(meta, hdr->Vendor_Specific_Logs_Length, 0); } pos = 1; SET32(meta, hdr->cd_section, pos); pos += GET32(meta, hdr->cd_length); SET32(meta, hdr->pdr_section, pos); pos += GET32(meta, hdr->pdr_length); SET32(meta, hdr->vdr_section, pos); pos += GET32(meta, hdr->vdr_length); SET32(meta, hdr->cr_section, pos); pos += GET32(meta, hdr->cr_length); SET32(meta, hdr->pdd_section, pos); pos += GET32(meta, hdr->pdd_length); SET32(meta, hdr->bbmlog_section, GET32(meta, hdr->bbmlog_length) != 0 ? pos : 0xffffffff); pos += GET32(meta, hdr->bbmlog_length); SET32(meta, hdr->Diagnostic_Space, GET32(meta, hdr->Diagnostic_Space_Length) != 0 ? pos : 0xffffffff); pos += GET32(meta, hdr->Diagnostic_Space_Length); SET32(meta, hdr->Vendor_Specific_Logs, GET32(meta, hdr->Vendor_Specific_Logs_Length) != 0 ? 
pos : 0xffffffff); pos += min(GET32(meta, hdr->Vendor_Specific_Logs_Length), 1); SET64(meta, hdr->Primary_Header_LBA, anchorlba - pos); SET64(meta, hdr->Secondary_Header_LBA, 0xffffffffffffffffULL); SET64(meta, hdr->WorkSpace_LBA, anchorlba + 1 - 32 * 1024 * 1024 / ss); /* Controller Data */ size = GET32(meta, hdr->cd_length) * ss; meta->cdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->cdr, 0xff, size); SET32(meta, cdr->Signature, DDF_CONTROLLER_DATA_SIGNATURE); memcpy(meta->cdr->Controller_GUID, "FreeBSD GEOM RAID SERIAL", 24); memcpy(meta->cdr->Product_ID, "FreeBSD GEOMRAID", 16); /* Physical Drive Records. */ size = GET32(meta, hdr->pdr_length) * ss; meta->pdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->pdr, 0xff, size); SET32(meta, pdr->Signature, DDF_PDR_SIGNATURE); SET16(meta, pdr->Populated_PDEs, 1); SET16(meta, pdr->Max_PDE_Supported, GET16(meta, hdr->Max_PD_Entries)); pde = &meta->pdr->entry[0]; len = sizeof(serial_buffer); error = g_io_getattr("GEOM::ident", disk->d_consumer, &len, serial_buffer); if (error == 0 && (len = strlen (serial_buffer)) >= 6 && len <= 20) snprintf(pde->PD_GUID, 25, "DISK%20s", serial_buffer); else snprintf(pde->PD_GUID, 25, "DISK%04d%02d%02d%08x%04x", ct.year, ct.mon, ct.day, arc4random(), arc4random() & 0xffff); SET32D(meta, pde->PD_Reference, arc4random()); SET16D(meta, pde->PD_Type, DDF_PDE_GUID_FORCE); SET16D(meta, pde->PD_State, 0); SET64D(meta, pde->Configured_Size, anchorlba + 1 - 32 * 1024 * 1024 / ss); SET16D(meta, pde->Block_Size, ss); /* Virtual Drive Records. */ size = GET32(meta, hdr->vdr_length) * ss; meta->vdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->vdr, 0xff, size); SET32(meta, vdr->Signature, DDF_VD_RECORD_SIGNATURE); SET32(meta, vdr->Populated_VDEs, 0); SET16(meta, vdr->Max_VDE_Supported, GET16(meta, hdr->Max_VD_Entries)); /* Configuration Records. */ size = GET32(meta, hdr->cr_length) * ss; meta->cr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->cr, 0xff, size); /* Physical Disk Data. */ size = GET32(meta, hdr->pdd_length) * ss; meta->pdd = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->pdd, 0xff, size); SET32(meta, pdd->Signature, DDF_PDD_SIGNATURE); memcpy(meta->pdd->PD_GUID, pde->PD_GUID, 24); SET32(meta, pdd->PD_Reference, GET32D(meta, pde->PD_Reference)); SET8(meta, pdd->Forced_Ref_Flag, DDF_PDD_FORCED_REF); SET8(meta, pdd->Forced_PD_GUID_Flag, DDF_PDD_FORCED_GUID); /* Bad Block Management Log. 
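 *
 * A freshly initialized header sets bbmlog_length to 0 above, so the
 * block below only runs when the layout was inherited from a sample
 * that already carries a BBM log; in that case an empty log
 * (Entry_Count = 0, Spare_Block_Count = 0) is laid out.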
*/ if (GET32(meta, hdr->bbmlog_length) != 0) { size = GET32(meta, hdr->bbmlog_length) * ss; meta->bbm = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->bbm, 0xff, size); SET32(meta, bbm->Signature, DDF_BBML_SIGNATURE); SET32(meta, bbm->Entry_Count, 0); SET32(meta, bbm->Spare_Block_Count, 0); } } static void ddf_meta_copy(struct ddf_meta *dst, struct ddf_meta *src) { struct ddf_header *hdr; u_int ss; hdr = src->hdr; dst->bigendian = src->bigendian; ss = dst->sectorsize = src->sectorsize; dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(dst->hdr, src->hdr, ss); dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss); dst->pdr = malloc(GET32(src, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->pdr, src->pdr, GET32(src, hdr->pdr_length) * ss); dst->vdr = malloc(GET32(src, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->vdr, src->vdr, GET32(src, hdr->vdr_length) * ss); dst->cr = malloc(GET32(src, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cr, src->cr, GET32(src, hdr->cr_length) * ss); dst->pdd = malloc(GET32(src, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->pdd, src->pdd, GET32(src, hdr->pdd_length) * ss); if (src->bbm != NULL) { dst->bbm = malloc(GET32(src, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->bbm, src->bbm, GET32(src, hdr->bbmlog_length) * ss); } } static void ddf_meta_update(struct ddf_meta *meta, struct ddf_meta *src) { struct ddf_pd_entry *pde, *spde; int i, j; for (i = 0; i < GET16(src, pdr->Populated_PDEs); i++) { spde = &src->pdr->entry[i]; if (isff(spde->PD_GUID, 24)) continue; j = ddf_meta_find_pd(meta, NULL, GET32(src, pdr->entry[i].PD_Reference)); if (j < 0) { j = ddf_meta_find_pd(meta, NULL, 0xffffffff); pde = &meta->pdr->entry[j]; memcpy(pde, spde, sizeof(*pde)); } else { pde = &meta->pdr->entry[j]; SET16D(meta, pde->PD_State, GET16D(meta, pde->PD_State) | GET16D(src, pde->PD_State)); } } } static void ddf_meta_free(struct ddf_meta *meta) { if (meta->hdr != NULL) { free(meta->hdr, M_MD_DDF); meta->hdr = NULL; } if (meta->cdr != NULL) { free(meta->cdr, M_MD_DDF); meta->cdr = NULL; } if (meta->pdr != NULL) { free(meta->pdr, M_MD_DDF); meta->pdr = NULL; } if (meta->vdr != NULL) { free(meta->vdr, M_MD_DDF); meta->vdr = NULL; } if (meta->cr != NULL) { free(meta->cr, M_MD_DDF); meta->cr = NULL; } if (meta->pdd != NULL) { free(meta->pdd, M_MD_DDF); meta->pdd = NULL; } if (meta->bbm != NULL) { free(meta->bbm, M_MD_DDF); meta->bbm = NULL; } } static void ddf_vol_meta_create(struct ddf_vol_meta *meta, struct ddf_meta *sample) { struct timespec ts; struct clocktime ct; struct ddf_header *hdr; u_int ss, size; hdr = sample->hdr; meta->bigendian = sample->bigendian; ss = meta->sectorsize = sample->sectorsize; meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(meta->hdr, sample->hdr, ss); meta->cdr = malloc(GET32(sample, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cdr, sample->cdr, GET32(sample, hdr->cd_length) * ss); meta->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK); memset(meta->vde, 0xff, sizeof(struct ddf_vd_entry)); getnanotime(&ts); clock_ts_to_ct(&ts, &ct); snprintf(meta->vde->VD_GUID, 25, "FreeBSD%04d%02d%02d%08x%01x", ct.year, ct.mon, ct.day, arc4random(), arc4random() & 0xf); size = GET16(sample, hdr->Configuration_Record_Length) * ss; meta->vdc = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->vdc, 0xff, size); SET32(meta, vdc->Signature, DDF_VDCR_SIGNATURE); memcpy(meta->vdc->VD_GUID, meta->vde->VD_GUID, 24); SET32(meta, 
vdc->Sequence_Number, 0); } static void ddf_vol_meta_update(struct ddf_vol_meta *dst, struct ddf_meta *src, uint8_t *GUID, int started) { struct ddf_header *hdr; struct ddf_vd_entry *vde; struct ddf_vdc_record *vdc; int vnew, bvnew, bvd, size; u_int ss; hdr = src->hdr; vde = &src->vdr->entry[ddf_meta_find_vd(src, GUID)]; vdc = ddf_meta_find_vdc(src, GUID); if (GET8D(src, vdc->Secondary_Element_Count) == 1) bvd = 0; else bvd = GET8D(src, vdc->Secondary_Element_Seq); size = GET16(src, hdr->Configuration_Record_Length) * src->sectorsize; if (dst->vdc == NULL || (!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) - GET32(dst, vdc->Sequence_Number))) > 0)) vnew = 1; else vnew = 0; if (dst->bvdc[bvd] == NULL || (!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) - GET32(dst, bvdc[bvd]->Sequence_Number))) > 0)) bvnew = 1; else bvnew = 0; if (vnew) { dst->bigendian = src->bigendian; ss = dst->sectorsize = src->sectorsize; if (dst->hdr != NULL) free(dst->hdr, M_MD_DDF); dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(dst->hdr, src->hdr, ss); if (dst->cdr != NULL) free(dst->cdr, M_MD_DDF); dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss); if (dst->vde != NULL) free(dst->vde, M_MD_DDF); dst->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK); memcpy(dst->vde, vde, sizeof(struct ddf_vd_entry)); if (dst->vdc != NULL) free(dst->vdc, M_MD_DDF); dst->vdc = malloc(size, M_MD_DDF, M_WAITOK); memcpy(dst->vdc, vdc, size); } if (bvnew) { if (dst->bvdc[bvd] != NULL) free(dst->bvdc[bvd], M_MD_DDF); dst->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK); memcpy(dst->bvdc[bvd], vdc, size); } } static void ddf_vol_meta_free(struct ddf_vol_meta *meta) { int i; if (meta->hdr != NULL) { free(meta->hdr, M_MD_DDF); meta->hdr = NULL; } if (meta->cdr != NULL) { free(meta->cdr, M_MD_DDF); meta->cdr = NULL; } if (meta->vde != NULL) { free(meta->vde, M_MD_DDF); meta->vde = NULL; } if (meta->vdc != NULL) { free(meta->vdc, M_MD_DDF); meta->vdc = NULL; } for (i = 0; i < DDF_MAX_DISKS_HARD; i++) { if (meta->bvdc[i] != NULL) { free(meta->bvdc[i], M_MD_DDF); meta->bvdc[i] = NULL; } } } static int ddf_meta_unused_range(struct ddf_meta *meta, off_t *off, off_t *size) { struct ddf_vdc_record *vdc; off_t beg[32], end[32], beg1, end1; uint64_t *offp; int i, j, n, num, pos; uint32_t ref; *off = 0; *size = 0; ref = GET32(meta, pdd->PD_Reference); pos = ddf_meta_find_pd(meta, NULL, ref); beg[0] = 0; end[0] = GET64(meta, pdr->entry[pos].Configured_Size); n = 1; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; for (pos = 0; pos < GET16D(meta, vdc->Primary_Element_Count); pos++) if (GET32D(meta, vdc->Physical_Disk_Sequence[pos]) == ref) break; if (pos == GET16D(meta, vdc->Primary_Element_Count)) continue; offp = (uint64_t *)&(vdc->Physical_Disk_Sequence[ GET16(meta, hdr->Max_Primary_Element_Entries)]); beg1 = GET64P(meta, offp + pos); end1 = beg1 + GET64D(meta, vdc->Block_Count); for (j = 0; j < n; j++) { if (beg[j] >= end1 || end[j] <= beg1 ) continue; if (beg[j] < beg1 && end[j] > end1) { beg[n] = end1; end[n] = end[j]; end[j] = beg1; n++; } else if (beg[j] < beg1) end[j] = beg1; else beg[j] = end1; } } for (j = 0; j < n; j++) { if (end[j] - beg[j] > *size) { *off = beg[j]; *size = end[j] - beg[j]; } } return ((*size > 0) ? 
1 : 0); } static void ddf_meta_get_name(struct ddf_meta *meta, int num, char *buf) { const char *b; int i; b = meta->vdr->entry[num].VD_Name; for (i = 15; i >= 0; i--) if (b[i] != 0x20) break; memcpy(buf, b, i + 1); buf[i + 1] = 0; } static void ddf_meta_put_name(struct ddf_vol_meta *meta, char *buf) { int len; len = min(strlen(buf), 16); memset(meta->vde->VD_Name, 0x20, 16); memcpy(meta->vde->VD_Name, buf, len); } static int ddf_meta_read(struct g_consumer *cp, struct ddf_meta *meta) { struct g_provider *pp; struct ddf_header *ahdr, *hdr; char *abuf, *buf; off_t plba, slba, lba; int error, len, i; u_int ss; uint32_t val; ddf_meta_free(meta); pp = cp->provider; ss = meta->sectorsize = pp->sectorsize; /* Read anchor block. */ abuf = g_read_data(cp, pp->mediasize - ss, ss, &error); if (abuf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (error); } ahdr = (struct ddf_header *)abuf; /* Check if this is a DDF RAID struct. */ if (be32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE) meta->bigendian = 1; else if (le32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE) meta->bigendian = 0; else { G_RAID_DEBUG(1, "DDF signature check failed on %s", pp->name); error = EINVAL; goto done; } if (ahdr->Header_Type != DDF_HEADER_ANCHOR) { G_RAID_DEBUG(1, "DDF header type check failed on %s", pp->name); error = EINVAL; goto done; } meta->hdr = ahdr; plba = GET64(meta, hdr->Primary_Header_LBA); slba = GET64(meta, hdr->Secondary_Header_LBA); val = GET32(meta, hdr->CRC); SET32(meta, hdr->CRC, 0xffffffff); meta->hdr = NULL; if (crc32(ahdr, ss) != val) { G_RAID_DEBUG(1, "DDF CRC mismatch on %s", pp->name); error = EINVAL; goto done; } if ((plba + 6) * ss >= pp->mediasize) { G_RAID_DEBUG(1, "DDF primary header LBA is wrong on %s", pp->name); error = EINVAL; goto done; } if (slba != -1 && (slba + 6) * ss >= pp->mediasize) { G_RAID_DEBUG(1, "DDF secondary header LBA is wrong on %s", pp->name); error = EINVAL; goto done; } lba = plba; doread: error = 0; ddf_meta_free(meta); /* Read header block. */ buf = g_read_data(cp, lba * ss, ss, &error); if (buf == NULL) { readerror: G_RAID_DEBUG(1, "DDF %s metadata read error on %s (error=%d).", (lba == plba) ? "primary" : "secondary", pp->name, error); if (lba == plba && slba != -1) { lba = slba; goto doread; } G_RAID_DEBUG(1, "DDF metadata read error on %s.", pp->name); goto done; } meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(meta->hdr, buf, ss); g_free(buf); hdr = meta->hdr; val = GET32(meta, hdr->CRC); SET32(meta, hdr->CRC, 0xffffffff); if (hdr->Signature != ahdr->Signature || crc32(meta->hdr, ss) != val || memcmp(hdr->DDF_Header_GUID, ahdr->DDF_Header_GUID, 24) || GET64(meta, hdr->Primary_Header_LBA) != plba || GET64(meta, hdr->Secondary_Header_LBA) != slba) { hdrerror: G_RAID_DEBUG(1, "DDF %s metadata check failed on %s", (lba == plba) ?
"primary" : "secondary", pp->name); if (lba == plba && slba != -1) { lba = slba; goto doread; } G_RAID_DEBUG(1, "DDF metadata check failed on %s", pp->name); error = EINVAL; goto done; } if ((lba == plba && hdr->Header_Type != DDF_HEADER_PRIMARY) || (lba == slba && hdr->Header_Type != DDF_HEADER_SECONDARY)) goto hdrerror; len = 1; len = max(len, GET32(meta, hdr->cd_section) + GET32(meta, hdr->cd_length)); len = max(len, GET32(meta, hdr->pdr_section) + GET32(meta, hdr->pdr_length)); len = max(len, GET32(meta, hdr->vdr_section) + GET32(meta, hdr->vdr_length)); len = max(len, GET32(meta, hdr->cr_section) + GET32(meta, hdr->cr_length)); len = max(len, GET32(meta, hdr->pdd_section) + GET32(meta, hdr->pdd_length)); if ((val = GET32(meta, hdr->bbmlog_section)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->bbmlog_length)); if ((val = GET32(meta, hdr->Diagnostic_Space)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->Diagnostic_Space_Length)); if ((val = GET32(meta, hdr->Vendor_Specific_Logs)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->Vendor_Specific_Logs_Length)); if ((plba + len) * ss >= pp->mediasize) goto hdrerror; if (slba != -1 && (slba + len) * ss >= pp->mediasize) goto hdrerror; /* Workaround for Adaptec implementation. */ if (GET16(meta, hdr->Max_Primary_Element_Entries) == 0xffff) { SET16(meta, hdr->Max_Primary_Element_Entries, min(GET16(meta, hdr->Max_PD_Entries), (GET16(meta, hdr->Configuration_Record_Length) * ss - 512) / 12)); } if (GET32(meta, hdr->cd_length) * ss >= MAXPHYS || GET32(meta, hdr->pdr_length) * ss >= MAXPHYS || GET32(meta, hdr->vdr_length) * ss >= MAXPHYS || GET32(meta, hdr->cr_length) * ss >= MAXPHYS || GET32(meta, hdr->pdd_length) * ss >= MAXPHYS || GET32(meta, hdr->bbmlog_length) * ss >= MAXPHYS) { G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name); goto hdrerror; } /* Read controller data. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss, GET32(meta, hdr->cd_length) * ss, &error); if (buf == NULL) goto readerror; meta->cdr = malloc(GET32(meta, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cdr, buf, GET32(meta, hdr->cd_length) * ss); g_free(buf); if (GET32(meta, cdr->Signature) != DDF_CONTROLLER_DATA_SIGNATURE) goto hdrerror; /* Read physical disk records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss, GET32(meta, hdr->pdr_length) * ss, &error); if (buf == NULL) goto readerror; meta->pdr = malloc(GET32(meta, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->pdr, buf, GET32(meta, hdr->pdr_length) * ss); g_free(buf); if (GET32(meta, pdr->Signature) != DDF_PDR_SIGNATURE) goto hdrerror; /* * Workaround for reading metadata corrupted due to graid bug. * XXX: Remove this before we have disks above 128PB. :) */ if (meta->bigendian) { for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) { if (isff(meta->pdr->entry[i].PD_GUID, 24)) continue; if (GET32(meta, pdr->entry[i].PD_Reference) == 0xffffffff) continue; if (GET64(meta, pdr->entry[i].Configured_Size) >= (1ULL << 48)) { SET16(meta, pdr->entry[i].PD_State, GET16(meta, pdr->entry[i].PD_State) & ~DDF_PDE_FAILED); SET64(meta, pdr->entry[i].Configured_Size, GET64(meta, pdr->entry[i].Configured_Size) & ((1ULL << 48) - 1)); } } } /* Read virtual disk records. 
*/ buf = g_read_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss, GET32(meta, hdr->vdr_length) * ss, &error); if (buf == NULL) goto readerror; meta->vdr = malloc(GET32(meta, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->vdr, buf, GET32(meta, hdr->vdr_length) * ss); g_free(buf); if (GET32(meta, vdr->Signature) != DDF_VD_RECORD_SIGNATURE) goto hdrerror; /* Read configuration records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss, GET32(meta, hdr->cr_length) * ss, &error); if (buf == NULL) goto readerror; meta->cr = malloc(GET32(meta, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cr, buf, GET32(meta, hdr->cr_length) * ss); g_free(buf); /* Read physical disk data. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss, GET32(meta, hdr->pdd_length) * ss, &error); if (buf == NULL) goto readerror; meta->pdd = malloc(GET32(meta, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->pdd, buf, GET32(meta, hdr->pdd_length) * ss); g_free(buf); if (GET32(meta, pdd->Signature) != DDF_PDD_SIGNATURE) goto hdrerror; i = ddf_meta_find_pd(meta, NULL, GET32(meta, pdd->PD_Reference)); if (i < 0) goto hdrerror; /* Read BBM Log. */ if (GET32(meta, hdr->bbmlog_section) != 0xffffffff && GET32(meta, hdr->bbmlog_length) != 0) { buf = g_read_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss, GET32(meta, hdr->bbmlog_length) * ss, &error); if (buf == NULL) goto readerror; meta->bbm = malloc(GET32(meta, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->bbm, buf, GET32(meta, hdr->bbmlog_length) * ss); g_free(buf); if (GET32(meta, bbm->Signature) != DDF_BBML_SIGNATURE) goto hdrerror; } done: g_free(abuf); if (error != 0) ddf_meta_free(meta); return (error); } static int ddf_meta_write(struct g_consumer *cp, struct ddf_meta *meta) { struct g_provider *pp; struct ddf_vdc_record *vdc; off_t alba, plba, slba, lba; u_int ss, size; int error, i, num; pp = cp->provider; ss = pp->sectorsize; lba = alba = pp->mediasize / ss - 1; plba = GET64(meta, hdr->Primary_Header_LBA); slba = GET64(meta, hdr->Secondary_Header_LBA); next: SET8(meta, hdr->Header_Type, (lba == alba) ? DDF_HEADER_ANCHOR : (lba == plba) ? 
DDF_HEADER_PRIMARY : DDF_HEADER_SECONDARY); SET32(meta, hdr->CRC, 0xffffffff); SET32(meta, hdr->CRC, crc32(meta->hdr, ss)); error = g_write_data(cp, lba * ss, meta->hdr, ss); if (error != 0) { err: G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); if (lba != alba) goto done; } if (lba == alba) { lba = plba; goto next; } size = GET32(meta, hdr->cd_length) * ss; SET32(meta, cdr->CRC, 0xffffffff); SET32(meta, cdr->CRC, crc32(meta->cdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss, meta->cdr, size); if (error != 0) goto err; size = GET32(meta, hdr->pdr_length) * ss; SET32(meta, pdr->CRC, 0xffffffff); SET32(meta, pdr->CRC, crc32(meta->pdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss, meta->pdr, size); if (error != 0) goto err; size = GET32(meta, hdr->vdr_length) * ss; SET32(meta, vdr->CRC, 0xffffffff); SET32(meta, vdr->CRC, crc32(meta->vdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss, meta->vdr, size); if (error != 0) goto err; size = GET16(meta, hdr->Configuration_Record_Length) * ss; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); SET32D(meta, vdc->CRC, 0xffffffff); SET32D(meta, vdc->CRC, crc32(vdc, size)); } error = g_write_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss, meta->cr, size * num); if (error != 0) goto err; size = GET32(meta, hdr->pdd_length) * ss; SET32(meta, pdd->CRC, 0xffffffff); SET32(meta, pdd->CRC, crc32(meta->pdd, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss, meta->pdd, size); if (error != 0) goto err; if (GET32(meta, hdr->bbmlog_length) != 0) { size = GET32(meta, hdr->bbmlog_length) * ss; SET32(meta, bbm->CRC, 0xffffffff); SET32(meta, bbm->CRC, crc32(meta->bbm, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss, meta->bbm, size); if (error != 0) goto err; } done: if (lba == plba && slba != -1) { lba = slba; goto next; } return (error); } static int ddf_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_DDF, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_DDF); return (error); } static struct g_raid_volume * g_raid_md_ddf_get_volume(struct g_raid_softc *sc, uint8_t *GUID) { struct g_raid_volume *vol; struct g_raid_md_ddf_pervolume *pv; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (memcmp(pv->pv_meta.vde->VD_GUID, GUID, 24) == 0) break; } return (vol); } static struct g_raid_disk * g_raid_md_ddf_get_disk(struct g_raid_softc *sc, uint8_t *GUID, uint32_t id) { struct g_raid_disk *disk; struct g_raid_md_ddf_perdisk *pd; struct ddf_meta *meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; meta = &pd->pd_meta; if (GUID != NULL) { if (memcmp(meta->pdd->PD_GUID, GUID, 24) == 0) break; } else { if (GET32(meta, pdd->PD_Reference) == id) break; } } return (disk); } static int g_raid_md_ddf_purge_volumes(struct g_raid_softc *sc) { struct g_raid_volume *vol, *tvol; struct g_raid_md_ddf_pervolume *pv; int i, res; res = 0; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { pv = vol->v_md_data; if (vol->v_stopping) continue; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) break; } if (i >= 
vol->v_disks_count) { g_raid_destroy_volume(vol); res = 1; } } return (res); } static int g_raid_md_ddf_purge_disks(struct g_raid_softc *sc) { #if 0 struct g_raid_disk *disk, *tdisk; struct g_raid_volume *vol; struct g_raid_md_ddf_perdisk *pd; int i, j, res; res = 0; TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_state == G_RAID_DISK_S_SPARE) continue; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; /* Scan for deleted volumes. */ for (i = 0; i < pd->pd_subdisks; ) { vol = g_raid_md_ddf_get_volume(sc, pd->pd_meta[i]->volume_id); if (vol != NULL && !vol->v_stopping) { i++; continue; } free(pd->pd_meta[i], M_MD_DDF); for (j = i; j < pd->pd_subdisks - 1; j++) pd->pd_meta[j] = pd->pd_meta[j + 1]; pd->pd_meta[DDF_MAX_SUBDISKS - 1] = NULL; pd->pd_subdisks--; pd->pd_updated = 1; } /* If there is no metadata left - erase and delete disk. */ if (pd->pd_subdisks == 0) { ddf_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); res = 1; } } return (res); #endif return (0); } static int g_raid_md_ddf_supported(int level, int qual, int disks, int force) { if (disks > DDF_MAX_DISKS_HARD) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks < 1) return (0); if (!force && disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (qual == G_RAID_VOLUME_RLQ_R1SM) { if (!force && disks != 2) return (0); } else if (qual == G_RAID_VOLUME_RLQ_R1MM) { if (!force && disks != 3) return (0); } else return (0); break; case G_RAID_VOLUME_RL_RAID3: if (qual != G_RAID_VOLUME_RLQ_R3P0 && qual != G_RAID_VOLUME_RLQ_R3PN) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID4: if (qual != G_RAID_VOLUME_RLQ_R4P0 && qual != G_RAID_VOLUME_RLQ_R4PN) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (qual != G_RAID_VOLUME_RLQ_R5RA && qual != G_RAID_VOLUME_RLQ_R5RS && qual != G_RAID_VOLUME_RLQ_R5LA && qual != G_RAID_VOLUME_RLQ_R5LS) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID6: if (qual != G_RAID_VOLUME_RLQ_R6RA && qual != G_RAID_VOLUME_RLQ_R6RS && qual != G_RAID_VOLUME_RLQ_R6LA && qual != G_RAID_VOLUME_RLQ_R6LS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAIDMDF: if (qual != G_RAID_VOLUME_RLQ_RMDFRA && qual != G_RAID_VOLUME_RLQ_RMDFRS && qual != G_RAID_VOLUME_RLQ_RMDFLA && qual != G_RAID_VOLUME_RLQ_RMDFLS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (qual != G_RAID_VOLUME_RLQ_R1EA && qual != G_RAID_VOLUME_RLQ_R1EO) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5E: if (qual != G_RAID_VOLUME_RLQ_R5ERA && qual != G_RAID_VOLUME_RLQ_R5ERS && qual != G_RAID_VOLUME_RLQ_R5ELA && qual != G_RAID_VOLUME_RLQ_R5ELS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID5EE: if (qual != G_RAID_VOLUME_RLQ_R5EERA && qual != G_RAID_VOLUME_RLQ_R5EERS && qual != G_RAID_VOLUME_RLQ_R5EELA && qual != G_RAID_VOLUME_RLQ_R5EELS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID5R: if (qual != G_RAID_VOLUME_RLQ_R5RRA && qual != G_RAID_VOLUME_RLQ_R5RRS && qual != G_RAID_VOLUME_RLQ_R5RLA && qual != G_RAID_VOLUME_RLQ_R5RLS) return (0); if (disks < 3) return (0); break; default: return (0); } return (1); } static 
int g_raid_md_ddf_start_disk(struct g_raid_disk *disk, struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_vol_meta *vmeta; struct ddf_meta *pdmeta, *gmeta; struct ddf_vdc_record *vdc1; struct ddf_sa_record *sa; off_t size, eoff = 0, esize = 0; uint64_t *val2; int disk_pos, md_disk_bvd = -1, md_disk_pos = -1, md_pde_pos; int i, resurrection = 0; uint32_t reference; sc = disk->d_softc; mdi = (struct g_raid_md_ddf_object *)sc->sc_md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; pdmeta = &pd->pd_meta; reference = GET32(&pd->pd_meta, pdd->PD_Reference); pv = vol->v_md_data; vmeta = &pv->pv_meta; gmeta = &mdi->mdio_meta; /* Find disk position in metadata by its reference. */ disk_pos = ddf_meta_find_disk(vmeta, reference, &md_disk_bvd, &md_disk_pos); md_pde_pos = ddf_meta_find_pd(gmeta, NULL, reference); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Disk %s is not a present part of the volume %s", g_raid_get_diskname(disk), vol->v_name); /* Failed stale disk is useless for us. */ if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) != 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If disk has some metadata for this volume - erase. */ if ((vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) SET32D(pdmeta, vdc1->Signature, 0xffffffff); /* If we are in the start process, that's all for now. */ if (!pv->pv_started) goto nofit; /* * If we have already started - try to make use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >= GET16(&pd->pd_meta, hdr->Max_Partitions)) { G_RAID_DEBUG1(1, sc, "No free partitions on disk %s", g_raid_get_diskname(disk)); goto nofit; } ddf_meta_unused_range(&pd->pd_meta, &eoff, &esize); if (esize == 0) { G_RAID_DEBUG1(1, sc, "No free space on disk %s", g_raid_get_diskname(disk)); goto nofit; } eoff *= pd->pd_meta.sectorsize; esize *= pd->pd_meta.sectorsize; size = INT64_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_NONE) size = sd->sd_size; if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && (disk_pos < 0 || vol->v_subdisks[i].sd_state < sd->sd_state)) disk_pos = i; } if (disk_pos >= 0 && vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && esize < size) { G_RAID_DEBUG1(1, sc, "Disk %s free space " "is too small (%ju < %ju)", g_raid_get_diskname(disk), esize, size); disk_pos = -1; } if (disk_pos >= 0) { if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) esize = size; md_disk_bvd = disk_pos / GET16(vmeta, vdc->Primary_Element_Count); // XXX md_disk_pos = disk_pos % GET16(vmeta, vdc->Primary_Element_Count); // XXX } else { nofit: if (disk->d_state == G_RAID_DISK_S_NONE) g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } /* * If spare is committable, delete spare record. * Otherwise, mark it active and leave there.
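 *
 * A spare assignment record without DDF_SAR_TYPE_REVERTIBLE is
 * consumed the moment the disk joins a volume, hence its signature is
 * wiped below; a revertible spare keeps its record and is merely
 * flagged DDF_SAR_TYPE_ACTIVE so it can be returned to the spare pool
 * later.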
*/ sa = ddf_meta_find_sa(&pd->pd_meta, 0); if (sa != NULL) { if ((GET8D(&pd->pd_meta, sa->Spare_Type) & DDF_SAR_TYPE_REVERTIBLE) == 0) { SET32D(&pd->pd_meta, sa->Signature, 0xffffffff); } else { SET8D(&pd->pd_meta, sa->Spare_Type, GET8D(&pd->pd_meta, sa->Spare_Type) | DDF_SAR_TYPE_ACTIVE); } } G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", g_raid_get_diskname(disk), disk_pos, vol->v_name); resurrection = 1; } sd = &vol->v_subdisks[disk_pos]; if (resurrection && sd->sd_disk != NULL) { g_raid_change_disk_state(sd->sd_disk, G_RAID_DISK_S_STALE_FAILED); TAILQ_REMOVE(&sd->sd_disk->d_subdisks, sd, sd_next); } vol->v_subdisks[disk_pos].sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (resurrection) { sd->sd_offset = eoff; sd->sd_size = esize; } else if (pdmeta->cr != NULL && (vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) { val2 = (uint64_t *)&(vdc1->Physical_Disk_Sequence[GET16(vmeta, hdr->Max_Primary_Element_Entries)]); sd->sd_offset = (off_t)GET64P(pdmeta, val2 + md_disk_pos) * 512; sd->sd_size = (off_t)GET64D(pdmeta, vdc1->Block_Count) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) { /* Failed disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & (DDF_PDE_FAILED | DDF_PDE_REBUILD)) != 0) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); sd->sd_rebuild_pos = 0; } else if ((GET8(vmeta, vde->VD_State) & DDF_VDE_DIRTY) != 0 || (GET8(vmeta, vde->Init_State) & DDF_VDE_INIT_MASK) != DDF_VDE_INIT_FULL) { /* Stale disk or dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); return (resurrection); } static void g_raid_md_ddf_refill(struct g_raid_softc *sc) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; int update, updated, i, bad; md = sc->sc_md; restart: updated = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; /* Search for subdisk that needs replacement. */ bad = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) bad = 1; } if (!bad) continue; G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " "trying to refill.", vol->v_name); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { /* Skip failed. */ if (disk->d_state < G_RAID_DISK_S_SPARE) continue; /* Skip already used by this volume. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == disk) break; } if (i < vol->v_disks_count) continue; /* Try to use disk if it has empty extents. 
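 *
 * This is checked in two steps: here only the number of configuration
 * records on the disk is compared against hdr->Max_Partitions, while
 * g_raid_md_ddf_start_disk() additionally verifies through
 * ddf_meta_unused_range() that a large enough free extent actually
 * exists.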
*/ pd = disk->d_md_data; if (ddf_meta_count_vdc(&pd->pd_meta, NULL) < GET16(&pd->pd_meta, hdr->Max_Partitions)) { update = g_raid_md_ddf_start_disk(disk, vol); } else update = 0; if (update) { updated = 1; g_raid_md_write_ddf(md, vol, NULL, disk); break; } } } if (updated) goto restart; } static void g_raid_md_ddf_start(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; uint64_t *val2; int i, j, bvd; sc = vol->v_softc; md = sc->sc_md; mdi = (struct g_raid_md_ddf_object *)md; pv = vol->v_md_data; vmeta = &pv->pv_meta; vdc = vmeta->vdc; vol->v_raid_level = GET8(vmeta, vdc->Primary_RAID_Level); vol->v_raid_level_qualifier = GET8(vmeta, vdc->RLQ); if (GET8(vmeta, vdc->Secondary_Element_Count) > 1 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 && GET8(vmeta, vdc->Secondary_RAID_Level) == 0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; vol->v_sectorsize = GET16(vmeta, vdc->Block_Size); if (vol->v_sectorsize == 0xffff) vol->v_sectorsize = vmeta->sectorsize; vol->v_strip_size = vol->v_sectorsize << GET8(vmeta, vdc->Stripe_Size); vol->v_disks_count = GET16(vmeta, vdc->Primary_Element_Count) * GET8(vmeta, vdc->Secondary_Element_Count); vol->v_mdf_pdisks = GET8(vmeta, vdc->MDF_Parity_Disks); vol->v_mdf_polynomial = GET16(vmeta, vdc->MDF_Parity_Generator_Polynomial); vol->v_mdf_method = GET8(vmeta, vdc->MDF_Constant_Generation_Method); if (GET8(vmeta, vdc->Rotate_Parity_count) > 31) vol->v_rotate_parity = 1; else vol->v_rotate_parity = 1 << GET8(vmeta, vdc->Rotate_Parity_count); vol->v_mediasize = GET64(vmeta, vdc->VD_Size) * vol->v_sectorsize; for (i = 0, j = 0, bvd = 0; i < vol->v_disks_count; i++, j++) { if (j == GET16(vmeta, vdc->Primary_Element_Count)) { j = 0; bvd++; } sd = &vol->v_subdisks[i]; if (vmeta->bvdc[bvd] == NULL) { sd->sd_offset = 0; sd->sd_size = GET64(vmeta, vdc->Block_Count) * vol->v_sectorsize; continue; } val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[ GET16(vmeta, hdr->Max_Primary_Element_Entries)]); sd->sd_offset = GET64P(vmeta, val2 + j) * vol->v_sectorsize; sd->sd_size = GET64(vmeta, bvdc[bvd]->Block_Count) * vol->v_sectorsize; } g_raid_start_volume(vol); /* Make all disks found till the moment take their places. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (ddf_meta_find_vdc(&pd->pd_meta, vmeta->vdc->VD_GUID) != NULL) g_raid_md_ddf_start_disk(disk, vol); } pv->pv_started = 1; mdi->mdio_starting--; callout_stop(&pv->pv_start_co); G_RAID_DEBUG1(0, sc, "Volume started."); g_raid_md_write_ddf(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. 
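 *
 * By now the volume geometry has been recovered from the VDC (RAID
 * level, strip size, per-subdisk offsets), all already tasted disks
 * have taken their positions, metadata has been rewritten and the
 * STARTING callout is stopped; refilling is the last step before the
 * START event is announced.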
*/ g_raid_md_ddf_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } static void g_raid_ddf_go(void *arg) { struct g_raid_volume *vol; struct g_raid_softc *sc; struct g_raid_md_ddf_pervolume *pv; vol = arg; pv = vol->v_md_data; sc = vol->v_softc; if (!pv->pv_started) { G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, G_RAID_EVENT_VOLUME); } } static void g_raid_md_ddf_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct g_raid_volume *vol; struct ddf_meta *pdmeta; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; struct ddf_vd_entry *vde; int i, j, k, num, have, need, cnt, spare; uint32_t val; char buf[17]; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_ddf_object *)md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; pdmeta = &pd->pd_meta; spare = -1; if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, pdmeta); else ddf_meta_update(&mdi->mdio_meta, pdmeta); num = GETCRNUM(pdmeta); for (j = 0; j < num; j++) { vdc = GETVDCPTR(pdmeta, j); val = GET32D(pdmeta, vdc->Signature); if (val == DDF_SA_SIGNATURE && spare == -1) spare = 1; if (val != DDF_VDCR_SIGNATURE) continue; spare = 0; k = ddf_meta_find_vd(pdmeta, vdc->VD_GUID); if (k < 0) continue; vde = &pdmeta->vdr->entry[k]; /* Look for volume with matching ID. */ vol = g_raid_md_ddf_get_volume(sc, vdc->VD_GUID); if (vol == NULL) { ddf_meta_get_name(pdmeta, k, buf); vol = g_raid_create_volume(sc, buf, GET16D(pdmeta, vde->VD_Number)); pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO); vol->v_md_data = pv; callout_init(&pv->pv_start_co, 1); callout_reset(&pv->pv_start_co, g_raid_start_timeout * hz, g_raid_ddf_go, vol); mdi->mdio_starting++; } else pv = vol->v_md_data; /* If we haven't started yet - check metadata freshness. */ vmeta = &pv->pv_meta; ddf_vol_meta_update(vmeta, pdmeta, vdc->VD_GUID, pv->pv_started); } if (spare == 1) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); g_raid_md_ddf_refill(sc); } TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; vmeta = &pv->pv_meta; if (ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID) == NULL) continue; if (pv->pv_started) { if (g_raid_md_ddf_start_disk(disk, vol)) g_raid_md_write_ddf(md, vol, NULL, NULL); continue; } /* If we collected all needed disks - start array. 
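 *
 * "Needed" is summed across all secondary elements (BVDs): each BVD
 * contributes its Primary_Element_Count, and a BVD whose configuration
 * record has not been seen yet counts as fully missing. "Have" counts
 * the PD references that resolve to an already known disk, so the
 * volume starts exactly when have == need.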
*/ need = 0; have = 0; for (k = 0; k < GET8(vmeta, vdc->Secondary_Element_Count); k++) { if (vmeta->bvdc[k] == NULL) { need += GET16(vmeta, vdc->Primary_Element_Count); continue; } cnt = GET16(vmeta, bvdc[k]->Primary_Element_Count); need += cnt; for (i = 0; i < cnt; i++) { val = GET32(vmeta, bvdc[k]->Physical_Disk_Sequence[i]); if (g_raid_md_ddf_get_disk(sc, NULL, val) != NULL) have++; } } G_RAID_DEBUG1(1, sc, "Volume %s now has %d of %d disks", vol->v_name, have, need); if (have == need) g_raid_md_ddf_start(vol); } } static int g_raid_md_create_req_ddf(struct g_raid_md_object *md, struct g_class *mp, struct gctl_req *req, struct g_geom **gp) { struct g_geom *geom; struct g_raid_softc *sc; struct g_raid_md_ddf_object *mdi, *mdi1; char name[16]; const char *fmtopt; int be = 1; mdi = (struct g_raid_md_ddf_object *)md; fmtopt = gctl_get_asciiparam(req, "fmtopt"); if (fmtopt == NULL || strcasecmp(fmtopt, "BE") == 0) be = 1; else if (strcasecmp(fmtopt, "LE") == 0) be = 0; else { gctl_error(req, "Incorrect fmtopt argument."); return (G_RAID_MD_TASTE_FAIL); } /* Search for existing node. */ LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_ddf_object *)sc->sc_md; if (mdi1->mdio_bigendian != be) continue; break; } if (geom != NULL) { *gp = geom; return (G_RAID_MD_TASTE_EXISTING); } /* Create new one if not found. */ mdi->mdio_bigendian = be; snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE"); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_ddf(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_softc *sc; struct g_raid_disk *disk; struct ddf_meta meta; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_object *mdi; struct g_geom *geom; int error, result, be; char name[16]; G_RAID_DEBUG(1, "Tasting DDF on %s", cp->provider->name); mdi = (struct g_raid_md_ddf_object *)md; pp = cp->provider; /* Read metadata from device. */ g_topology_unlock(); bzero(&meta, sizeof(meta)); error = ddf_meta_read(cp, &meta); g_topology_lock(); if (error != 0) return (G_RAID_MD_TASTE_FAIL); be = meta.bigendian; /* Metadata valid. Print it. */ g_raid_md_ddf_print(&meta); /* Search for matching node. */ sc = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi = (struct g_raid_md_ddf_object *)sc->sc_md; if (mdi->mdio_bigendian != be) continue; break; } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_bigendian = be; snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE"); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; } /* There is no return after this point, so we close passed consumer. 
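 *
 * The consumer passed in by the tasting code carries a single read
 * reference; it is dropped here and replaced by our own rcp, attached
 * to the same provider and opened r1w1e1 with G_CF_DIRECT_RECEIVE,
 * which becomes the disk's permanent consumer. Note that failure of
 * that exclusive open is currently ignored (its goto fail1 is
 * commented out).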
*/ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); pd->pd_meta = meta; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_ddf_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); } static int g_raid_md_event_ddf(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = md->mdo_softc; if (disk == NULL) return (-1); switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* Delete disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); g_raid_md_ddf_purge_volumes(sc); /* Write updated metadata to all disks. */ g_raid_md_write_ddf(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_ddf_refill(sc); return (0); } return (-2); } static int g_raid_md_volume_event_ddf(struct g_raid_md_object *md, struct g_raid_volume *vol, u_int event) { struct g_raid_md_ddf_pervolume *pv; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; switch (event) { case G_RAID_VOLUME_E_STARTMD: if (!pv->pv_started) g_raid_md_ddf_start(vol); return (0); } return (-2); } static int g_raid_md_ctl_ddf(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *disks[DDF_MAX_DISKS_HARD]; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_sa_record *sa; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t size, sectorsize, strip, offs[DDF_MAX_DISKS_HARD], esize; intmax_t *sizearg, *striparg; int i, numdisks, len, level, qual; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_ddf_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_ddf_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. 
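 *
 * Each argument after the level name is either "NONE" (a slot left
 * empty), the name of a provider already attached to this node (it
 * must be ACTIVE and still have a free partition slot), or a new
 * provider to be opened and stamped with fresh DDF metadata. Along
 * the way the smallest usable extent and the largest sector size are
 * accumulated into 'size' and 'sectorsize'.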
*/ size = INT64_MAX; sectorsize = 0; bzero(disks, sizeof(disks)); bzero(offs, sizeof(offs)); for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) continue; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk != NULL) { if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' is in a " "wrong state (%s).", diskname, g_raid_disk_state2str(disk->d_state)); error = -7; break; } pd = disk->d_md_data; if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >= GET16(&pd->pd_meta, hdr->Max_Partitions)) { gctl_error(req, "No free partitions " "on disk '%s'.", diskname); error = -7; break; } pp = disk->d_consumer->provider; disks[i] = disk; ddf_meta_unused_range(&pd->pd_meta, &offs[i], &esize); offs[i] *= pp->sectorsize; size = MIN(size, (off_t)esize * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); continue; } g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -8; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; disks[i] = disk; cp->private = disk; ddf_meta_create(disk, &mdi->mdio_meta); if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta); else ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta); g_topology_unlock(); g_raid_get_disk_info(disk); /* Reserve some space for metadata. */ size = MIN(size, GET64(&pd->pd_meta, pdr->entry[0].Configured_Size) * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); } if (error != 0) { for (i = 0; i < numdisks; i++) { if (disks[i] != NULL && disks[i]->d_state == G_RAID_DISK_S_NONE) g_raid_destroy_disk(disks[i]); } return (error); } if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1 || level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_SINGLE || level == G_RAID_VOLUME_RL_CONCAT) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... 
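Before the code continues, a compact restatement of the per-level capacity rules it applies; vol_mediasize() is a hypothetical helper written only for illustration (the driver computes this inline just below):

// 'size' is the per-disk usable size, already rounded to the strip.
static off_t
vol_mediasize(int level, int ndisks, off_t size)
{
        switch (level) {
        case G_RAID_VOLUME_RL_RAID0:
        case G_RAID_VOLUME_RL_CONCAT:
        case G_RAID_VOLUME_RL_SINGLE:
                return (size * ndisks);         // no redundancy
        case G_RAID_VOLUME_RL_RAID1:
                return (size);                  // full mirror
        case G_RAID_VOLUME_RL_RAID3:
        case G_RAID_VOLUME_RL_RAID4:
        case G_RAID_VOLUME_RL_RAID5:
                return (size * (ndisks - 1));   // one disk of parity
        case G_RAID_VOLUME_RL_RAID6:
                return (size * (ndisks - 2));   // two disks of parity
        default:
                return (0);   // RAID5R/RAID1E/RAIDMDF: see below
        }
}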
*/ pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO); ddf_vol_meta_create(&pv->pv_meta, &mdi->mdio_meta); pv->pv_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_RAID4 || level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else if (level == G_RAID_VOLUME_RL_RAID5R) { vol->v_mediasize = size * (numdisks - 1); vol->v_rotate_parity = 1024; } else if (level == G_RAID_VOLUME_RL_RAID6 || level == G_RAID_VOLUME_RL_RAID5E || level == G_RAID_VOLUME_RL_RAID5EE) vol->v_mediasize = size * (numdisks - 2); else if (level == G_RAID_VOLUME_RL_RAIDMDF) { if (numdisks < 5) vol->v_mdf_pdisks = 2; else vol->v_mdf_pdisks = 3; vol->v_mdf_polynomial = 0x11d; vol->v_mdf_method = 0x00; vol->v_mediasize = size * (numdisks - vol->v_mdf_pdisks); } else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = disks[i]; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = offs[i]; sd->sd_size = size; if (disk == NULL) continue; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_ddf(md, vol, NULL, NULL); /* Pick up any STALE/SPARE disks to refill the array if needed. */ g_raid_md_ddf_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { gctl_error(req, "`add` command is not applicable, " "use `label` instead."); return (-99); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) ddf_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy the specified volume; if it was the last one, destroy the whole node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. 
*/ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. */ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_ddf_purge_disks(sc); g_raid_md_write_ddf(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) ddf_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_ddf(md, NULL, disk); continue; } /* Erase metadata on deleting disk and destroy it. */ ddf_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); } g_raid_md_ddf_purge_volumes(sc); /* Write updated metadata to remaining disks. */ g_raid_md_write_ddf(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_ddf_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); /* Welcome the "new" disk. 
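The spare-assignment record initialized just below sizes its entry array to whatever fits in one configuration-record slot. A worked example with assumed geometry (the struct sizes follow from md_ddf.h in this changeset):

// Assuming Configuration_Record_Length = 8 sectors, sectorsize = 512:
//   slot bytes = 8 * 512 = 4096
//   sizeof(struct ddf_sa_record) = 32 (fixed header)
//   sizeof(struct ddf_sa_entry)  = 32
//   MAX_SAE_Supported = (4096 - 32) / 32 = 127 entries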
*/ g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); ddf_meta_create(disk, &mdi->mdio_meta); sa = ddf_meta_find_sa(&pd->pd_meta, 1); if (sa != NULL) { SET32D(&pd->pd_meta, sa->Signature, DDF_SA_SIGNATURE); SET8D(&pd->pd_meta, sa->Spare_Type, 0); SET16D(&pd->pd_meta, sa->Populated_SAEs, 0); SET16D(&pd->pd_meta, sa->MAX_SAE_Supported, (GET16(&pd->pd_meta, hdr->Configuration_Record_Length) * pd->pd_meta.sectorsize - sizeof(struct ddf_sa_record)) / sizeof(struct ddf_sa_entry)); } if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta); else ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta); g_raid_md_write_ddf(md, NULL, NULL, NULL); g_raid_md_ddf_refill(sc); } return (error); } return (-100); } static int g_raid_md_write_ddf(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_meta *gmeta; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; struct ddf_sa_record *sa; uint64_t *val2; int i, j, pos, bvd, size; sc = md->mdo_softc; mdi = (struct g_raid_md_ddf_object *)md; gmeta = &mdi->mdio_meta; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* * Clear disk flags so that only the flags still needed get set again. * Do it only if no volumes are in starting state now, as those may * still be updating disk statuses and we could clear state that is * still valid. */ if (mdi->mdio_starting == 0) { for (i = 0; i < GET16(gmeta, pdr->Populated_PDEs); i++) { if (isff(gmeta->pdr->entry[i].PD_GUID, 24)) continue; SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) & ~(DDF_PDE_PARTICIPATING | DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)); if ((GET16(gmeta, pdr->entry[i].PD_State) & DDF_PDE_PFA) == 0) SET16(gmeta, pdr->entry[i].PD_State, 0); } } /* Generate/update new per-volume metadata. 
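Note that every field access in this function goes through the GET16()/SET16() family of accessors because DDF metadata may be stored big- or little-endian (the "DDF" and "DDF-LE" nodes created earlier). A minimal sketch of what such an accessor abstracts; ddf_get16() is a made-up name, and the real macros also locate the field inside the parsed metadata:

// be16toh() and le16toh() are the byte-order helpers from sys/endian.h.
static inline uint16_t
ddf_get16(int bigendian, uint16_t raw)
{
        return (bigendian ? be16toh(raw) : le16toh(raw));
}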
*/ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; if (vol->v_stopping || !pv->pv_started) continue; vmeta = &pv->pv_meta; SET32(vmeta, vdc->Sequence_Number, GET32(vmeta, vdc->Sequence_Number) + 1); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E && vol->v_disks_count % 2 == 0) SET16(vmeta, vdc->Primary_Element_Count, 2); else SET16(vmeta, vdc->Primary_Element_Count, vol->v_disks_count); SET8(vmeta, vdc->Stripe_Size, ffs(vol->v_strip_size / vol->v_sectorsize) - 1); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E && vol->v_disks_count % 2 == 0) { SET8(vmeta, vdc->Primary_RAID_Level, DDF_VDCR_RAID1); SET8(vmeta, vdc->RLQ, 0); SET8(vmeta, vdc->Secondary_Element_Count, vol->v_disks_count / 2); SET8(vmeta, vdc->Secondary_RAID_Level, 0); } else { SET8(vmeta, vdc->Primary_RAID_Level, vol->v_raid_level); SET8(vmeta, vdc->RLQ, vol->v_raid_level_qualifier); SET8(vmeta, vdc->Secondary_Element_Count, 1); SET8(vmeta, vdc->Secondary_RAID_Level, 0); } SET8(vmeta, vdc->Secondary_Element_Seq, 0); SET64(vmeta, vdc->Block_Count, 0); SET64(vmeta, vdc->VD_Size, vol->v_mediasize / vol->v_sectorsize); SET16(vmeta, vdc->Block_Size, vol->v_sectorsize); SET8(vmeta, vdc->Rotate_Parity_count, fls(vol->v_rotate_parity) - 1); SET8(vmeta, vdc->MDF_Parity_Disks, vol->v_mdf_pdisks); SET16(vmeta, vdc->MDF_Parity_Generator_Polynomial, vol->v_mdf_polynomial); SET8(vmeta, vdc->MDF_Constant_Generation_Method, vol->v_mdf_method); SET16(vmeta, vde->VD_Number, vol->v_global_id); if (vol->v_state <= G_RAID_VOLUME_S_BROKEN) SET8(vmeta, vde->VD_State, DDF_VDE_FAILED); else if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) SET8(vmeta, vde->VD_State, DDF_VDE_DEGRADED); else if (vol->v_state <= G_RAID_VOLUME_S_SUBOPTIMAL) SET8(vmeta, vde->VD_State, DDF_VDE_PARTIAL); else SET8(vmeta, vde->VD_State, DDF_VDE_OPTIMAL); if (vol->v_dirty || g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) > 0 || g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) > 0) SET8(vmeta, vde->VD_State, GET8(vmeta, vde->VD_State) | DDF_VDE_DIRTY); SET8(vmeta, vde->Init_State, DDF_VDE_INIT_FULL); // XXX ddf_meta_put_name(vmeta, vol->v_name); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; bvd = i / GET16(vmeta, vdc->Primary_Element_Count); pos = i % GET16(vmeta, vdc->Primary_Element_Count); disk = sd->sd_disk; if (disk != NULL) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (vmeta->bvdc[bvd] == NULL) { size = GET16(vmeta, hdr->Configuration_Record_Length) * vmeta->sectorsize; vmeta->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK); memset(vmeta->bvdc[bvd], 0xff, size); } memcpy(vmeta->bvdc[bvd], vmeta->vdc, sizeof(struct ddf_vdc_record)); SET8(vmeta, bvdc[bvd]->Secondary_Element_Seq, bvd); SET64(vmeta, bvdc[bvd]->Block_Count, sd->sd_size / vol->v_sectorsize); SET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos], GET32(&pd->pd_meta, pdd->PD_Reference)); val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[ GET16(vmeta, hdr->Max_Primary_Element_Entries)]); SET64P(vmeta, val2 + pos, sd->sd_offset / vol->v_sectorsize); } if (vmeta->bvdc[bvd] == NULL) continue; j = ddf_meta_find_pd(gmeta, NULL, GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos])); if (j < 0) continue; SET16(gmeta, pdr->entry[j].PD_Type, GET16(gmeta, pdr->entry[j].PD_Type) | DDF_PDE_PARTICIPATING); if (sd->sd_state == G_RAID_SUBDISK_S_NONE) SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | (DDF_PDE_FAILED | DDF_PDE_MISSING)); else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) SET16(gmeta, 
pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | (DDF_PDE_FAILED | DDF_PDE_PFA)); else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | DDF_PDE_REBUILD); else SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | DDF_PDE_ONLINE); } } /* Mark spare and failed disks as such. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; i = ddf_meta_find_pd(gmeta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference)); if (i < 0) continue; if (disk->d_state == G_RAID_DISK_S_FAILED) { SET16(gmeta, pdr->entry[i].PD_State, GET16(gmeta, pdr->entry[i].PD_State) | (DDF_PDE_FAILED | DDF_PDE_PFA)); } if (disk->d_state != G_RAID_DISK_S_SPARE) continue; sa = ddf_meta_find_sa(&pd->pd_meta, 0); if (sa == NULL || (GET8D(&pd->pd_meta, sa->Spare_Type) & DDF_SAR_TYPE_DEDICATED) == 0) { SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) | DDF_PDE_GLOBAL_SPARE); } else { SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) | DDF_PDE_CONFIG_SPARE); } SET16(gmeta, pdr->entry[i].PD_State, GET16(gmeta, pdr->entry[i].PD_State) | DDF_PDE_ONLINE); } /* Remove disks without "participating" flag (unused). */ for (i = 0, j = -1; i < GET16(gmeta, pdr->Populated_PDEs); i++) { if (isff(gmeta->pdr->entry[i].PD_GUID, 24)) continue; if ((GET16(gmeta, pdr->entry[i].PD_Type) & (DDF_PDE_PARTICIPATING | DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)) != 0 || g_raid_md_ddf_get_disk(sc, NULL, GET32(gmeta, pdr->entry[i].PD_Reference)) != NULL) j = i; else memset(&gmeta->pdr->entry[i], 0xff, sizeof(struct ddf_pd_entry)); } SET16(gmeta, pdr->Populated_PDEs, j + 1); /* Update per-disk metadata and write them. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; /* Update PDR. */ memcpy(pd->pd_meta.pdr, gmeta->pdr, GET32(&pd->pd_meta, hdr->pdr_length) * pd->pd_meta.sectorsize); /* Update VDR. */ SET16(&pd->pd_meta, vdr->Populated_VDEs, 0); TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; i = ddf_meta_find_vd(&pd->pd_meta, pv->pv_meta.vde->VD_GUID); if (i < 0) i = ddf_meta_find_vd(&pd->pd_meta, NULL); if (i >= 0) memcpy(&pd->pd_meta.vdr->entry[i], pv->pv_meta.vde, sizeof(struct ddf_vd_entry)); } /* Update VDC. */ if (mdi->mdio_starting == 0) { /* Remove all VDCs; the ones still needed will be restored later. 
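The loop that follows walks the configuration-record section slot by slot. GETCRNUM() and GETVDCPTR() are macros from this file; the sketch below only spells out the layout they are assumed to encode (base, slotbytes and sig32()/put32() are illustrative stand-ins, not real helpers):

// Each slot spans Configuration_Record_Length sectors and begins with
// a 32-bit signature; DDF_VDCR_SIGNATURE marks a VD configuration
// record, and writing 0xffffffff (DDF_CR_INVALID) releases the slot.
for (int i = 0; i < crnum; i++) {
        uint8_t *slot = base + (size_t)i * slotbytes;
        if (sig32(slot) == DDF_VDCR_SIGNATURE)
                put32(slot, DDF_CR_INVALID);
}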
*/ j = GETCRNUM(&pd->pd_meta); for (i = 0; i < j; i++) { vdc = GETVDCPTR(&pd->pd_meta, i); if (GET32D(&pd->pd_meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; SET32D(&pd->pd_meta, vdc->Signature, 0xffffffff); } } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { vol = sd->sd_volume; if (vol->v_stopping) continue; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; vmeta = &pv->pv_meta; vdc = ddf_meta_find_vdc(&pd->pd_meta, vmeta->vde->VD_GUID); if (vdc == NULL) vdc = ddf_meta_find_vdc(&pd->pd_meta, NULL); if (vdc != NULL) { bvd = sd->sd_pos / GET16(vmeta, vdc->Primary_Element_Count); memcpy(vdc, vmeta->bvdc[bvd], GET16(&pd->pd_meta, hdr->Configuration_Record_Length) * pd->pd_meta.sectorsize); } } G_RAID_DEBUG(1, "Writing DDF metadata to %s", g_raid_get_diskname(disk)); g_raid_md_ddf_print(&pd->pd_meta); ddf_meta_write(disk->d_consumer, &pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_ddf(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_ddf_perdisk *pd; struct g_raid_subdisk *sd; int i; sc = md->mdo_softc; pd = (struct g_raid_md_ddf_perdisk *)tdisk->d_md_data; /* We can't fail a disk that is not part of the array right now. */ if (tdisk->d_state != G_RAID_DISK_S_ACTIVE) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent its later resurrection as STALE. */ G_RAID_DEBUG(1, "Writing DDF metadata to %s", g_raid_get_diskname(tdisk)); i = ddf_meta_find_pd(&pd->pd_meta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference)); SET16(&pd->pd_meta, pdr->entry[i].PD_State, DDF_PDE_FAILED | DDF_PDE_PFA); if (tdisk->d_consumer != NULL) ddf_meta_write(tdisk->d_consumer, &pd->pd_meta); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. 
*/ g_raid_md_write_ddf(md, NULL, NULL, tdisk); g_raid_md_ddf_refill(sc); return (0); } static int g_raid_md_free_disk_ddf(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_ddf_perdisk *pd; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; ddf_meta_free(&pd->pd_meta); free(pd, M_MD_DDF); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_ddf(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_ddf_object *mdi; struct g_raid_md_ddf_pervolume *pv; mdi = (struct g_raid_md_ddf_object *)md; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; ddf_vol_meta_free(&pv->pv_meta); if (!pv->pv_started) { pv->pv_started = 1; mdi->mdio_starting--; callout_stop(&pv->pv_start_co); } free(pv, M_MD_DDF); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_ddf(struct g_raid_md_object *md) { struct g_raid_md_ddf_object *mdi; mdi = (struct g_raid_md_ddf_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } ddf_meta_free(&mdi->mdio_meta); return (0); } G_RAID_MD_DECLARE(ddf, "DDF"); Index: head/sys/geom/raid/md_ddf.h =================================================================== --- head/sys/geom/raid/md_ddf.h (revision 326269) +++ head/sys/geom/raid/md_ddf.h (revision 326270) @@ -1,345 +1,347 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Alexander Motin * Copyright (c) 2008 Scott Long * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef MD_DDF_H #define MD_DDF_H /* Definitions from the SNIA DDF spec, rev 1.2/2.0 */ #define DDF_HEADER_LENGTH 512 struct ddf_header { uint32_t Signature; #define DDF_HEADER_SIGNATURE 0xde11de11 uint32_t CRC; uint8_t DDF_Header_GUID[24]; uint8_t DDF_rev[8]; uint32_t Sequence_Number; uint32_t TimeStamp; uint8_t Open_Flag; #define DDF_HEADER_CLOSED 0x00 #define DDF_HEADER_OPENED_MASK 0x0f #define DDF_HEADER_OPEN_ANCHOR 0xff uint8_t Foreign_Flag; uint8_t Diskgrouping; uint8_t pad1[13]; uint8_t Header_ext[32]; uint64_t Primary_Header_LBA; uint64_t Secondary_Header_LBA; uint8_t Header_Type; #define DDF_HEADER_ANCHOR 0x00 #define DDF_HEADER_PRIMARY 0x01 #define DDF_HEADER_SECONDARY 0x02 uint8_t pad2[3]; uint32_t WorkSpace_Length; uint64_t WorkSpace_LBA; uint16_t Max_PD_Entries; uint16_t Max_VD_Entries; uint16_t Max_Partitions; uint16_t Configuration_Record_Length; uint16_t Max_Primary_Element_Entries; uint32_t Max_Mapped_Block_Entries; /* DDF 2.0 */ uint8_t pad3[50]; uint32_t cd_section; /* Controller_Data_Section */ uint32_t cd_length; /* Controller_Data_Section_Length */ uint32_t pdr_section; /* Physical_Drive_Records_Section */ uint32_t pdr_length; /* Physical_Drive_Records_Length */ uint32_t vdr_section; /* Virtual_Drive_Records_Section */ uint32_t vdr_length; /* Virtual_Drive_Records_Length */ uint32_t cr_section; /* Configuration_Records_Section */ uint32_t cr_length; /* Configuration_Records_Length */ uint32_t pdd_section; /* Physical_Drive_Data_Section */ uint32_t pdd_length; /* Physical_Drive_Data_Length */ uint32_t bbmlog_section; /* BBM_Log_Section */ uint32_t bbmlog_length; /* BBM_Log_Section_Length */ uint32_t Diagnostic_Space; uint32_t Diagnostic_Space_Length; uint32_t Vendor_Specific_Logs; uint32_t Vendor_Specific_Logs_Length; uint8_t pad4[256]; } __packed; struct ddf_cd_record { uint32_t Signature; #define DDF_CONTROLLER_DATA_SIGNATURE 0xad111111 uint32_t CRC; uint8_t Controller_GUID[24]; struct { uint16_t Vendor_ID; uint16_t Device_ID; uint16_t SubVendor_ID; uint16_t SubDevice_ID; } Controller_Type __packed; uint8_t Product_ID[16]; uint8_t pad1[8]; uint8_t Controller_Data[448]; } __packed; struct ddf_device_scsi { uint8_t Lun; uint8_t Id; uint8_t Channel; uint8_t Path_Flags; #define DDF_DEVICE_SCSI_FLAG_BROKEN (1 << 7) } __packed; struct ddf_device_sas { uint64_t Initiator_Path; } __packed; union ddf_pathinfo { struct { struct ddf_device_scsi Path0; struct ddf_device_scsi Path1; uint8_t pad[10]; } __packed scsi; struct { struct ddf_device_sas Path0; struct ddf_device_sas Path1; uint8_t Path0_Flags; uint8_t Path1_Flags; #define DDF_DEVICE_SAS_PHY_ID 0x7f #define DDF_DEVICE_SAS_FLAG_BROKEN (1 << 7) } __packed sas; } __packed; struct ddf_pd_entry { uint8_t PD_GUID[24]; uint32_t PD_Reference; uint16_t PD_Type; #define DDF_PDE_GUID_FORCE (1 << 0) #define DDF_PDE_PARTICIPATING (1 << 1) #define DDF_PDE_GLOBAL_SPARE (1 << 2) #define DDF_PDE_CONFIG_SPARE (1 << 3) #define DDF_PDE_FOREIGN (1 << 4) #define DDF_PDE_LEGACY (1 << 5) #define DDF_PDE_TYPE_MASK (0x0f << 12) #define DDF_PDE_UNKNOWN (0x00 << 12) #define DDF_PDE_SCSI (0x01 << 12) #define DDF_PDE_SAS (0x02 << 12) #define DDF_PDE_SATA (0x03 << 12) #define DDF_PDE_FC (0x04 << 12) uint16_t PD_State; #define DDF_PDE_ONLINE (1 << 0) #define DDF_PDE_FAILED (1 << 1) #define DDF_PDE_REBUILD (1 << 2) #define DDF_PDE_TRANSITION (1 << 3) #define DDF_PDE_PFA (1 << 4) #define DDF_PDE_UNRECOVERED (1 << 5) #define DDF_PDE_MISSING (1 << 6) uint64_t Configured_Size; union ddf_pathinfo Path_Information; uint16_t Block_Size; /* DDF 
2.0 */ uint8_t pad1[4]; } __packed; struct ddf_pd_record { uint32_t Signature; #define DDF_PDR_SIGNATURE 0x22222222 uint32_t CRC; uint16_t Populated_PDEs; uint16_t Max_PDE_Supported; uint8_t pad1[52]; struct ddf_pd_entry entry[0]; } __packed; struct ddf_vd_entry { uint8_t VD_GUID[24]; uint16_t VD_Number; uint8_t pad1[2]; uint16_t VD_Type; #define DDF_VDE_SHARED (1 << 0) #define DDF_VDE_ENFORCE_GROUP (1 << 1) #define DDF_VDE_UNICODE_NAME (1 << 2) #define DDF_VDE_OWNER_ID_VALID (1 << 3) uint16_t Controller_GUID_CRC; uint8_t VD_State; #define DDF_VDE_OPTIMAL 0x00 #define DDF_VDE_DEGRADED 0x01 #define DDF_VDE_DELETED 0x02 #define DDF_VDE_MISSING 0x03 #define DDF_VDE_FAILED 0x04 #define DDF_VDE_PARTIAL 0x05 #define DDF_VDE_OFFLINE 0x06 #define DDF_VDE_STATE_MASK 0x07 #define DDF_VDE_MORPH (1 << 3) #define DDF_VDE_DIRTY (1 << 4) uint8_t Init_State; #define DDF_VDE_UNINTIALIZED 0x00 #define DDF_VDE_INIT_QUICK 0x01 #define DDF_VDE_INIT_FULL 0x02 #define DDF_VDE_INIT_MASK 0x03 #define DDF_VDE_UACCESS_RW 0x00 #define DDF_VDE_UACCESS_RO 0x80 #define DDF_VDE_UACCESS_BLOCKED 0xc0 #define DDF_VDE_UACCESS_MASK 0xc0 uint8_t Drive_Failures_Remaining; /* DDF 2.0 */ uint8_t pad2[13]; uint8_t VD_Name[16]; } __packed; struct ddf_vd_record { uint32_t Signature; #define DDF_VD_RECORD_SIGNATURE 0xdddddddd uint32_t CRC; uint16_t Populated_VDEs; uint16_t Max_VDE_Supported; uint8_t pad1[52]; struct ddf_vd_entry entry[0]; } __packed; #define DDF_CR_INVALID 0xffffffff struct ddf_vdc_record { uint32_t Signature; #define DDF_VDCR_SIGNATURE 0xeeeeeeee uint32_t CRC; uint8_t VD_GUID[24]; uint32_t Timestamp; uint32_t Sequence_Number; uint8_t pad1[24]; uint16_t Primary_Element_Count; uint8_t Stripe_Size; uint8_t Primary_RAID_Level; #define DDF_VDCR_RAID0 0x00 #define DDF_VDCR_RAID1 0x01 #define DDF_VDCR_RAID3 0x03 #define DDF_VDCR_RAID4 0x04 #define DDF_VDCR_RAID5 0x05 #define DDF_VDCR_RAID6 0x06 #define DDF_VDCR_RAID1E 0x11 #define DDF_VDCR_SINGLE 0x0f #define DDF_VDCR_CONCAT 0x1f #define DDF_VDCR_RAID5E 0x15 #define DDF_VDCR_RAID5EE 0x25 uint8_t RLQ; uint8_t Secondary_Element_Count; uint8_t Secondary_Element_Seq; uint8_t Secondary_RAID_Level; uint64_t Block_Count; uint64_t VD_Size; uint16_t Block_Size; /* DDF 2.0 */ uint8_t Rotate_Parity_count; /* DDF 2.0 */ uint8_t pad2[5]; uint32_t Associated_Spares[8]; uint64_t Cache_Flags; #define DDF_VDCR_CACHE_WB (1 << 0) #define DDF_VDCR_CACHE_WB_ADAPTIVE (1 << 1) #define DDF_VDCR_CACHE_RA (1 << 2) #define DDF_VDCR_CACHE_RA_ADAPTIVE (1 << 3) #define DDF_VDCR_CACHE_WCACHE_NOBATTERY (1 << 4) #define DDF_VDCR_CACHE_WCACHE_ALLOW (1 << 5) #define DDF_VDCR_CACHE_RCACHE_ALLOW (1 << 6) #define DDF_VDCR_CACHE_VENDOR (1 << 7) uint8_t BG_Rate; uint8_t pad3[3]; uint8_t MDF_Parity_Disks; /* DDF 2.0 */ uint16_t MDF_Parity_Generator_Polynomial; /* DDF 2.0 */ uint8_t pad4; uint8_t MDF_Constant_Generation_Method; /* DDF 2.0 */ uint8_t pad5[47]; uint8_t pad6[192]; uint8_t V0[32]; uint8_t V1[32]; uint8_t V2[16]; uint8_t V3[16]; uint8_t Vendor_Scratch[32]; uint32_t Physical_Disk_Sequence[0]; } __packed; struct ddf_vuc_record { uint32_t Signature; #define DDF_VUCR_SIGNATURE 0x88888888 uint32_t CRC; uint8_t VD_GUID[24]; } __packed; struct ddf_sa_entry { uint8_t VD_GUID[24]; uint16_t Secondary_Element; uint8_t rsrvd2[6]; } __packed; struct ddf_sa_record { uint32_t Signature; #define DDF_SA_SIGNATURE 0x55555555 uint32_t CRC; uint32_t Timestamp; uint8_t pad1[7]; uint8_t Spare_Type; #define DDF_SAR_TYPE_DEDICATED (1 << 0) #define DDF_SAR_TYPE_REVERTIBLE (1 << 1) #define DDF_SAR_TYPE_ACTIVE (1 << 2) #define 
DDF_SAR_TYPE_ENCL_AFFINITY (1 << 3) uint16_t Populated_SAEs; uint16_t MAX_SAE_Supported; uint8_t pad2[8]; struct ddf_sa_entry entry[0]; } __packed; struct ddf_pdd_record { uint32_t Signature; #define DDF_PDD_SIGNATURE 0x33333333 uint32_t CRC; uint8_t PD_GUID[24]; uint32_t PD_Reference; uint8_t Forced_Ref_Flag; #define DDF_PDD_FORCED_REF 0x01 uint8_t Forced_PD_GUID_Flag; #define DDF_PDD_FORCED_GUID 0x01 uint8_t Vendor_Scratch[32]; uint8_t pad2[442]; } __packed; struct ddf_bbm_entry { uint64_t Defective_Block_Start; uint32_t Spare_Block_Offset; uint16_t Remapped_Count; uint8_t pad[2]; }; struct ddf_bbm_log { uint32_t Signature; #define DDF_BBML_SIGNATURE 0xabadb10c uint32_t CRC; uint32_t Entry_Count; uint32_t Spare_Block_Count; uint8_t pad1[8]; uint64_t First_Spare_LBA; uint64_t Mapped_Block_Entry[0]; } __packed; struct ddf_vendor_log { uint32_t Signature; #define DDF_VENDOR_LOG_SIGNATURE 0x01dbeef0 uint32_t CRC; uint64_t Log_Owner; uint8_t pad1[16]; } __packed; #endif Index: head/sys/geom/raid/md_intel.c =================================================================== --- head/sys/geom/raid/md_intel.c (revision 326269) +++ head/sys/geom/raid/md_intel.c (revision 326270) @@ -1,2714 +1,2716 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata"); struct intel_raid_map { uint32_t offset; uint32_t disk_sectors; uint32_t stripe_count; uint16_t strip_sectors; uint8_t status; #define INTEL_S_READY 0x00 #define INTEL_S_UNINITIALIZED 0x01 #define INTEL_S_DEGRADED 0x02 #define INTEL_S_FAILURE 0x03 uint8_t type; #define INTEL_T_RAID0 0x00 #define INTEL_T_RAID1 0x01 #define INTEL_T_RAID5 0x05 uint8_t total_disks; uint8_t total_domains; uint8_t failed_disk_num; uint8_t ddf; uint32_t offset_hi; uint32_t disk_sectors_hi; uint32_t stripe_count_hi; uint32_t filler_2[4]; uint32_t disk_idx[1]; /* total_disks entries. 
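disk_idx[1] is a variable-length tail: a map really occupies sizeof(struct intel_raid_map) plus 4 * (total_disks - 1) extra bytes, and the second map of a migrating volume starts right after the first map's index array. A sketch of that walk (next_map() is an assumed name; intel_get_map() later in this file computes the same address):

static struct intel_raid_map *
next_map(struct intel_raid_map *mmap)
{
        // One element past the used index array is where the
        // following record begins.
        return ((struct intel_raid_map *)&mmap->disk_idx[mmap->total_disks]);
}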
*/ #define INTEL_DI_IDX 0x00ffffff #define INTEL_DI_RBLD 0x01000000 } __packed; struct intel_raid_vol { uint8_t name[16]; u_int64_t total_sectors __packed; uint32_t state; #define INTEL_ST_BOOTABLE 0x00000001 #define INTEL_ST_BOOT_DEVICE 0x00000002 #define INTEL_ST_READ_COALESCING 0x00000004 #define INTEL_ST_WRITE_COALESCING 0x00000008 #define INTEL_ST_LAST_SHUTDOWN_DIRTY 0x00000010 #define INTEL_ST_HIDDEN_AT_BOOT 0x00000020 #define INTEL_ST_CURRENTLY_HIDDEN 0x00000040 #define INTEL_ST_VERIFY_AND_FIX 0x00000080 #define INTEL_ST_MAP_STATE_UNINIT 0x00000100 #define INTEL_ST_NO_AUTO_RECOVERY 0x00000200 #define INTEL_ST_CLONE_N_GO 0x00000400 #define INTEL_ST_CLONE_MAN_SYNC 0x00000800 #define INTEL_ST_CNG_MASTER_DISK_NUM 0x00001000 uint32_t reserved; uint8_t migr_priority; uint8_t num_sub_vols; uint8_t tid; uint8_t cng_master_disk; uint16_t cache_policy; uint8_t cng_state; #define INTEL_CNGST_UPDATED 0 #define INTEL_CNGST_NEEDS_UPDATE 1 #define INTEL_CNGST_MASTER_MISSING 2 uint8_t cng_sub_state; uint32_t filler_0[10]; uint32_t curr_migr_unit; uint32_t checkpoint_id; uint8_t migr_state; uint8_t migr_type; #define INTEL_MT_INIT 0 #define INTEL_MT_REBUILD 1 #define INTEL_MT_VERIFY 2 #define INTEL_MT_GEN_MIGR 3 #define INTEL_MT_STATE_CHANGE 4 #define INTEL_MT_REPAIR 5 uint8_t dirty; uint8_t fs_state; uint16_t verify_errors; uint16_t bad_blocks; uint32_t curr_migr_unit_hi; uint32_t filler_1[3]; struct intel_raid_map map[1]; /* 2 entries if migr_state != 0. */ } __packed; struct intel_raid_disk { #define INTEL_SERIAL_LEN 16 uint8_t serial[INTEL_SERIAL_LEN]; uint32_t sectors; uint32_t id; uint32_t flags; #define INTEL_F_SPARE 0x01 #define INTEL_F_ASSIGNED 0x02 #define INTEL_F_FAILED 0x04 #define INTEL_F_ONLINE 0x08 #define INTEL_F_DISABLED 0x80 uint32_t owner_cfg_num; uint32_t sectors_hi; uint32_t filler[3]; } __packed; struct intel_raid_conf { uint8_t intel_id[24]; #define INTEL_MAGIC "Intel Raid ISM Cfg Sig. " uint8_t version[6]; #define INTEL_VERSION_1000 "1.0.00" /* RAID0 */ #define INTEL_VERSION_1100 "1.1.00" /* RAID1 */ #define INTEL_VERSION_1200 "1.2.00" /* Many volumes */ #define INTEL_VERSION_1201 "1.2.01" /* 3 or 4 disks */ #define INTEL_VERSION_1202 "1.2.02" /* RAID5 */ #define INTEL_VERSION_1204 "1.2.04" /* 5 or 6 disks */ #define INTEL_VERSION_1206 "1.2.06" /* CNG */ #define INTEL_VERSION_1300 "1.3.00" /* Attributes */ uint8_t dummy_0[2]; uint32_t checksum; uint32_t config_size; uint32_t config_id; uint32_t generation; uint32_t error_log_size; uint32_t attributes; #define INTEL_ATTR_RAID0 0x00000001 #define INTEL_ATTR_RAID1 0x00000002 #define INTEL_ATTR_RAID10 0x00000004 #define INTEL_ATTR_RAID1E 0x00000008 #define INTEL_ATTR_RAID5 0x00000010 #define INTEL_ATTR_RAIDCNG 0x00000020 #define INTEL_ATTR_EXT_STRIP 0x00000040 #define INTEL_ATTR_NVM_CACHE 0x02000000 #define INTEL_ATTR_2TB_DISK 0x04000000 #define INTEL_ATTR_BBM 0x08000000 #define INTEL_ATTR_NVM_CACHE2 0x10000000 #define INTEL_ATTR_2TB 0x20000000 #define INTEL_ATTR_PM 0x40000000 #define INTEL_ATTR_CHECKSUM 0x80000000 uint8_t total_disks; uint8_t total_volumes; uint8_t error_log_pos; uint8_t dummy_2[1]; uint32_t cache_size; uint32_t orig_config_id; uint32_t pwr_cycle_count; uint32_t bbm_log_size; uint32_t filler_0[35]; struct intel_raid_disk disk[1]; /* total_disks entries. */ /* Here goes total_volumes of struct intel_raid_vol. 
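Putting the variable-length pieces together, the expected serialized size can be sketched as below; this mirrors the accumulation that intel_meta_read() performs when validating config_size and is shown only as an illustration:

// Fixed part: the conf header already embeds disk[0] and no volumes.
size_t expect = sizeof(struct intel_raid_conf) +
    sizeof(struct intel_raid_disk) * (total_disks - 1) +
    sizeof(struct intel_raid_vol) * total_volumes;
// Then, per map: 4 * (total_disks_in_map - 1) extra index bytes,
// plus one extra struct intel_raid_map per migrating volume.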
*/ } __packed; #define INTEL_ATTR_SUPPORTED ( INTEL_ATTR_RAID0 | INTEL_ATTR_RAID1 | \ INTEL_ATTR_RAID10 | INTEL_ATTR_RAID1E | INTEL_ATTR_RAID5 | \ INTEL_ATTR_RAIDCNG | INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | \ INTEL_ATTR_2TB | INTEL_ATTR_PM | INTEL_ATTR_CHECKSUM ) #define INTEL_MAX_MD_SIZE(ndisks) \ (sizeof(struct intel_raid_conf) + \ sizeof(struct intel_raid_disk) * (ndisks - 1) + \ sizeof(struct intel_raid_vol) * 2 + \ sizeof(struct intel_raid_map) * 2 + \ sizeof(uint32_t) * (ndisks - 1) * 4) struct g_raid_md_intel_perdisk { struct intel_raid_conf *pd_meta; int pd_disk_pos; struct intel_raid_disk pd_disk_meta; }; struct g_raid_md_intel_pervolume { int pv_volume_pos; int pv_cng; int pv_cng_man_sync; int pv_cng_master_disk; }; struct g_raid_md_intel_object { struct g_raid_md_object mdio_base; uint32_t mdio_config_id; uint32_t mdio_orig_config_id; uint32_t mdio_generation; struct intel_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_intel; static g_raid_md_taste_t g_raid_md_taste_intel; static g_raid_md_event_t g_raid_md_event_intel; static g_raid_md_ctl_t g_raid_md_ctl_intel; static g_raid_md_write_t g_raid_md_write_intel; static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel; static g_raid_md_free_disk_t g_raid_md_free_disk_intel; static g_raid_md_free_volume_t g_raid_md_free_volume_intel; static g_raid_md_free_t g_raid_md_free_intel; static kobj_method_t g_raid_md_intel_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_intel), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_intel), KOBJMETHOD(g_raid_md_event, g_raid_md_event_intel), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_intel), KOBJMETHOD(g_raid_md_write, g_raid_md_write_intel), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_intel), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_intel), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_intel), KOBJMETHOD(g_raid_md_free, g_raid_md_free_intel), { 0, 0 } }; static struct g_raid_md_class g_raid_md_intel_class = { "Intel", g_raid_md_intel_methods, sizeof(struct g_raid_md_intel_object), .mdc_enable = 1, .mdc_priority = 100 }; static struct intel_raid_map * intel_get_map(struct intel_raid_vol *mvol, int i) { struct intel_raid_map *mmap; if (i > (mvol->migr_state ? 1 : 0)) return (NULL); mmap = &mvol->map[0]; for (; i > 0; i--) { mmap = (struct intel_raid_map *) &mmap->disk_idx[mmap->total_disks]; } return ((struct intel_raid_map *)mmap); } static struct intel_raid_vol * intel_get_volume(struct intel_raid_conf *meta, int i) { struct intel_raid_vol *mvol; struct intel_raid_map *mmap; if (i > 1) return (NULL); mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks]; for (; i > 0; i--) { mmap = intel_get_map(mvol, mvol->migr_state ? 
1 : 0); mvol = (struct intel_raid_vol *) &mmap->disk_idx[mmap->total_disks]; } return (mvol); } static off_t intel_get_map_offset(struct intel_raid_map *mmap) { off_t offset = (off_t)mmap->offset_hi << 32; offset += mmap->offset; return (offset); } static void intel_set_map_offset(struct intel_raid_map *mmap, off_t offset) { mmap->offset = offset & 0xffffffff; mmap->offset_hi = offset >> 32; } static off_t intel_get_map_disk_sectors(struct intel_raid_map *mmap) { off_t disk_sectors = (off_t)mmap->disk_sectors_hi << 32; disk_sectors += mmap->disk_sectors; return (disk_sectors); } static void intel_set_map_disk_sectors(struct intel_raid_map *mmap, off_t disk_sectors) { mmap->disk_sectors = disk_sectors & 0xffffffff; mmap->disk_sectors_hi = disk_sectors >> 32; } static void intel_set_map_stripe_count(struct intel_raid_map *mmap, off_t stripe_count) { mmap->stripe_count = stripe_count & 0xffffffff; mmap->stripe_count_hi = stripe_count >> 32; } static off_t intel_get_disk_sectors(struct intel_raid_disk *disk) { off_t sectors = (off_t)disk->sectors_hi << 32; sectors += disk->sectors; return (sectors); } static void intel_set_disk_sectors(struct intel_raid_disk *disk, off_t sectors) { disk->sectors = sectors & 0xffffffff; disk->sectors_hi = sectors >> 32; } static off_t intel_get_vol_curr_migr_unit(struct intel_raid_vol *vol) { off_t curr_migr_unit = (off_t)vol->curr_migr_unit_hi << 32; curr_migr_unit += vol->curr_migr_unit; return (curr_migr_unit); } static void intel_set_vol_curr_migr_unit(struct intel_raid_vol *vol, off_t curr_migr_unit) { vol->curr_migr_unit = curr_migr_unit & 0xffffffff; vol->curr_migr_unit_hi = curr_migr_unit >> 32; } static char * intel_status2str(int status) { switch (status) { case INTEL_S_READY: return ("READY"); case INTEL_S_UNINITIALIZED: return ("UNINITIALIZED"); case INTEL_S_DEGRADED: return ("DEGRADED"); case INTEL_S_FAILURE: return ("FAILURE"); default: return ("UNKNOWN"); } } static char * intel_type2str(int type) { switch (type) { case INTEL_T_RAID0: return ("RAID0"); case INTEL_T_RAID1: return ("RAID1"); case INTEL_T_RAID5: return ("RAID5"); default: return ("UNKNOWN"); } } static char * intel_cngst2str(int cng_state) { switch (cng_state) { case INTEL_CNGST_UPDATED: return ("UPDATED"); case INTEL_CNGST_NEEDS_UPDATE: return ("NEEDS_UPDATE"); case INTEL_CNGST_MASTER_MISSING: return ("MASTER_MISSING"); default: return ("UNKNOWN"); } } static char * intel_mt2str(int type) { switch (type) { case INTEL_MT_INIT: return ("INIT"); case INTEL_MT_REBUILD: return ("REBUILD"); case INTEL_MT_VERIFY: return ("VERIFY"); case INTEL_MT_GEN_MIGR: return ("GEN_MIGR"); case INTEL_MT_STATE_CHANGE: return ("STATE_CHANGE"); case INTEL_MT_REPAIR: return ("REPAIR"); default: return ("UNKNOWN"); } } static void g_raid_md_intel_print(struct intel_raid_conf *meta) { struct intel_raid_vol *mvol; struct intel_raid_map *mmap; int i, j, k; if (g_raid_debug < 1) return; printf("********* ATA Intel MatrixRAID Metadata *********\n"); printf("intel_id <%.24s>\n", meta->intel_id); printf("version <%.6s>\n", meta->version); printf("checksum 0x%08x\n", meta->checksum); printf("config_size 0x%08x\n", meta->config_size); printf("config_id 0x%08x\n", meta->config_id); printf("generation 0x%08x\n", meta->generation); printf("error_log_size %d\n", meta->error_log_size); printf("attributes 0x%b\n", meta->attributes, "\020" "\001RAID0" "\002RAID1" "\003RAID10" "\004RAID1E" "\005RAID15" "\006RAIDCNG" "\007EXT_STRIP" "\032NVM_CACHE" "\0332TB_DISK" "\034BBM" "\035NVM_CACHE" "\0362TB" "\037PM" "\040CHECKSUM"); 
printf("total_disks %u\n", meta->total_disks); printf("total_volumes %u\n", meta->total_volumes); printf("error_log_pos %u\n", meta->error_log_pos); printf("cache_size %u\n", meta->cache_size); printf("orig_config_id 0x%08x\n", meta->orig_config_id); printf("pwr_cycle_count %u\n", meta->pwr_cycle_count); printf("bbm_log_size %u\n", meta->bbm_log_size); printf("Flags: S - Spare, A - Assigned, F - Failed, O - Online, D - Disabled\n"); printf("DISK# serial disk_sectors disk_sectors_hi disk_id flags owner\n"); for (i = 0; i < meta->total_disks; i++ ) { printf(" %d <%.16s> %u %u 0x%08x 0x%b %08x\n", i, meta->disk[i].serial, meta->disk[i].sectors, meta->disk[i].sectors_hi, meta->disk[i].id, meta->disk[i].flags, "\20\01S\02A\03F\04O\05D", meta->disk[i].owner_cfg_num); } for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); printf(" ****** Volume %d ******\n", i); printf(" name %.16s\n", mvol->name); printf(" total_sectors %ju\n", mvol->total_sectors); printf(" state 0x%b\n", mvol->state, "\020" "\001BOOTABLE" "\002BOOT_DEVICE" "\003READ_COALESCING" "\004WRITE_COALESCING" "\005LAST_SHUTDOWN_DIRTY" "\006HIDDEN_AT_BOOT" "\007CURRENTLY_HIDDEN" "\010VERIFY_AND_FIX" "\011MAP_STATE_UNINIT" "\012NO_AUTO_RECOVERY" "\013CLONE_N_GO" "\014CLONE_MAN_SYNC" "\015CNG_MASTER_DISK_NUM"); printf(" reserved %u\n", mvol->reserved); printf(" migr_priority %u\n", mvol->migr_priority); printf(" num_sub_vols %u\n", mvol->num_sub_vols); printf(" tid %u\n", mvol->tid); printf(" cng_master_disk %u\n", mvol->cng_master_disk); printf(" cache_policy %u\n", mvol->cache_policy); printf(" cng_state %u (%s)\n", mvol->cng_state, intel_cngst2str(mvol->cng_state)); printf(" cng_sub_state %u\n", mvol->cng_sub_state); printf(" curr_migr_unit %u\n", mvol->curr_migr_unit); printf(" curr_migr_unit_hi %u\n", mvol->curr_migr_unit_hi); printf(" checkpoint_id %u\n", mvol->checkpoint_id); printf(" migr_state %u\n", mvol->migr_state); printf(" migr_type %u (%s)\n", mvol->migr_type, intel_mt2str(mvol->migr_type)); printf(" dirty %u\n", mvol->dirty); printf(" fs_state %u\n", mvol->fs_state); printf(" verify_errors %u\n", mvol->verify_errors); printf(" bad_blocks %u\n", mvol->bad_blocks); for (j = 0; j < (mvol->migr_state ? 
2 : 1); j++) { printf(" *** Map %d ***\n", j); mmap = intel_get_map(mvol, j); printf(" offset %u\n", mmap->offset); printf(" offset_hi %u\n", mmap->offset_hi); printf(" disk_sectors %u\n", mmap->disk_sectors); printf(" disk_sectors_hi %u\n", mmap->disk_sectors_hi); printf(" stripe_count %u\n", mmap->stripe_count); printf(" stripe_count_hi %u\n", mmap->stripe_count_hi); printf(" strip_sectors %u\n", mmap->strip_sectors); printf(" status %u (%s)\n", mmap->status, intel_status2str(mmap->status)); printf(" type %u (%s)\n", mmap->type, intel_type2str(mmap->type)); printf(" total_disks %u\n", mmap->total_disks); printf(" total_domains %u\n", mmap->total_domains); printf(" failed_disk_num %u\n", mmap->failed_disk_num); printf(" ddf %u\n", mmap->ddf); printf(" disk_idx "); for (k = 0; k < mmap->total_disks; k++) printf(" 0x%08x", mmap->disk_idx[k]); printf("\n"); } } printf("=================================================\n"); } static struct intel_raid_conf * intel_meta_copy(struct intel_raid_conf *meta) { struct intel_raid_conf *nmeta; nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK); memcpy(nmeta, meta, meta->config_size); return (nmeta); } static int intel_meta_find_disk(struct intel_raid_conf *meta, char *serial) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (strncmp(meta->disk[pos].serial, serial, INTEL_SERIAL_LEN) == 0) return (pos); } return (-1); } static struct intel_raid_conf * intel_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap, *mmap1; char *buf; int error, i, j, k, left, size; uint32_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct intel_raid_conf *)buf; /* Check if this is an Intel RAID struct */ if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) { G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name); g_free(buf); return (NULL); } if (meta->config_size > 65536 || meta->config_size < sizeof(struct intel_raid_conf)) { G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d", meta->config_size); g_free(buf); return (NULL); } size = meta->config_size; meta = malloc(size, M_MD_INTEL, M_WAITOK); memcpy(meta, buf, min(size, pp->sectorsize)); g_free(buf); /* Read all the rest, if needed. */ if (meta->config_size > pp->sectorsize) { left = (meta->config_size - 1) / pp->sectorsize; buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (2 + left), pp->sectorsize * left, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read remaining metadata" " part from %s (error=%d).", pp->name, error); free(meta, M_MD_INTEL); return (NULL); } memcpy(((char *)meta) + pp->sectorsize, buf, pp->sectorsize * left); g_free(buf); } /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } checksum -= meta->checksum; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name); free(meta, M_MD_INTEL); return (NULL); } /* Validate metadata size. 
*/ size = sizeof(struct intel_raid_conf) + sizeof(struct intel_raid_disk) * (meta->total_disks - 1) + sizeof(struct intel_raid_vol) * meta->total_volumes; if (size > meta->config_size) { badsize: G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d", meta->config_size, size); free(meta, M_MD_INTEL); return (NULL); } for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); size += 4 * (mmap->total_disks - 1); if (size > meta->config_size) goto badsize; if (mvol->migr_state) { size += sizeof(struct intel_raid_map); if (size > meta->config_size) goto badsize; mmap = intel_get_map(mvol, 1); size += 4 * (mmap->total_disks - 1); if (size > meta->config_size) goto badsize; } } g_raid_md_intel_print(meta); if (strncmp(meta->version, INTEL_VERSION_1300, 6) > 0) { G_RAID_DEBUG(1, "Intel unsupported version: '%.6s'", meta->version); free(meta, M_MD_INTEL); return (NULL); } if (strncmp(meta->version, INTEL_VERSION_1300, 6) >= 0 && (meta->attributes & ~INTEL_ATTR_SUPPORTED) != 0) { G_RAID_DEBUG(1, "Intel unsupported attributes: 0x%08x", meta->attributes & ~INTEL_ATTR_SUPPORTED); free(meta, M_MD_INTEL); return (NULL); } /* Validate disk indexes. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) { mmap = intel_get_map(mvol, j); for (k = 0; k < mmap->total_disks; k++) { if ((mmap->disk_idx[k] & INTEL_DI_IDX) > meta->total_disks) { G_RAID_DEBUG(1, "Intel metadata disk" " index %d too big (>%d)", mmap->disk_idx[k] & INTEL_DI_IDX, meta->total_disks); free(meta, M_MD_INTEL); return (NULL); } } } } /* Validate migration types. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); /* Deny unknown migration types. */ if (mvol->migr_state && mvol->migr_type != INTEL_MT_INIT && mvol->migr_type != INTEL_MT_REBUILD && mvol->migr_type != INTEL_MT_VERIFY && mvol->migr_type != INTEL_MT_GEN_MIGR && mvol->migr_type != INTEL_MT_REPAIR) { G_RAID_DEBUG(1, "Intel metadata has unsupported" " migration type %d", mvol->migr_type); free(meta, M_MD_INTEL); return (NULL); } /* Deny general migrations except SINGLE->RAID1. */ if (mvol->migr_state && mvol->migr_type == INTEL_MT_GEN_MIGR) { mmap = intel_get_map(mvol, 0); mmap1 = intel_get_map(mvol, 1); if (mmap1->total_disks != 1 || mmap->type != INTEL_T_RAID1 || mmap->total_disks != 2 || mmap->offset != mmap1->offset || mmap->disk_sectors != mmap1->disk_sectors || mmap->total_domains != mmap->total_disks || mmap->offset_hi != mmap1->offset_hi || mmap->disk_sectors_hi != mmap1->disk_sectors_hi || (mmap->disk_idx[0] != mmap1->disk_idx[0] && mmap->disk_idx[0] != mmap1->disk_idx[1])) { G_RAID_DEBUG(1, "Intel metadata has unsupported" " variant of general migration"); free(meta, M_MD_INTEL); return (NULL); } } } return (meta); } static int intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i, sectors; uint32_t checksum, *ptr; pp = cp->provider; /* Recalculate the checksum in case the metadata has changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } meta->checksum = checksum; /* Create and fill buffer. 
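The buffer assembled below stores the metadata "bottom up" so that the anchor block lands in the second-to-last sector of the provider. A worked example with assumed numbers (512-byte sectors, config_size = 1300):

// sectors = howmany(1300, 512) = 3
// buf[0 .. 1023]    = meta bytes 512 .. 1299 (tail, zero padded)
// buf[1024 .. 1535] = meta bytes 0 .. 511 (the anchor block)
// write offset = mediasize - 512 * (1 + 3)
// so the anchor ends up at mediasize - 2 * sectorsize, exactly where
// intel_meta_read() above goes looking for it.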
*/ sectors = howmany(meta->config_size, pp->sectorsize); buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); if (sectors > 1) { memcpy(buf, ((char *)meta) + pp->sectorsize, (sectors - 1) * pp->sectorsize); } memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize); error = g_write_data(cp, pp->mediasize - pp->sectorsize * (1 + sectors), buf, pp->sectorsize * sectors); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d) { struct intel_raid_conf *meta; int error; /* Fill anchor and single disk. */ meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO); memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1); memcpy(&meta->version[0], INTEL_VERSION_1000, sizeof(INTEL_VERSION_1000) - 1); meta->config_size = INTEL_MAX_MD_SIZE(1); meta->config_id = meta->orig_config_id = arc4random(); meta->generation = 1; meta->total_disks = 1; meta->disk[0] = *d; error = intel_meta_write(cp, meta); free(meta, M_MD_INTEL); return (error); } static struct g_raid_disk * g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_intel_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_intel_supported(int level, int qual, int disks, int force) { switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (!force && disks > 6) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static struct g_raid_volume * g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id) { struct g_raid_volume *mvol; struct g_raid_md_intel_pervolume *pv; TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) { pv = mvol->v_md_data; if (pv->pv_volume_pos == id) break; } return (mvol); } static int g_raid_md_intel_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd, *oldpd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap0, *mmap1; int disk_pos, resurrection = 0, migr_global, i; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. 
*/ disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* Failed stale disk is useless for us. */ if ((pd->pd_disk_meta.flags & INTEL_F_FAILED) && !(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { off_t disk_sectors = intel_get_disk_sectors(&pd->pd_disk_meta); if (sd->sd_offset + sd->sd_size + 4096 > disk_sectors * 512) { G_RAID_DEBUG1(1, sc, "Disk too small (%llu < %llu)", (unsigned long long) disk_sectors * 512, (unsigned long long) sd->sd_offset + sd->sd_size + 4096); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (pd->pd_disk_meta.flags & INTEL_F_SPARE) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_intel_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); /* Update global metadata just in case. */ memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta, sizeof(struct intel_raid_disk)); } /* Welcome the new disk. 
*/ if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) && !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) g_raid_change_disk_state(disk, G_RAID_DISK_S_DISABLED); else if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else if (meta->disk[disk_pos].flags & INTEL_F_SPARE) g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { pv = sd->sd_volume->v_md_data; mvol = intel_get_volume(meta, pv->pv_volume_pos); mmap0 = intel_get_map(mvol, 0); if (mvol->migr_state) mmap1 = intel_get_map(mvol, 1); else mmap1 = mmap0; migr_global = 1; for (i = 0; i < mmap0->total_disks; i++) { if ((mmap0->disk_idx[i] & INTEL_DI_RBLD) == 0 && (mmap1->disk_idx[i] & INTEL_DI_RBLD) != 0) migr_global = 0; } if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) && !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) { /* Disabled disk, useless. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); } else if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) { /* Failed disk, almost useless. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (mvol->migr_state == 0) { if (mmap0->status == INTEL_S_UNINITIALIZED && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Freshly created uninitialized volume. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); } else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mvol->dirty && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_INIT || mvol->migr_type == INTEL_MT_REBUILD) { if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (mvol->dirty) { sd->sd_rebuild_pos = 0; } else { sd->sd_rebuild_pos = intel_get_vol_curr_migr_unit(mvol) * sd->sd_volume->v_strip_size * mmap0->total_domains; } } else if (mvol->migr_type == INTEL_MT_INIT && migr_global) { /* Freshly created uninitialized volume. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); } else if (mvol->dirty && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_VERIFY || mvol->migr_type == INTEL_MT_REPAIR) { if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if ((mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) || migr_global) { /* Resyncing disk. 
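In both the rebuild and resync branches the saved checkpoint is converted from Intel's migration units back into a byte offset. One migration unit covers one strip across every domain of the map, so the conversion is just a product (sketch of the arithmetic used below):

	// checkpoint bytes = units * strip bytes * domains
	static off_t
	migr_unit_to_bytes(uint64_t units, uint64_t strip, uint32_t domains)
	{
		return ((off_t)(units * strip * domains));
	}

while a dirty volume resets the checkpoint to 0 so the pass covers the whole set.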
*/ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); if (mvol->dirty) { sd->sd_rebuild_pos = 0; } else { sd->sd_rebuild_pos = intel_get_vol_curr_migr_unit(mvol) * sd->sd_volume->v_strip_size * mmap0->total_domains; } } else if (mvol->dirty) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_GEN_MIGR) { if ((mmap1->disk_idx[0] & INTEL_DI_IDX) != disk_pos) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks); } return (resurrection); } static void g_disk_md_intel_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_INTEL); } static void g_raid_md_intel_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *meta; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; update = 0; do { /* Make sure we don't miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED); if (na == meta->total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, meta->total_disks); /* Try to make use of some STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE || disk->d_state == G_RAID_DISK_S_DISABLED) break; } } if (disk != NULL) continue; /* Try to make use of some SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) { g_raid_md_write_intel(md, NULL, NULL, NULL); meta = mdi->mdio_meta; } /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks); /* Request retaste hoping to find a spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_INTEL, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_intel_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_intel_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; int i, j, disk_pos; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks.
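Each Intel volume record carries one or two maps, and it is the map that encodes the RAID personality. The RAID1 vs RAID1E split below hinges on total_domains: with as many domains as disks every disk is a full mirror (plain RAID1), while two domains spread mirrored pairs across the set (RAID1E). In sketch form:

	// total_domains == total_disks means plain mirroring
	level = (mmap->total_domains == mmap->total_disks) ?
	    G_RAID_VOLUME_RL_RAID1 : G_RAID_VOLUME_RL_RAID1E;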
*/ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); vol = g_raid_create_volume(sc, mvol->name, mvol->tid - 1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = i; pv->pv_cng = (mvol->state & INTEL_ST_CLONE_N_GO) != 0; pv->pv_cng_man_sync = (mvol->state & INTEL_ST_CLONE_MAN_SYNC) != 0; if (mvol->cng_master_disk < mmap->total_disks) pv->pv_cng_master_disk = mvol->cng_master_disk; vol->v_md_data = pv; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (mmap->type == INTEL_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (mmap->type == INTEL_T_RAID1 && mmap->total_domains >= 2 && mmap->total_domains <= mmap->total_disks) { /* Assume total_domains is correct. */ if (mmap->total_domains == mmap->total_disks) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (mmap->type == INTEL_T_RAID1) { /* total_domains looks wrong. */ if (mmap->total_disks <= 2) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (mmap->type == INTEL_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; } else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ vol->v_disks_count = mmap->total_disks; vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = intel_get_map_offset(mmap) * 512; //ZZZ sd->sd_size = intel_get_map_disk_sectors(mmap) * 512; //ZZZ } g_raid_start_volume(vol); } /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; pd->pd_disk_meta = meta->disk[disk_pos]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); for (j = 0; j < mmap->total_disks; j++) { if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos) break; } if (j == mmap->total_disks) continue; vol = g_raid_md_intel_get_volume(sc, i); sd = &vol->v_subdisks[j]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_intel_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. 
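g_raid_md_intel_refill() above uses the restart-after-mutation idiom: g_raid_md_intel_start_disk() can destroy placeholders and re-link subdisks, invalidating a TAILQ iteration in progress, so each hit breaks out and rescans until a full pass finds nothing left to do. The generic shape of the idiom (a sketch, not this file's code):

	struct item *it;

	do {
		TAILQ_FOREACH(it, &head, link) {
			if (needs_work(it)) {
				handle(it);	// may mutate the list
				break;		// so restart the scan
			}
		}
	} while (it != NULL);		// NULL: one clean full pass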
*/ g_raid_md_intel_refill(sc); TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_intel_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *pdmeta; struct g_raid_md_intel_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_intel_start_disk(disk)) g_raid_md_write_intel(md, NULL, NULL, NULL); } else { /* If we haven't started yet - check metadata freshness. */ if (mdi->mdio_meta == NULL || ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = intel_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_meta->total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks) g_raid_md_intel_start(sc); } } static void g_raid_intel_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; char name[16]; mdi = (struct g_raid_md_intel_object *)md; mdi->mdio_config_id = mdi->mdio_orig_config_id = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } /* * Return the last N characters of the serial label. The Linux and * ataraid(7) code always uses the last 16 characters of the label to * store into the Intel meta format. Generalize this to N characters * since that's easy. Labels can be up to 20 characters for SATA drives * and up to 251 characters for SAS drives. Since Intel controllers don't * support SAS drives, just stick with the SATA limits for stack friendliness.
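For example, a hypothetical 20-character SATA label "Z1X2C3V4B5N6M7A8S9D0" keeps only its last 16 characters, "C3V4B5N6M7A8S9D0", when stored into the metadata. The tail-slice arithmetic matches the function below:

	len = strlen(label);
	off = (len > n) ? len - n : 0;	// keep at most the last n chars
	strncpy(dst, label + off, n);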
*/ static int g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen) { char serial_buffer[24]; int len, error; len = sizeof(serial_buffer); error = g_io_getattr("GEOM::ident", cp, &len, serial_buffer); if (error != 0) return (error); len = strlen(serial_buffer); if (len > serlen) len -= serlen; else len = 0; strncpy(serial, serial_buffer + len, serlen); return (0); } static int g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_intel_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct intel_raid_conf *meta; struct g_raid_md_intel_perdisk *pd; struct g_geom *geom; int error, disk_pos, result, spare, len; char serial[INTEL_SERIAL_LEN]; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name); mdi = (struct g_raid_md_intel_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; disk_pos = 0; g_topology_unlock(); error = g_raid_md_get_label(cp, serial, sizeof(serial)); if (error != 0) { G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).", pp->name, error); goto fail2; } vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = intel_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor != 0x8086) { G_RAID_DEBUG(1, "Intel vendor mismatch 0x%04x != 0x8086", vendor); } else { G_RAID_DEBUG(1, "No Intel metadata, forcing spare."); spare = 2; goto search; } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = intel_meta_find_disk(meta, serial); if (disk_pos < 0) { G_RAID_DEBUG(1, "Intel serial '%s' not found", serial); goto fail1; } if (intel_get_disk_sectors(&meta->disk[disk_pos]) != (pp->mediasize / pp->sectorsize)) { G_RAID_DEBUG(1, "Intel size mismatch %ju != %ju", intel_get_disk_sectors(&meta->disk[disk_pos]), (off_t)(pp->mediasize / pp->sectorsize)); goto fail1; } G_RAID_DEBUG(1, "Intel disk position %d", disk_pos); spare = meta->disk[disk_pos].flags & INTEL_F_SPARE; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_intel_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_config_id == meta->config_id) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_config_id = meta->config_id; mdi->mdio_orig_config_id = meta->orig_config_id; snprintf(name, sizeof(name), "Intel-%08x", meta->config_id); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_intel_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-Intel"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. 
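The tasting consumer arrives holding one read reference, which the g_access(cp, -1, 0, 0) below releases; from here the array geom owns the provider through a fresh consumer opened r1w1e1 so metadata can be rewritten at any time. g_access() takes signed deltas against the read/write/exclusive counters, so every open must eventually be paired (sketch of the convention):

	g_access(cp, 1, 1, 1);		// +r +w +e, open for meta updates
	// ... use the provider ...
	g_access(cp, -1, -1, -1);	// matching release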
*/ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_meta = meta; pd->pd_disk_pos = -1; if (spare == 2) { memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; } else { pd->pd_disk_meta = meta->disk[disk_pos]; } disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_intel_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail2: g_topology_lock(); fail1: free(meta, M_MD_INTEL); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_intel(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_intel_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Check if anything left except placeholders. 
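Here g_raid_ndisks(sc, -1) counts every disk object while the OFFLINE count covers exactly the placeholders kept for absent positions, so equality means no real disk remains. As a standalone predicate (sketch):

	// true when only OFFLINE placeholders remain on the node
	static int
	only_placeholders(struct g_raid_softc *sc)
	{
		return (g_raid_ndisks(sc, -1) ==
		    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE));
	}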
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_intel(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16], serial[INTEL_SERIAL_LEN]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t off, size, sectorsize, strip, disk_sectors; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_intel_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) { strcpy(&pd->pd_disk_meta.serial[0], "NONE"); pd->pd_disk_meta.id = 0xffffffff; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; continue; } cp->private = disk; g_topology_unlock(); error = g_raid_md_get_label(cp, &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); error = -8; break; } g_raid_get_disk_info(disk); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Handle size argument. 
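The reservation just above rounds 4096 bytes up to a whole number of sectors before subtracting, the classic round-up-to-multiple idiom:

	// smallest multiple of sectorsize that covers 4096 bytes
	reserved = ((4096 + sectorsize - 1) / sectorsize) * sectorsize;

so 512-byte-sector disks lose exactly eight sectors and 4KiB-sector disks lose one. Only after that reserve is taken is a user-requested size checked against what actually remains.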
*/ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (*striparg > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... */ mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = 0; vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (level == G_RAID_VOLUME_RL_RAID5) g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); else g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_intel_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { if (*nargs != 3) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } /* Look for existing volumes.
*/ i = 0; vol1 = NULL; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { vol1 = vol; i++; } if (i > 1) { gctl_error(req, "Maximum two volumes supported."); return (-6); } if (vol1 == NULL) { gctl_error(req, "At least one volume must exist."); return (-7); } numdisks = vol1->v_disks_count; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_intel_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Collect info about present disks. */ size = 0x7fffffffffffffffllu; sectorsize = 512; for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; pd = (struct g_raid_md_intel_perdisk *) disk->d_md_data; disk_sectors = intel_get_disk_sectors(&pd->pd_disk_meta); if (disk_sectors * 512 < size) size = disk_sectors * 512; if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && disk->d_consumer->provider->sectorsize > sectorsize) { sectorsize = disk->d_consumer->provider->sectorsize; } } /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Decide insert before or after. */ sd = &vol1->v_subdisks[0]; if (sd->sd_offset > size - (sd->sd_offset + sd->sd_size)) { off = 0; size = sd->sd_offset; } else { off = sd->sd_offset + sd->sd_size; size = size - (sd->sd_offset + sd->sd_size); } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (*striparg > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round offset up to strip. */ if (off % strip != 0) { size -= strip - off % strip; off += strip - off % strip; } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ vol = g_raid_create_volume(sc, volname, -1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = i; vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks.
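All subdisks of the second volume reuse the off/size pair computed above: the new volume lands in whichever gap around the existing volume is larger. With purely illustrative numbers, given 1000000-sector disks and an existing volume at offset 0 covering 600000 sectors, the tail gap wins, so the new offset is 600000 sectors and the size is the remainder less the metadata reserve, rounded to the strip. The gap choice as a sketch:

	head_gap = sd->sd_offset;
	tail_gap = total - (sd->sd_offset + sd->sd_size);
	off  = (head_gap > tail_gap) ? 0 : sd->sd_offset + sd->sd_size;
	size = (head_gap > tail_gap) ? head_gap : tail_gap;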
*/ for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = off; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (disk->d_state == G_RAID_DISK_S_ACTIVE) { if (level == G_RAID_VOLUME_RL_RAID5) g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); else g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } } /* Write metadata based on created entities. */ g_raid_md_write_intel(md, NULL, NULL, NULL); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. */ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_write_intel(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_intel(md, NULL, disk); continue; } pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. 
*/ intel_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); /* Read disk serial. */ error = g_raid_md_get_label(cp, &serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); g_raid_kill_consumer(sc, cp); error = -7; break; } pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = -1; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); memcpy(&pd->pd_disk_meta.serial[0], &serial[0], INTEL_SERIAL_LEN); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; /* Welcome the "new" disk. */ update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { intel_meta_write_spare(cp, &pd->pd_disk_meta); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_intel(md, NULL, NULL, NULL); return (error); } return (-100); } static int g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap0, *mmap1; off_t sectorsize = 512, pos; const char *version, *cv; int vi, sdi, numdisks, len, state, stale; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* Count number of disks. 
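Only disks holding a real table position (pd_disk_pos >= 0) are counted; spares and stale disks live outside the table. Note the treatment of a vanished disk below: its id becomes 0xffffffff and its serial gets a ":0" suffix, which appears to be how the vendor format marks a missing member while keeping its slot reserved. The marking, in sketch form (d being the hypothetical per-disk record):

	d->id = 0xffffffff;
	len = min(strlen(d->serial), INTEL_SERIAL_LEN - 3);
	strcpy(d->serial + len, ":0");	// truncate and tag as missing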
*/ numdisks = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; numdisks++; if (disk->d_state == G_RAID_DISK_S_ACTIVE) { pd->pd_disk_meta.flags = INTEL_F_ONLINE | INTEL_F_ASSIGNED; } else if (disk->d_state == G_RAID_DISK_S_FAILED) { pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED; } else if (disk->d_state == G_RAID_DISK_S_DISABLED) { pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED | INTEL_F_DISABLED; } else { if (!(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; if (pd->pd_disk_meta.id != 0xffffffff) { pd->pd_disk_meta.id = 0xffffffff; len = strlen(pd->pd_disk_meta.serial); len = min(len, INTEL_SERIAL_LEN - 3); strcpy(pd->pd_disk_meta.serial + len, ":0"); } } } /* Fill anchor and disks. */ meta = malloc(INTEL_MAX_MD_SIZE(numdisks), M_MD_INTEL, M_WAITOK | M_ZERO); memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1); meta->config_size = INTEL_MAX_MD_SIZE(numdisks); meta->config_id = mdi->mdio_config_id; meta->orig_config_id = mdi->mdio_orig_config_id; meta->generation = mdi->mdio_generation; meta->attributes = INTEL_ATTR_CHECKSUM; meta->total_disks = numdisks; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta; if (pd->pd_disk_meta.sectors_hi != 0) meta->attributes |= INTEL_ATTR_2TB_DISK; } /* Fill volumes and maps. */ vi = 0; version = INTEL_VERSION_1000; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (vol->v_stopping) continue; mvol = intel_get_volume(meta, vi); /* New metadata may have different volumes order. */ pv->pv_volume_pos = vi; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_disk != NULL) break; } if (sdi >= vol->v_disks_count) panic("No any filled subdisk in volume"); if (vol->v_mediasize >= 0x20000000000llu) meta->attributes |= INTEL_ATTR_2TB; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->attributes |= INTEL_ATTR_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->attributes |= INTEL_ATTR_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->attributes |= INTEL_ATTR_RAID5; else if ((vol->v_disks_count & 1) == 0) meta->attributes |= INTEL_ATTR_RAID10; else meta->attributes |= INTEL_ATTR_RAID1E; if (pv->pv_cng) meta->attributes |= INTEL_ATTR_RAIDCNG; if (vol->v_strip_size > 131072) meta->attributes |= INTEL_ATTR_EXT_STRIP; if (pv->pv_cng) cv = INTEL_VERSION_1206; else if (vol->v_disks_count > 4) cv = INTEL_VERSION_1204; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) cv = INTEL_VERSION_1202; else if (vol->v_disks_count > 2) cv = INTEL_VERSION_1201; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) cv = INTEL_VERSION_1100; else cv = INTEL_VERSION_1000; if (strcmp(cv, version) > 0) version = cv; strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name)); mvol->total_sectors = vol->v_mediasize / sectorsize; mvol->state = (INTEL_ST_READ_COALESCING | INTEL_ST_WRITE_COALESCING); mvol->tid = vol->v_global_id + 1; if (pv->pv_cng) { mvol->state |= INTEL_ST_CLONE_N_GO; if (pv->pv_cng_man_sync) mvol->state |= INTEL_ST_CLONE_MAN_SYNC; mvol->cng_master_disk = pv->pv_cng_master_disk; if (vol->v_subdisks[pv->pv_cng_master_disk].sd_state == G_RAID_SUBDISK_S_NONE) mvol->cng_state = INTEL_CNGST_MASTER_MISSING; else if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) mvol->cng_state = INTEL_CNGST_NEEDS_UPDATE; 
else mvol->cng_state = INTEL_CNGST_UPDATED; } /* Check for any recovery in progress. */ state = G_RAID_SUBDISK_S_ACTIVE; pos = 0x7fffffffffffffffllu; stale = 0; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_REBUILD; else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC && state != G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_RESYNC; else if (sd->sd_state == G_RAID_SUBDISK_S_STALE) stale = 1; if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos < pos) pos = sd->sd_rebuild_pos; } if (state == G_RAID_SUBDISK_S_REBUILD) { mvol->migr_state = 1; mvol->migr_type = INTEL_MT_REBUILD; } else if (state == G_RAID_SUBDISK_S_RESYNC) { mvol->migr_state = 1; /* mvol->migr_type = INTEL_MT_REPAIR; */ mvol->migr_type = INTEL_MT_VERIFY; mvol->state |= INTEL_ST_VERIFY_AND_FIX; } else mvol->migr_state = 0; mvol->dirty = (vol->v_dirty || stale); mmap0 = intel_get_map(mvol, 0); /* Write map / common part of two maps. */ intel_set_map_offset(mmap0, sd->sd_offset / sectorsize); intel_set_map_disk_sectors(mmap0, sd->sd_size / sectorsize); mmap0->strip_sectors = vol->v_strip_size / sectorsize; if (vol->v_state == G_RAID_VOLUME_S_BROKEN) mmap0->status = INTEL_S_FAILURE; else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED) mmap0->status = INTEL_S_DEGRADED; else if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) == g_raid_nsubdisks(vol, -1)) mmap0->status = INTEL_S_UNINITIALIZED; else mmap0->status = INTEL_S_READY; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) mmap0->type = INTEL_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) mmap0->type = INTEL_T_RAID1; else mmap0->type = INTEL_T_RAID5; mmap0->total_disks = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) mmap0->total_domains = vol->v_disks_count; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) mmap0->total_domains = 2; else mmap0->total_domains = 1; intel_set_map_stripe_count(mmap0, sd->sd_size / vol->v_strip_size / mmap0->total_domains); mmap0->failed_disk_num = 0xff; mmap0->ddf = 1; /* If there are two maps - copy common and update. */ if (mvol->migr_state) { intel_set_vol_curr_migr_unit(mvol, pos / vol->v_strip_size / mmap0->total_domains); mmap1 = intel_get_map(mvol, 1); memcpy(mmap1, mmap0, sizeof(struct intel_raid_map)); mmap0->status = INTEL_S_READY; } else mmap1 = NULL; /* Write disk indexes and put rebuild flags. 
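Each map entry packs the member's table position and status into one word: the low bits (INTEL_DI_IDX) carry the index, the INTEL_DI_RBLD bit marks a disk being rebuilt onto. Reading an entry back is a pair of masks (sketch):

	pos  = mmap0->disk_idx[i] & INTEL_DI_IDX;	// table position
	rbld = mmap0->disk_idx[i] & INTEL_DI_RBLD;	// rebuild target?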
*/ for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; pd = (struct g_raid_md_intel_perdisk *) sd->sd_disk->d_md_data; mmap0->disk_idx[sdi] = pd->pd_disk_pos; if (mvol->migr_state) mmap1->disk_idx[sdi] = pd->pd_disk_pos; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && sd->sd_state != G_RAID_SUBDISK_S_STALE && sd->sd_state != G_RAID_SUBDISK_S_UNINITIALIZED) { mmap0->disk_idx[sdi] |= INTEL_DI_RBLD; if (mvol->migr_state) mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } if ((sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED || sd->sd_state == G_RAID_SUBDISK_S_REBUILD) && mmap0->failed_disk_num == 0xff) { mmap0->failed_disk_num = sdi; if (mvol->migr_state) mmap1->failed_disk_num = sdi; } } vi++; } meta->total_volumes = vi; if (vi > 1 || meta->attributes & (INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | INTEL_ATTR_2TB)) version = INTEL_VERSION_1300; if (strcmp(version, INTEL_VERSION_1300) < 0) meta->attributes &= INTEL_ATTR_CHECKSUM; memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000) - 1); /* We are done. Print the metadata and store it to disks. */ g_raid_md_intel_print(meta); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_INTEL); pd->pd_meta = NULL; } pd->pd_meta = intel_meta_copy(meta); intel_meta_write(disk->d_consumer, meta); } return (0); } static int g_raid_md_fail_disk_intel(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data; /* We can't fail a disk that is not part of the array now. */ if (pd->pd_disk_pos < 0) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent its later resurrection as STALE. */ mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED; pd->pd_disk_meta.flags = INTEL_F_FAILED; g_raid_md_intel_print(mdi->mdio_meta); if (tdisk->d_consumer) intel_meta_write(tdisk->d_consumer, mdi->mdio_meta); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_intel(md, NULL, NULL, tdisk); /* Check if anything left except placeholders.
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (0); } static int g_raid_md_free_disk_intel(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_intel_perdisk *pd; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_INTEL); pd->pd_meta = NULL; } free(pd, M_MD_INTEL); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_intel(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_intel_pervolume *pv; pv = (struct g_raid_md_intel_pervolume *)vol->v_md_data; free(pv, M_MD_INTEL); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_intel(struct g_raid_md_object *md) { struct g_raid_md_intel_object *mdi; mdi = (struct g_raid_md_intel_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(intel, "Intel"); Index: head/sys/geom/raid/md_jmicron.c =================================================================== --- head/sys/geom/raid/md_jmicron.c (revision 326269) +++ head/sys/geom/raid/md_jmicron.c (revision 326270) @@ -1,1563 +1,1565 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_JMICRON, "md_jmicron_data", "GEOM_RAID JMicron metadata"); #define JMICRON_MAX_DISKS 8 #define JMICRON_MAX_SPARE 2 struct jmicron_raid_conf { u_int8_t signature[2]; #define JMICRON_MAGIC "JM" u_int16_t version; #define JMICRON_VERSION 0x0001 u_int16_t checksum; u_int8_t filler_1[10]; u_int32_t disk_id; u_int32_t offset; u_int32_t disk_sectors_high; u_int16_t disk_sectors_low; u_int8_t filler_2[2]; u_int8_t name[16]; u_int8_t type; #define JMICRON_T_RAID0 0 #define JMICRON_T_RAID1 1 #define JMICRON_T_RAID01 2 #define JMICRON_T_CONCAT 3 #define JMICRON_T_RAID5 5 u_int8_t stripe_shift; u_int16_t flags; #define JMICRON_F_READY 0x0001 #define JMICRON_F_BOOTABLE 0x0002 #define JMICRON_F_BADSEC 0x0004 #define JMICRON_F_ACTIVE 0x0010 #define JMICRON_F_UNSYNC 0x0020 #define JMICRON_F_NEWEST 0x0040 u_int8_t filler_3[4]; u_int32_t spare[JMICRON_MAX_SPARE]; u_int32_t disks[JMICRON_MAX_DISKS]; #define JMICRON_DISK_MASK 0xFFFFFFF0 #define JMICRON_SEG_MASK 0x0000000F u_int8_t filler_4[32]; u_int8_t filler_5[384]; }; struct g_raid_md_jmicron_perdisk { struct jmicron_raid_conf *pd_meta; int pd_disk_pos; int pd_disk_id; off_t pd_disk_size; }; struct g_raid_md_jmicron_object { struct g_raid_md_object mdio_base; uint32_t mdio_config_id; struct jmicron_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_jmicron; static g_raid_md_taste_t g_raid_md_taste_jmicron; static g_raid_md_event_t g_raid_md_event_jmicron; static g_raid_md_ctl_t g_raid_md_ctl_jmicron; static g_raid_md_write_t g_raid_md_write_jmicron; static g_raid_md_fail_disk_t g_raid_md_fail_disk_jmicron; static g_raid_md_free_disk_t g_raid_md_free_disk_jmicron; static g_raid_md_free_t g_raid_md_free_jmicron; static kobj_method_t g_raid_md_jmicron_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_jmicron), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_jmicron), KOBJMETHOD(g_raid_md_event, g_raid_md_event_jmicron), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_jmicron), KOBJMETHOD(g_raid_md_write, g_raid_md_write_jmicron), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_jmicron), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_jmicron), KOBJMETHOD(g_raid_md_free, g_raid_md_free_jmicron), { 0, 0 } }; static struct g_raid_md_class g_raid_md_jmicron_class = { "JMicron", g_raid_md_jmicron_methods, sizeof(struct g_raid_md_jmicron_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_jmicron_print(struct jmicron_raid_conf *meta) { int k; if (g_raid_debug < 1) return; printf("********* ATA JMicron RAID Metadata *********\n"); printf("signature <%c%c>\n", meta->signature[0], meta->signature[1]); printf("version %04x\n", meta->version); printf("checksum 0x%04x\n", meta->checksum); printf("disk_id 0x%08x\n", meta->disk_id); printf("offset 0x%08x\n", meta->offset); printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); printf("disk_sectors_low 0x%04x\n", meta->disk_sectors_low); printf("name <%.16s>\n", meta->name); printf("type %d\n", meta->type); printf("stripe_shift %d\n", meta->stripe_shift); printf("flags %04x\n", meta->flags); printf("spare "); for (k = 0; k < JMICRON_MAX_SPARE; 
k++) printf(" 0x%08x", meta->spare[k]); printf("\n"); printf("disks "); for (k = 0; k < JMICRON_MAX_DISKS; k++) printf(" 0x%08x", meta->disks[k]); printf("\n"); printf("=================================================\n"); } static struct jmicron_raid_conf * jmicron_meta_copy(struct jmicron_raid_conf *meta) { struct jmicron_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int jmicron_meta_total_disks(struct jmicron_raid_conf *meta) { int pos; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { if (meta->disks[pos] == 0) break; } return (pos); } static int jmicron_meta_total_spare(struct jmicron_raid_conf *meta) { int pos, n; n = 0; for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { if (meta->spare[pos] != 0) n++; } return (n); } /* * Generate fake Configuration ID based on disk IDs. * Note: it will change after each disk set change. */ static uint32_t jmicron_meta_config_id(struct jmicron_raid_conf *meta) { int pos; uint32_t config_id; config_id = 0; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) config_id += meta->disks[pos] << pos; return (config_id); } static void jmicron_meta_get_name(struct jmicron_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void jmicron_meta_put_name(struct jmicron_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static int jmicron_meta_find_disk(struct jmicron_raid_conf *meta, uint32_t id) { int pos; id &= JMICRON_DISK_MASK; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { if ((meta->disks[pos] & JMICRON_DISK_MASK) == id) return (pos); } for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { if ((meta->spare[pos] & JMICRON_DISK_MASK) == id) return (-3); } return (-1); } static struct jmicron_raid_conf * jmicron_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct jmicron_raid_conf *meta; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct jmicron_raid_conf *)buf; /* Check if this is an JMicron RAID struct */ if (strncmp(meta->signature, JMICRON_MAGIC, strlen(JMICRON_MAGIC))) { G_RAID_DEBUG(1, "JMicron signature check failed on %s", pp->name); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "JMicron checksum check failed on %s", pp->name); free(meta, M_MD_JMICRON); return (NULL); } return (meta); } static int jmicron_meta_write(struct g_consumer *cp, struct jmicron_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. 
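The invariant, as the read path above also checks, is that the first 64 16-bit words (128 bytes) of the structure sum to zero modulo 2^16; the writer zeroes the field, sums, and stores the negated sum. As a standalone sketch of the same rule:

	#include <stdint.h>

	// returns 0 for valid JMicron metadata
	static uint16_t
	jm_sum64(const uint16_t *w)
	{
		uint16_t s = 0;
		int i;

		for (i = 0; i < 64; i++)
			s += w[i];
		return (s);
	}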
*/ buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_JMICRON); return (error); } static int jmicron_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_JMICRON); return (error); } static struct g_raid_disk * g_raid_md_jmicron_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_jmicron_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_jmicron_supported(int level, int qual, int disks, int force) { if (disks > 8) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); if (!force) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); if (!force) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_jmicron_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd, *oldpd; struct jmicron_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) disk_pos = jmicron_meta_find_disk(meta, pd->pd_disk_id); else disk_pos = -1; if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. 
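Big enough means the candidate can hold every subdisk it would inherit plus the one-sector metadata anchor at the end; the JMicron reserve here is 512 bytes where the Intel module above reserved 4096. The per-subdisk fit test, inverted from the check below (sketch):

	// candidate must cover the data plus the trailing reserve
	fits = (sd->sd_offset + sd->sd_size + 512 <= pd->pd_disk_size);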
*/ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (disk_pos == -3 || pd->pd_disk_pos == -3) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_jmicron_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* Update global metadata just in case. */ meta->disks[disk_pos] = pd->pd_disk_id; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes/offsets, * especially in concat mode. Update. */ if (!resurrection) { sd->sd_offset = (off_t)pd->pd_meta->offset * 16 * 512; //ZZZ sd->sd_size = (((off_t)pd->pd_meta->disk_sectors_high << 16) + pd->pd_meta->disk_sectors_low) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if ((meta->flags & JMICRON_F_BADSEC) != 0 && (pd->pd_meta->flags & JMICRON_F_BADSEC) == 0) { /* Cold-inserted or rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta->flags & JMICRON_F_UNSYNC) { /* Dirty or resyncing disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_jmicron_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_JMICRON); } static void g_raid_md_jmicron_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; update = 0; do { /* Make sure we don't miss anything.
*/ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_JMICRON, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_jmicron_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_jmicron_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct jmicron_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; off_t size; int j, disk_pos; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ jmicron_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); size = ((off_t)meta->disk_sectors_high << 16) + meta->disk_sectors_low; size *= 512; //ZZZ vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == JMICRON_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; vol->v_mediasize = size * mdi->mdio_total_disks; } else if (meta->type == JMICRON_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; vol->v_mediasize = size; } else if (meta->type == JMICRON_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; vol->v_mediasize = size * mdi->mdio_total_disks / 2; } else if (meta->type == JMICRON_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; vol->v_mediasize = 0; } else if (meta->type == JMICRON_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; vol->v_mediasize = size * (mdi->mdio_total_disks - 1); } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_mediasize = 0; } vol->v_strip_size = 1024 << meta->stripe_shift; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = (off_t)meta->offset * 16 * 512; //ZZZ sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; pd->pd_disk_id = meta->disks[disk_pos]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* Make all disks found till the moment take their places. 
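 * (i.e., all disks found up to this moment): the loop below repeatedly
 * starts any disk still in the NONE state until a full pass finds none.
 *
 * The volume sizing in start() above follows directly from the RAID
 * level; as a sketch (hypothetical helper, disksize is the per-disk
 * size in bytes):
 *
 *	static off_t
 *	jm_vol_mediasize(uint8_t type, off_t disksize, int ndisks)
 *	{
 *		switch (type) {
 *		case JMICRON_T_RAID0:  return (disksize * ndisks);
 *		case JMICRON_T_RAID1:  return (disksize);
 *		case JMICRON_T_RAID01: return (disksize * ndisks / 2);
 *		case JMICRON_T_RAID5:  return (disksize * (ndisks - 1));
 *		default:               return (0);	/* CONCAT/unknown */
 *		}
 *	}
 *
 * and the strip size is decoded as 1024 << stripe_shift.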
*/ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_jmicron_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_jmicron_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_jmicron_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct jmicron_raid_conf *pdmeta; struct g_raid_md_jmicron_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_jmicron_start_disk(disk)) g_raid_md_write_jmicron(md, NULL, NULL, NULL); } else { /* * If we haven't started yet - update common metadata * to get subdisks details, avoiding data from spare disks. */ if (mdi->mdio_meta == NULL || jmicron_meta_find_disk(mdi->mdio_meta, mdi->mdio_meta->disk_id) == -3) { if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = jmicron_meta_copy(pdmeta); mdi->mdio_total_disks = jmicron_meta_total_disks(pdmeta); } mdi->mdio_meta->flags |= pdmeta->flags & JMICRON_F_BADSEC; mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d+%d up)", mdi->mdio_disks_present, mdi->mdio_total_disks, jmicron_meta_total_spare(mdi->mdio_meta)); /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks + jmicron_meta_total_spare(mdi->mdio_meta)) g_raid_md_jmicron_start(sc); } } static void g_raid_jmicron_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_jmicron(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_jmicron_object *mdi; char name[16]; mdi = (struct g_raid_md_jmicron_object *)md; mdi->mdio_config_id = arc4random(); snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_jmicron(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_jmicron_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct jmicron_raid_conf *meta; struct g_raid_md_jmicron_perdisk *pd; struct g_geom *geom; int disk_pos, result, spare, len; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting JMicron on %s", cp->provider->name); mdi = (struct g_raid_md_jmicron_object *)md; pp = cp->provider; /* Read metadata from device. 
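 * For JMicron the metadata anchor is the last sector of the provider,
 * matching the write path above:
 *
 *	anchor = pp->mediasize - pp->sectorsize;
 *
 * (the NVIDIA format later in this file instead uses the
 * second-to-last sector, mediasize - 2 * sectorsize).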
*/ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = jmicron_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x197b) { G_RAID_DEBUG(1, "No JMicron metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "JMicron vendor mismatch 0x%04x != 0x197b", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = jmicron_meta_find_disk(meta, meta->disk_id); if (disk_pos == -1) { G_RAID_DEBUG(1, "JMicron disk_id %08x not found", meta->disk_id); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_jmicron_print(meta); G_RAID_DEBUG(1, "JMicron disk position %d", disk_pos); spare = (disk_pos == -2) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_jmicron_object *)sc->sc_md; if (spare == 2) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_config_id == jmicron_meta_config_id(meta)) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_config_id = jmicron_meta_config_id(meta); snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_jmicron_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-JMicron"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; } else { pd->pd_disk_pos = -1; pd->pd_disk_id = meta->disk_id; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_jmicron_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_JMICRON); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_jmicron(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_jmicron_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. 
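 * ("assigned" meaning pd_disk_pos >= 0, i.e., the disk occupies a real
 * slot in the array). The negative positions act as sentinels; as this
 * file uses them (a reading of the code, not driver definitions):
 *
 *	-1	tasted, position not yet determined
 *	-2	old disk just displaced by a replacement
 *	-3	spare, outside the slot table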
*/ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_jmicron(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_jmicron_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= sectorsize; /* Handle size argument. 
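 * At this point "size" is the smallest member mediasize minus one
 * sector reserved for the metadata block; the optional user-supplied
 * size may only shrink it further (sketch of the effective rule):
 *
 *	usable = min_mediasize - sectorsize;
 *	if (sizearg > 0 && sizearg <= usable)
 *		usable = sizearg;	/* larger requests are rejected */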
*/ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_jmicron_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. 
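 * (the "delete" verb below refuses to proceed while a volume is open,
 * unless forced). Above, the label path rounds the per-disk size down
 * so every stripe is complete; the same rule as a sketch:
 *
 *	static off_t
 *	jm_round_size(int level, int ndisks, off_t size, off_t ssize,
 *	    off_t strip)
 *	{
 *		if (level == G_RAID_VOLUME_RL_RAID1)
 *			return (size - size % ssize);
 *		if (level == G_RAID_VOLUME_RL_RAID1E && (ndisks & 1) != 0)
 *			return (size - size % (2 * strip));
 *		return (size - size % strip);
 *	}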
*/ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) jmicron_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_jmicron(md, NULL, disk); continue; } pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ jmicron_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. 
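 * Here "update" counts how many inserted disks actually took a slot
 * (g_raid_md_jmicron_start_disk() returned nonzero), so the metadata
 * is rewritten at most once, after the loop.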
*/ if (update) g_raid_md_write_jmicron(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_jmicron(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct jmicron_raid_conf *meta; int i, spares; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK | M_ZERO); strncpy(meta->signature, JMICRON_MAGIC, 2); meta->version = JMICRON_VERSION; jmicron_meta_put_name(meta, vol->v_name); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->type = JMICRON_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->type = JMICRON_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = JMICRON_T_RAID01; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = JMICRON_T_CONCAT; else meta->type = JMICRON_T_RAID5; meta->stripe_shift = fls(vol->v_strip_size / 2048); meta->flags = JMICRON_F_READY | JMICRON_F_BOOTABLE; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == NULL || sd->sd_disk->d_md_data == NULL) meta->disks[i] = 0xffffffff; else { pd = (struct g_raid_md_jmicron_perdisk *) sd->sd_disk->d_md_data; meta->disks[i] = pd->pd_disk_id; } if (sd->sd_state < G_RAID_SUBDISK_S_STALE) meta->flags |= JMICRON_F_BADSEC; if (vol->v_dirty) meta->flags |= JMICRON_F_UNSYNC; } /* Put spares to their slots. */ spares = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_SPARE) continue; meta->spare[spares] = pd->pd_disk_id; if (++spares >= 2) break; } /* We are done. Print meta data and store them to disks. */ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_JMICRON); pd->pd_meta = NULL; } pd->pd_meta = jmicron_meta_copy(meta); pd->pd_meta->disk_id = pd->pd_disk_id; if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { pd->pd_meta->offset = (sd->sd_offset / 512) / 16; pd->pd_meta->disk_sectors_high = (sd->sd_size / 512) >> 16; pd->pd_meta->disk_sectors_low = (sd->sd_size / 512) & 0xffff; if (sd->sd_state < G_RAID_SUBDISK_S_STALE) pd->pd_meta->flags &= ~JMICRON_F_BADSEC; else if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) pd->pd_meta->flags |= JMICRON_F_UNSYNC; } G_RAID_DEBUG(1, "Writing JMicron metadata to %s", g_raid_get_diskname(disk)); g_raid_md_jmicron_print(pd->pd_meta); jmicron_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_jmicron(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_jmicron_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_jmicron_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. 
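 * (i.e., a disk that is not currently part of the array cannot be
 * failed). Above, g_raid_md_write_jmicron() stores the strip size as a
 * shift; the encode/decode pair round-trips for the power-of-two strip
 * sizes used here (sketch, fls() as in the kernel):
 *
 *	shift = fls(strip_bytes / 2048);	/* encode, as above */
 *	strip_bytes = 1024 << shift;		/* decode, as in start() */
 *
 * e.g. 131072 / 2048 = 64, fls(64) = 7, and 1024 << 7 = 131072.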
*/ if (pd->pd_disk_pos < 0) return (-1); if (tdisk->d_consumer) jmicron_meta_erase(tdisk->d_consumer); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_jmicron(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (0); } static int g_raid_md_free_disk_jmicron(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_jmicron_perdisk *pd; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_JMICRON); pd->pd_meta = NULL; } free(pd, M_MD_JMICRON); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_jmicron(struct g_raid_md_object *md) { struct g_raid_md_jmicron_object *mdi; mdi = (struct g_raid_md_jmicron_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(jmicron, "JMicron"); Index: head/sys/geom/raid/md_nvidia.c =================================================================== --- head/sys/geom/raid/md_nvidia.c (revision 326269) +++ head/sys/geom/raid/md_nvidia.c (revision 326270) @@ -1,1583 +1,1585 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_NVIDIA, "md_nvidia_data", "GEOM_RAID NVIDIA metadata"); struct nvidia_raid_conf { uint8_t nvidia_id[8]; #define NVIDIA_MAGIC "NVIDIA " uint32_t config_size; uint32_t checksum; uint16_t version; uint8_t disk_number; uint8_t dummy_0; uint32_t total_sectors; uint32_t sector_size; uint8_t name[16]; uint8_t revision[4]; uint32_t disk_status; uint32_t magic_0; #define NVIDIA_MAGIC0 0x00640044 uint64_t volume_id[2]; uint8_t state; #define NVIDIA_S_IDLE 0 #define NVIDIA_S_INIT 2 #define NVIDIA_S_REBUILD 3 #define NVIDIA_S_UPGRADE 4 #define NVIDIA_S_SYNC 5 uint8_t array_width; uint8_t total_disks; uint8_t orig_array_width; uint16_t type; #define NVIDIA_T_RAID0 0x0080 #define NVIDIA_T_RAID1 0x0081 #define NVIDIA_T_RAID3 0x0083 #define NVIDIA_T_RAID5 0x0085 /* RLQ = 00/02? */ #define NVIDIA_T_RAID5_SYM 0x0095 /* RLQ = 03 */ #define NVIDIA_T_RAID10 0x008a #define NVIDIA_T_RAID01 0x8180 #define NVIDIA_T_CONCAT 0x00ff uint16_t dummy_3; uint32_t strip_sectors; uint32_t strip_bytes; uint32_t strip_shift; uint32_t strip_mask; uint32_t stripe_sectors; uint32_t stripe_bytes; uint32_t rebuild_lba; uint32_t orig_type; uint32_t orig_total_sectors; uint32_t status; #define NVIDIA_S_BOOTABLE 0x00000001 #define NVIDIA_S_DEGRADED 0x00000002 uint32_t filler[98]; } __packed; struct g_raid_md_nvidia_perdisk { struct nvidia_raid_conf *pd_meta; int pd_disk_pos; off_t pd_disk_size; }; struct g_raid_md_nvidia_object { struct g_raid_md_object mdio_base; uint64_t mdio_volume_id[2]; struct nvidia_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
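 * Held via root_mount_hold() while the array is still assembling and
 * released in g_raid_md_nvidia_start() or on free, so mounting root
 * does not race array assembly.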
*/ }; static g_raid_md_create_t g_raid_md_create_nvidia; static g_raid_md_taste_t g_raid_md_taste_nvidia; static g_raid_md_event_t g_raid_md_event_nvidia; static g_raid_md_ctl_t g_raid_md_ctl_nvidia; static g_raid_md_write_t g_raid_md_write_nvidia; static g_raid_md_fail_disk_t g_raid_md_fail_disk_nvidia; static g_raid_md_free_disk_t g_raid_md_free_disk_nvidia; static g_raid_md_free_t g_raid_md_free_nvidia; static kobj_method_t g_raid_md_nvidia_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_nvidia), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_nvidia), KOBJMETHOD(g_raid_md_event, g_raid_md_event_nvidia), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_nvidia), KOBJMETHOD(g_raid_md_write, g_raid_md_write_nvidia), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_nvidia), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_nvidia), KOBJMETHOD(g_raid_md_free, g_raid_md_free_nvidia), { 0, 0 } }; static struct g_raid_md_class g_raid_md_nvidia_class = { "NVIDIA", g_raid_md_nvidia_methods, sizeof(struct g_raid_md_nvidia_object), .mdc_enable = 1, .mdc_priority = 100 }; static int NVIDIANodeID = 1; static void g_raid_md_nvidia_print(struct nvidia_raid_conf *meta) { if (g_raid_debug < 1) return; printf("********* ATA NVIDIA RAID Metadata *********\n"); printf("nvidia_id <%.8s>\n", meta->nvidia_id); printf("config_size %u\n", meta->config_size); printf("checksum 0x%08x\n", meta->checksum); printf("version 0x%04x\n", meta->version); printf("disk_number %d\n", meta->disk_number); printf("dummy_0 0x%02x\n", meta->dummy_0); printf("total_sectors %u\n", meta->total_sectors); printf("sector_size %u\n", meta->sector_size); printf("name <%.16s>\n", meta->name); printf("revision 0x%02x%02x%02x%02x\n", meta->revision[0], meta->revision[1], meta->revision[2], meta->revision[3]); printf("disk_status 0x%08x\n", meta->disk_status); printf("magic_0 0x%08x\n", meta->magic_0); printf("volume_id 0x%016jx%016jx\n", meta->volume_id[1], meta->volume_id[0]); printf("state 0x%02x\n", meta->state); printf("array_width %u\n", meta->array_width); printf("total_disks %u\n", meta->total_disks); printf("orig_array_width %u\n", meta->orig_array_width); printf("type 0x%04x\n", meta->type); printf("dummy_3 0x%04x\n", meta->dummy_3); printf("strip_sectors %u\n", meta->strip_sectors); printf("strip_bytes %u\n", meta->strip_bytes); printf("strip_shift %u\n", meta->strip_shift); printf("strip_mask 0x%08x\n", meta->strip_mask); printf("stripe_sectors %u\n", meta->stripe_sectors); printf("stripe_bytes %u\n", meta->stripe_bytes); printf("rebuild_lba %u\n", meta->rebuild_lba); printf("orig_type 0x%04x\n", meta->orig_type); printf("orig_total_sectors %u\n", meta->orig_total_sectors); printf("status 0x%08x\n", meta->status); printf("=================================================\n"); } static struct nvidia_raid_conf * nvidia_meta_copy(struct nvidia_raid_conf *meta) { struct nvidia_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int nvidia_meta_translate_disk(struct nvidia_raid_conf *meta, int md_disk_pos) { int disk_pos; if (md_disk_pos >= 0 && meta->type == NVIDIA_T_RAID01) { disk_pos = (md_disk_pos / meta->array_width) + (md_disk_pos % meta->array_width) * meta->array_width; } else disk_pos = md_disk_pos; return (disk_pos); } static void nvidia_meta_get_name(struct nvidia_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void 
nvidia_meta_put_name(struct nvidia_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static struct nvidia_raid_conf * nvidia_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct nvidia_raid_conf *meta; char *buf; int error, i; uint32_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - 2 * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct nvidia_raid_conf *)buf; /* Check if this is an NVIDIA RAID struct */ if (strncmp(meta->nvidia_id, NVIDIA_MAGIC, strlen(NVIDIA_MAGIC))) { G_RAID_DEBUG(1, "NVIDIA signature check failed on %s", pp->name); g_free(buf); return (NULL); } if (meta->config_size > 128 || meta->config_size < 30) { G_RAID_DEBUG(1, "NVIDIA metadata size looks wrong: %d", meta->config_size); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < meta->config_size; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "NVIDIA checksum check failed on %s", pp->name); free(meta, M_MD_NVIDIA); return (NULL); } /* Check volume state. */ if (meta->state != NVIDIA_S_IDLE && meta->state != NVIDIA_S_INIT && meta->state != NVIDIA_S_REBUILD && meta->state != NVIDIA_S_SYNC) { G_RAID_DEBUG(1, "NVIDIA unknown state on %s (0x%02x)", pp->name, meta->state); free(meta, M_MD_NVIDIA); return (NULL); } /* Check raid type. */ if (meta->type != NVIDIA_T_RAID0 && meta->type != NVIDIA_T_RAID1 && meta->type != NVIDIA_T_RAID3 && meta->type != NVIDIA_T_RAID5 && meta->type != NVIDIA_T_RAID5_SYM && meta->type != NVIDIA_T_RAID01 && meta->type != NVIDIA_T_CONCAT) { G_RAID_DEBUG(1, "NVIDIA unknown RAID level on %s (0x%02x)", pp->name, meta->type); free(meta, M_MD_NVIDIA); return (NULL); } return (meta); } static int nvidia_meta_write(struct g_consumer *cp, struct nvidia_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint32_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < meta->config_size; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); /* Write metadata. 
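 * The checksum convention, from nvidia_meta_read() above: the first
 * config_size 32-bit words of the structure must sum to zero, so the
 * writer clears the field and stores the negated sum of the remaining
 * words, which is what the "meta->checksum -= checksum" above amounts
 * to. Equivalently:
 *
 *	meta->checksum = 0;
 *	for (sum = 0, ptr = (uint32_t *)meta, i = 0;
 *	    i < meta->config_size; i++)
 *		sum += *ptr++;
 *	meta->checksum = -sum;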
*/ error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_NVIDIA); return (error); } static int nvidia_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_NVIDIA); return (error); } static struct g_raid_disk * g_raid_md_nvidia_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_nvidia_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_nvidia_supported(int level, int qual, int disks, int force) { switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks < 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA && qual != G_RAID_VOLUME_RLQ_R5LS) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_nvidia_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd, *oldpd; struct nvidia_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) { disk_pos = pd->pd_meta->disk_number; if (disk_pos >= meta->total_disks || mdi->mdio_started) disk_pos = -3; } else disk_pos = -3; /* For RAID0+1 we need to translate order. */ disk_pos = nvidia_meta_translate_disk(meta, disk_pos); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. 
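 * NVIDIA reserves two 512-byte sectors at the end of the disk, hence
 * the "+ 2 * 512" in the check below (note the debug message still
 * prints the single-sector figure). The predicate, as a sketch:
 *
 *	fits = (sd_offset + sd_size + 2 * 512 <= disk_size);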
*/ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 2 * 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_nvidia_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else// if (pd->pd_meta->disk_status == NVIDIA_S_CURRENT || //pd->pd_meta->disk_status == NVIDIA_S_REBUILD) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); // else // g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes, * in concat mode. Update from real disk size. */ if (meta->type == NVIDIA_T_CONCAT) sd->sd_size = pd->pd_disk_size - 0x800 * 512; if (resurrection) { /* New or ex-spare disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->state == NVIDIA_S_REBUILD && (pd->pd_meta->disk_status & 0x100)) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / meta->array_width * pd->pd_meta->sector_size; } else if (meta->state == NVIDIA_S_SYNC) { /* Resyncing/dirty disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / meta->array_width * pd->pd_meta->sector_size; } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_nvidia_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_NVIDIA); } static void g_raid_md_nvidia_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; update = 0; do { /* Make sure we miss anything. 
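 * That is, loop until a pass makes no progress, so no usable disk is
 * missed. The rebuild checkpoint decoded in start_disk() above turns a
 * volume LBA into a per-disk byte offset (sketch):
 *
 *	sd_rebuild_pos = (off_t)rebuild_lba / array_width * sector_size;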
*/ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_NVIDIA, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_nvidia_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_nvidia_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct nvidia_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; off_t size; int j, disk_pos; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ nvidia_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); vol->v_mediasize = (off_t)meta->total_sectors * 512; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == NVIDIA_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; size = vol->v_mediasize / mdi->mdio_total_disks; } else if (meta->type == NVIDIA_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; size = vol->v_mediasize; } else if (meta->type == NVIDIA_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; size = vol->v_mediasize / (mdi->mdio_total_disks / 2); } else if (meta->type == NVIDIA_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; size = 0; } else if (meta->type == NVIDIA_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else if (meta->type == NVIDIA_T_RAID5_SYM) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LS; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; size = 0; } vol->v_strip_size = meta->strip_sectors * 512; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = 0; sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. 
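 * Each placeholder is an OFFLINE struct g_raid_disk owning one subdisk
 * slot; g_raid_md_nvidia_start_disk() later swaps a real disk into the
 * slot. Note that the two RAID5 on-disk types map to one level with
 * different qualifiers:
 *
 *	NVIDIA_T_RAID5      ->  RL_RAID5 / RLQ_R5LA
 *	NVIDIA_T_RAID5_SYM  ->  RL_RAID5 / RLQ_R5LS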
*/ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_nvidia_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_nvidia_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_nvidia_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct nvidia_raid_conf *pdmeta; struct g_raid_md_nvidia_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_nvidia_start_disk(disk)) g_raid_md_write_nvidia(md, NULL, NULL, NULL); } else { if (mdi->mdio_meta == NULL || mdi->mdio_meta->disk_number >= mdi->mdio_meta->total_disks) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = nvidia_meta_copy(pdmeta); mdi->mdio_total_disks = pdmeta->total_disks; mdi->mdio_disks_present = 1; } else if (pdmeta->disk_number < mdi->mdio_meta->total_disks) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_total_disks); } else G_RAID_DEBUG1(1, sc, "Spare disk"); /* If we collected all needed disks - start array. 
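 * Unlike the JMicron variant, NVIDIA metadata carries no spare count,
 * so the array starts as soon as disks_present equals total_disks.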
*/ if (mdi->mdio_disks_present == mdi->mdio_total_disks) g_raid_md_nvidia_start(sc); } } static void g_raid_nvidia_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_nvidia(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_nvidia_object *mdi; char name[32]; mdi = (struct g_raid_md_nvidia_object *)md; arc4rand(&mdi->mdio_volume_id, 16, 0); snprintf(name, sizeof(name), "NVIDIA-%d", atomic_fetchadd_int(&NVIDIANodeID, 1)); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_nvidia(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_nvidia_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct nvidia_raid_conf *meta; struct g_raid_md_nvidia_perdisk *pd; struct g_geom *geom; int result, spare, len; char name[32]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting NVIDIA on %s", cp->provider->name); mdi = (struct g_raid_md_nvidia_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = nvidia_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x10de) { G_RAID_DEBUG(1, "No NVIDIA metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "NVIDIA vendor mismatch 0x%04x != 0x10de", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Metadata valid. Print it. */ g_raid_md_nvidia_print(meta); G_RAID_DEBUG(1, "NVIDIA disk position %d", meta->disk_number); spare = 0;//(meta->type == NVIDIA_T_SPARE) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_nvidia_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (memcmp(&mdi1->mdio_volume_id, &meta->volume_id, 16) == 0) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; memcpy(&mdi->mdio_volume_id, &meta->volume_id, 16); snprintf(name, sizeof(name), "NVIDIA-%d", atomic_fetchadd_int(&NVIDIANodeID, 1)); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_nvidia_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-NVIDIA"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. 
*/ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; } else { pd->pd_disk_pos = -1; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_nvidia_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_NVIDIA); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_nvidia(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) { /* Bump volume ID to drop missing disks. */ arc4rand(&mdi->mdio_volume_id, 16, 0); g_raid_md_nvidia_start(sc); } return (0); } return (-1); } pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } if (mdi->mdio_started) { /* Bump volume ID to prevent disk resurrection. */ if (pd->pd_disk_pos >= 0) arc4rand(&mdi->mdio_volume_id, 16, 0); /* Write updated metadata to all disks. */ g_raid_md_write_nvidia(md, NULL, NULL, NULL); } /* Check if anything left except placeholders. 
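 * g_raid_ndisks(sc, -1) counts disks in any state; when it equals the
 * number of OFFLINE placeholders, nothing real remains and the node is
 * torn down.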
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_nvidia(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip, volsize; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LS"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_nvidia_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= 2 * sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. 
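 * The strip checks below mirror the JMicron ones: a multiple of the
 * sector size, at least one sector, at most 65535 sectors. Note that
 * this label path reserved two sectors for metadata above
 * (size -= 2 * sectorsize), matching the anchor written at
 * mediasize - 2 * sectorsize.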
*/ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) volsize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) volsize = size; else if (level == G_RAID_VOLUME_RL_RAID5) volsize = size * (numdisks - 1); else { /* RAID1E */ volsize = ((size * numdisks) / strip / 2) * strip; } if (volsize > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; vol->v_mediasize = volsize; vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_nvidia_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. 
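 * (again, "delete" refuses while a volume is open unless forced).
 * Above, the volume size is additionally capped because total_sectors
 * in the NVIDIA metadata is a 32-bit sector count:
 *
 *	volsize <= 0xffffffffULL * sectorsize;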
*/ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) nvidia_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_nvidia(md, NULL, disk); continue; } pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ nvidia_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state != G_RAID_DISK_S_SPARE && disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. 
*/ if (update) g_raid_md_write_nvidia(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_nvidia(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct nvidia_raid_conf *meta; int i, spares; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK | M_ZERO); if (mdi->mdio_meta) memcpy(meta, mdi->mdio_meta, sizeof(*meta)); memcpy(meta->nvidia_id, NVIDIA_MAGIC, sizeof(NVIDIA_MAGIC) - 1); meta->config_size = 30; meta->version = 0x0064; meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->sector_size = vol->v_sectorsize; nvidia_meta_put_name(meta, vol->v_name); meta->magic_0 = NVIDIA_MAGIC0; memcpy(&meta->volume_id, &mdi->mdio_volume_id, 16); meta->state = NVIDIA_S_IDLE; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->array_width = 1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->array_width = vol->v_disks_count / 2; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->array_width = vol->v_disks_count - 1; else meta->array_width = vol->v_disks_count; meta->total_disks = vol->v_disks_count; meta->orig_array_width = meta->array_width; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->type = NVIDIA_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->type = NVIDIA_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = NVIDIA_T_RAID01; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = NVIDIA_T_CONCAT; else if (vol->v_raid_level_qualifier == G_RAID_VOLUME_RLQ_R5LA) meta->type = NVIDIA_T_RAID5; else meta->type = NVIDIA_T_RAID5_SYM; meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; meta->strip_bytes = vol->v_strip_size; meta->strip_shift = ffs(meta->strip_sectors) - 1; meta->strip_mask = meta->strip_sectors - 1; meta->stripe_sectors = meta->strip_sectors * meta->orig_array_width; meta->stripe_bytes = meta->stripe_sectors * vol->v_sectorsize; meta->rebuild_lba = 0; meta->orig_type = meta->type; meta->orig_total_sectors = meta->total_sectors; meta->status = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if ((sd->sd_state == G_RAID_SUBDISK_S_STALE || sd->sd_state == G_RAID_SUBDISK_S_RESYNC || vol->v_dirty) && meta->state != NVIDIA_S_REBUILD) meta->state = NVIDIA_S_SYNC; else if (sd->sd_state == G_RAID_SUBDISK_S_NEW || sd->sd_state == G_RAID_SUBDISK_S_REBUILD) meta->state = NVIDIA_S_REBUILD; } /* We are done. Print meta data and store them to disks. */ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = meta; spares = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_NVIDIA); pd->pd_meta = NULL; } pd->pd_meta = nvidia_meta_copy(meta); if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { /* For RAID0+1 we need to translate order. 
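   (Editorial aside: a few lines up, the strip geometry fields are derived from
   the volume's strip size.  The sketch below, under invented names, restates
   that arithmetic; note the shift/mask encoding presumes a power-of-two
   strip_sectors, which ffs() alone does not verify.  For a 131072-byte strip
   on 512-byte sectors: strip_sectors = 256, strip_shift = 8, strip_mask =
   0xff.) */

#include <stdint.h>
#include <strings.h>		/* ffs() */

struct strip_fields {
	uint32_t sectors, shift, mask, stripe_sectors;
};

static struct strip_fields
make_strip_fields(uint32_t strip_bytes, uint32_t sectorsize, uint32_t width)
{
	struct strip_fields f;

	f.sectors = strip_bytes / sectorsize;
	f.shift = ffs(f.sectors) - 1;		/* log2 when power of two */
	f.mask = f.sectors - 1;			/* offset-within-strip mask */
	f.stripe_sectors = f.sectors * width;	/* one stripe across the set */
	return (f);
}

/* End of editorial sketch -- original code resumes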
*/ pd->pd_meta->disk_number = nvidia_meta_translate_disk(meta, sd->sd_pos); if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { pd->pd_meta->disk_status = 0x100; pd->pd_meta->rebuild_lba = sd->sd_rebuild_pos / vol->v_sectorsize * meta->array_width; } } else pd->pd_meta->disk_number = meta->total_disks + spares++; G_RAID_DEBUG(1, "Writing NVIDIA metadata to %s", g_raid_get_diskname(disk)); g_raid_md_nvidia_print(pd->pd_meta); nvidia_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_nvidia(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_nvidia_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_nvidia_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* Erase metadata to prevent disks's later resurrection. */ if (tdisk->d_consumer) nvidia_meta_erase(tdisk->d_consumer); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_nvidia(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (0); } static int g_raid_md_free_disk_nvidia(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_nvidia_perdisk *pd; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_NVIDIA); pd->pd_meta = NULL; } free(pd, M_MD_NVIDIA); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_nvidia(struct g_raid_md_object *md) { struct g_raid_md_nvidia_object *mdi; mdi = (struct g_raid_md_nvidia_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(nvidia, "NVIDIA"); Index: head/sys/geom/raid/md_promise.c =================================================================== --- head/sys/geom/raid/md_promise.c (revision 326269) +++ head/sys/geom/raid/md_promise.c (revision 326270) @@ -1,2006 +1,2008 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata"); #define PROMISE_MAX_DISKS 8 #define PROMISE_MAX_SUBDISKS 2 #define PROMISE_META_OFFSET 14 struct promise_raid_disk { uint8_t flags; /* Subdisk status. */ #define PROMISE_F_VALID 0x01 #define PROMISE_F_ONLINE 0x02 #define PROMISE_F_ASSIGNED 0x04 #define PROMISE_F_SPARE 0x08 #define PROMISE_F_DUPLICATE 0x10 #define PROMISE_F_REDIR 0x20 #define PROMISE_F_DOWN 0x40 #define PROMISE_F_READY 0x80 uint8_t number; /* Position in a volume. */ uint8_t channel; /* ATA channel number. */ uint8_t device; /* ATA device number. */ uint64_t id __packed; /* Subdisk ID. */ } __packed; struct promise_raid_conf { char promise_id[24]; #define PROMISE_MAGIC "Promise Technology, Inc." #define FREEBSD_MAGIC "FreeBSD ATA driver RAID " uint32_t dummy_0; uint64_t magic_0; #define PROMISE_MAGIC0(x) (((uint64_t)(x.channel) << 48) | \ ((uint64_t)(x.device != 0) << 56)) uint16_t magic_1; uint32_t magic_2; uint8_t filler1[470]; uint32_t integrity; #define PROMISE_I_VALID 0x00000080 struct promise_raid_disk disk; /* This subdisk info. */ uint32_t disk_offset; /* Subdisk offset. */ uint32_t disk_sectors; /* Subdisk size */ uint32_t disk_rebuild; /* Rebuild position. */ uint16_t generation; /* Generation number. */ uint8_t status; /* Volume status. */ #define PROMISE_S_VALID 0x01 #define PROMISE_S_ONLINE 0x02 #define PROMISE_S_INITED 0x04 #define PROMISE_S_READY 0x08 #define PROMISE_S_DEGRADED 0x10 #define PROMISE_S_MARKED 0x20 #define PROMISE_S_MIGRATING 0x40 #define PROMISE_S_FUNCTIONAL 0x80 uint8_t type; /* Voluem type. */ #define PROMISE_T_RAID0 0x00 #define PROMISE_T_RAID1 0x01 #define PROMISE_T_RAID3 0x02 #define PROMISE_T_RAID5 0x04 #define PROMISE_T_SPAN 0x08 #define PROMISE_T_JBOD 0x10 uint8_t total_disks; /* Disks in this volume. */ uint8_t stripe_shift; /* Strip size. */ uint8_t array_width; /* Number of RAID0 stripes. */ uint8_t array_number; /* Global volume number. */ uint32_t total_sectors; /* Volume size. */ uint16_t cylinders; /* Volume geometry: C. */ uint8_t heads; /* Volume geometry: H. */ uint8_t sectors; /* Volume geometry: S. */ uint64_t volume_id __packed; /* Volume ID, */ struct promise_raid_disk disks[PROMISE_MAX_DISKS]; /* Subdisks in this volume. */ char name[32]; /* Volume label. */ uint32_t filler2[8]; uint32_t magic_3; /* Something related to rebuild. */ uint64_t rebuild_lba64; /* Per-volume rebuild position. 
*/ uint32_t magic_4; uint32_t magic_5; uint32_t total_sectors_high; uint8_t magic_6; uint8_t sector_size; uint16_t magic_7; uint32_t magic_8[31]; uint32_t backup_time; uint16_t magic_9; uint32_t disk_offset_high; uint32_t disk_sectors_high; uint32_t disk_rebuild_high; uint16_t magic_10; uint32_t magic_11[3]; uint32_t filler3[284]; uint32_t checksum; } __packed; struct g_raid_md_promise_perdisk { int pd_updated; int pd_subdisks; struct promise_raid_conf *pd_meta[PROMISE_MAX_SUBDISKS]; }; struct g_raid_md_promise_pervolume { struct promise_raid_conf *pv_meta; uint64_t pv_id; uint16_t pv_generation; int pv_disks_present; int pv_started; struct callout pv_start_co; /* STARTING state timer. */ }; static g_raid_md_create_t g_raid_md_create_promise; static g_raid_md_taste_t g_raid_md_taste_promise; static g_raid_md_event_t g_raid_md_event_promise; static g_raid_md_volume_event_t g_raid_md_volume_event_promise; static g_raid_md_ctl_t g_raid_md_ctl_promise; static g_raid_md_write_t g_raid_md_write_promise; static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise; static g_raid_md_free_disk_t g_raid_md_free_disk_promise; static g_raid_md_free_volume_t g_raid_md_free_volume_promise; static g_raid_md_free_t g_raid_md_free_promise; static kobj_method_t g_raid_md_promise_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_promise), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_promise), KOBJMETHOD(g_raid_md_event, g_raid_md_event_promise), KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_promise), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_promise), KOBJMETHOD(g_raid_md_write, g_raid_md_write_promise), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_promise), KOBJMETHOD(g_raid_md_free, g_raid_md_free_promise), { 0, 0 } }; static struct g_raid_md_class g_raid_md_promise_class = { "Promise", g_raid_md_promise_methods, sizeof(struct g_raid_md_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_promise_print(struct promise_raid_conf *meta) { int i; if (g_raid_debug < 1) return; printf("********* ATA Promise Metadata *********\n"); printf("promise_id <%.24s>\n", meta->promise_id); printf("disk %02x %02x %02x %02x %016jx\n", meta->disk.flags, meta->disk.number, meta->disk.channel, meta->disk.device, meta->disk.id); printf("disk_offset %u\n", meta->disk_offset); printf("disk_sectors %u\n", meta->disk_sectors); printf("disk_rebuild %u\n", meta->disk_rebuild); printf("generation %u\n", meta->generation); printf("status 0x%02x\n", meta->status); printf("type %u\n", meta->type); printf("total_disks %u\n", meta->total_disks); printf("stripe_shift %u\n", meta->stripe_shift); printf("array_width %u\n", meta->array_width); printf("array_number %u\n", meta->array_number); printf("total_sectors %u\n", meta->total_sectors); printf("cylinders %u\n", meta->cylinders); printf("heads %u\n", meta->heads); printf("sectors %u\n", meta->sectors); printf("volume_id 0x%016jx\n", meta->volume_id); printf("disks:\n"); for (i = 0; i < PROMISE_MAX_DISKS; i++ ) { printf(" %02x %02x %02x %02x %016jx\n", meta->disks[i].flags, meta->disks[i].number, meta->disks[i].channel, meta->disks[i].device, meta->disks[i].id); } printf("name <%.32s>\n", meta->name); printf("magic_3 0x%08x\n", meta->magic_3); printf("rebuild_lba64 %ju\n", meta->rebuild_lba64); printf("magic_4 0x%08x\n", meta->magic_4); printf("magic_5 0x%08x\n", meta->magic_5); printf("total_sectors_high 0x%08x\n", 
meta->total_sectors_high); printf("sector_size %u\n", meta->sector_size); printf("backup_time %d\n", meta->backup_time); printf("disk_offset_high 0x%08x\n", meta->disk_offset_high); printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); printf("disk_rebuild_high 0x%08x\n", meta->disk_rebuild_high); printf("=================================================\n"); } static struct promise_raid_conf * promise_meta_copy(struct promise_raid_conf *meta) { struct promise_raid_conf *nmeta; nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK); memcpy(nmeta, meta, sizeof(*nmeta)); return (nmeta); } static int promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (meta->disks[pos].id == id) return (pos); } return (-1); } static int promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd, off_t sectors, off_t *off, off_t *size) { off_t coff, csize, tmp; int i, j; sectors -= 131072; *off = 0; *size = 0; coff = 0; csize = sectors; i = 0; while (1) { for (j = 0; j < nsd; j++) { tmp = ((off_t)metaarr[j]->disk_offset_high << 32) + metaarr[j]->disk_offset; if (tmp >= coff) csize = MIN(csize, tmp - coff); } if (csize > *size) { *off = coff; *size = csize; } if (i >= nsd) break; coff = ((off_t)metaarr[i]->disk_offset_high << 32) + metaarr[i]->disk_offset + ((off_t)metaarr[i]->disk_sectors_high << 32) + metaarr[i]->disk_sectors; csize = sectors - coff; i++; } return ((*size > 0) ? 1 : 0); } static int promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos) { int disk_pos, width; if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { width = vol->v_disks_count / 2; disk_pos = (md_disk_pos / width) + (md_disk_pos % width) * width; } else disk_pos = md_disk_pos; return (disk_pos); } static void promise_meta_get_name(struct promise_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 32); buf[32] = 0; for (i = 31; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void promise_meta_put_name(struct promise_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 32); memcpy(meta->name, buf, MIN(strlen(buf), 32)); } static int promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; int error, i, subdisks; uint32_t checksum, *ptr; pp = cp->provider; subdisks = 0; if (pp->sectorsize * 4 > MAXPHYS) { G_RAID_DEBUG(1, "%s: Blocksize is too big.", pp->name); return (subdisks); } next: /* Read metadata block. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisks * PROMISE_META_OFFSET), pp->sectorsize * 4, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (subdisks); } meta = (struct promise_raid_conf *)buf; /* Check if this is an Promise RAID struct */ if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) && strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) { if (subdisks == 0) G_RAID_DEBUG(1, "Promise signature check failed on %s", pp->name); g_free(buf); return (subdisks); } meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK); memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4)); g_free(buf); /* Check metadata checksum. 
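   (Editorial note: the verification below sums the first 511 little-endian
   32-bit words of the 2048-byte record and compares the result against the
   512th word, which holds the stored checksum.  A stand-alone restatement
   under the invented name promise_csum(); unsigned wraparound gives the
   intended mod-2^32 sum:) */

#include <stddef.h>
#include <stdint.h>

static uint32_t
promise_csum(const void *block)
{
	const uint32_t *ptr = block;
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < 511; i++)	/* word 511 is the checksum itself */
		sum += ptr[i];
	return (sum);
}

/* End of editorial sketch -- original code resumes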
*/ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if ((meta->integrity & PROMISE_I_VALID) == 0) { G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if (meta->total_disks > PROMISE_MAX_DISKS) { G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)", pp->name, meta->total_disks); free(meta, M_MD_PROMISE); return (subdisks); } /* Remove filler garbage from fields used in newer metadata. */ if (meta->disk_offset_high == 0x8b8c8d8e && meta->disk_sectors_high == 0x8788898a && meta->disk_rebuild_high == 0x83848586) { meta->disk_offset_high = 0; meta->disk_sectors_high = 0; if (meta->disk_rebuild == UINT32_MAX) meta->disk_rebuild_high = UINT32_MAX; else meta->disk_rebuild_high = 0; if (meta->total_sectors_high == 0x15161718) { meta->total_sectors_high = 0; meta->backup_time = 0; if (meta->rebuild_lba64 == 0x2122232425262728) meta->rebuild_lba64 = UINT64_MAX; } } if (meta->sector_size < 1 || meta->sector_size > 8) meta->sector_size = 1; /* Save this part and look for next. */ *metaarr = meta; metaarr++; subdisks++; if (subdisks < PROMISE_MAX_SUBDISKS) goto next; return (subdisks); } static int promise_meta_write(struct g_consumer *cp, struct promise_raid_conf **metaarr, int nsd) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; off_t off, size; int error, i, subdisk, fake; uint32_t checksum, *ptr; pp = cp->provider; subdisk = 0; fake = 0; next: buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO); meta = NULL; if (subdisk < nsd) { meta = metaarr[subdisk]; } else if (!fake && promise_meta_unused_range(metaarr, nsd, cp->provider->mediasize / cp->provider->sectorsize, &off, &size)) { /* Optionally add record for unused space. */ meta = (struct promise_raid_conf *)buf; memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); meta->disk_offset_high = off >> 32; meta->disk_offset = (uint32_t)off; meta->disk_sectors_high = size >> 32; meta->disk_sectors = (uint32_t)size; meta->disk_rebuild_high = UINT32_MAX; meta->disk_rebuild = UINT32_MAX; fake = 1; } if (meta != NULL) { /* Recalculate checksum for case if metadata were changed. 
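   (Editorial aside: throughout this file, sector offsets and sizes wider than
   32 bits are split between a legacy 32-bit field and a *_high companion
   recycled from old filler bytes -- that is what the garbage-pattern cleanup
   in promise_meta_read() above enables.  Invented helpers restating the
   split and join:) */

#include <stdint.h>

static int64_t
off48_join(uint32_t high, uint32_t low)
{
	return (((int64_t)high << 32) + low);	/* in sectors */
}

static void
off48_split(int64_t sectors, uint32_t *high, uint32_t *low)
{
	*high = sectors >> 32;
	*low = (uint32_t)sectors;
}

/* End of editorial sketch -- original checksum update resumes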
*/ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; meta->checksum = checksum; memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta))); } error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, pp->sectorsize * 4); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_PROMISE); subdisk++; if (subdisk < PROMISE_MAX_SUBDISKS) goto next; return (error); } static int promise_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, subdisk; pp = cp->provider; buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO); for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) { error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, 4 * pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_PROMISE); return (error); } static int promise_meta_write_spare(struct g_consumer *cp) { struct promise_raid_conf *meta; off_t tmp; int error; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); tmp = cp->provider->mediasize / cp->provider->sectorsize - 131072; meta->disk_sectors_high = tmp >> 32; meta->disk_sectors = (uint32_t)tmp; meta->disk_rebuild_high = UINT32_MAX; meta->disk_rebuild = UINT32_MAX; error = promise_meta_write(cp, &meta, 1); free(meta, M_MD_PROMISE); return (error); } static struct g_raid_volume * g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id) { struct g_raid_volume *vol; struct g_raid_md_promise_pervolume *pv; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (pv->pv_id == id) break; } return (vol); } static int g_raid_md_promise_purge_volumes(struct g_raid_softc *sc) { struct g_raid_volume *vol, *tvol; struct g_raid_md_promise_pervolume *pv; int i, res; res = 0; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) break; } if (i >= vol->v_disks_count) { g_raid_destroy_volume(vol); res = 1; } } return (res); } static int g_raid_md_promise_purge_disks(struct g_raid_softc *sc) { struct g_raid_disk *disk, *tdisk; struct g_raid_volume *vol; struct g_raid_md_promise_perdisk *pd; int i, j, res; res = 0; TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_state == G_RAID_DISK_S_SPARE) continue; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; /* Scan for deleted volumes. */ for (i = 0; i < pd->pd_subdisks; ) { vol = g_raid_md_promise_get_volume(sc, pd->pd_meta[i]->volume_id); if (vol != NULL && !vol->v_stopping) { i++; continue; } free(pd->pd_meta[i], M_MD_PROMISE); for (j = i; j < pd->pd_subdisks - 1; j++) pd->pd_meta[j] = pd->pd_meta[j + 1]; pd->pd_meta[pd->pd_subdisks - 1] = NULL; pd->pd_subdisks--; pd->pd_updated = 1; } /* If there is no metadata left - erase and delete disk. 
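   (Editorial aside: promise_meta_write() and promise_meta_erase() above place
   each 4-sector record at a fixed distance from the end of the provider:
   63 sectors back for subdisk slot 0 and, with PROMISE_META_OFFSET == 14,
   49 sectors back for slot 1.  An invented helper restating the formula:) */

#include <stdint.h>

#define PROMISE_META_OFFSET	14	/* as defined earlier in this file */

static int64_t
promise_meta_start(int64_t mediasize, int64_t sectorsize, int subdisk)
{
	return (mediasize - sectorsize * (63 - subdisk * PROMISE_META_OFFSET));
}

/* End of editorial sketch -- original code resumes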
*/ if (pd->pd_subdisks == 0) { promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); res = 1; } } return (res); } static int g_raid_md_promise_supported(int level, int qual, int disks, int force) { if (disks > PROMISE_MAX_DISKS) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn, struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t eoff, esize, size; int disk_pos, md_disk_pos, i, resurrection = 0; sc = disk->d_softc; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; pv = vol->v_md_data; meta = pv->pv_meta; if (sdn >= 0) { /* Find disk position in metadata by its serial. */ md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id); /* For RAID0+1 we need to translate order. */ disk_pos = promise_meta_translate_disk(vol, md_disk_pos); } else { md_disk_pos = -1; disk_pos = -1; } if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s", g_raid_get_diskname(disk), vol->v_name); /* Failed stale disk is useless for us. */ if (sdn >= 0 && pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we were given specific metadata subdisk - erase it. */ if (sdn >= 0) { free(pd->pd_meta[sdn], M_MD_PROMISE); for (i = sdn; i < pd->pd_subdisks - 1; i++) pd->pd_meta[i] = pd->pd_meta[i + 1]; pd->pd_meta[pd->pd_subdisks - 1] = NULL; pd->pd_subdisks--; } /* If we are in the start process, that's all for now. */ if (!pv->pv_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, disk->d_consumer->provider->mediasize / disk->d_consumer->provider->sectorsize, &eoff, &esize); if (esize == 0) { G_RAID_DEBUG1(1, sc, "No free space on disk %s", g_raid_get_diskname(disk)); goto nofit; } size = INT64_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_NONE) size = sd->sd_size; if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && (disk_pos < 0 || vol->v_subdisks[i].sd_state < sd->sd_state)) disk_pos = i; } if (disk_pos >= 0 && vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && (off_t)esize * 512 < size) { G_RAID_DEBUG1(1, sc, "Disk %s free space " "is too small (%ju < %ju)", g_raid_get_diskname(disk), (off_t)esize * 512, size); disk_pos = -1; } if (disk_pos >= 0) { if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) esize = size / 512; /* For RAID0+1 we need to translate order. 
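   (Editorial note: promise_meta_translate_disk(), shown earlier, permutes
   RAID1E positions between the metadata's layout and g_raid's.  A stand-alone
   restatement exercising the one case g_raid_md_promise_supported() admits
   without force -- four disks, width 2 -- where metadata positions 0,1,2,3
   map to g_raid positions 0,2,1,3, and the mapping is its own inverse:) */

#include <stdio.h>

static int
translate_raid1e(int md_pos, int disks_count)
{
	int width = disks_count / 2;

	return ((md_pos / width) + (md_pos % width) * width);
}

int
main(void)
{
	int i;

	for (i = 0; i < 4; i++)
		printf("md %d -> raid %d\n", i, translate_raid1e(i, 4));
	return (0);
}

/* End of editorial sketch -- original code resumes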
*/ md_disk_pos = promise_meta_translate_disk(vol, disk_pos); } else { nofit: if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); } return (0); } G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", g_raid_get_diskname(disk), disk_pos, vol->v_name); resurrection = 1; } sd = &vol->v_subdisks[disk_pos]; if (resurrection && sd->sd_disk != NULL) { g_raid_change_disk_state(sd->sd_disk, G_RAID_DISK_S_STALE_FAILED); TAILQ_REMOVE(&sd->sd_disk->d_subdisks, sd, sd_next); } vol->v_subdisks[disk_pos].sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (resurrection) { sd->sd_offset = (off_t)eoff * 512; sd->sd_size = (off_t)esize * 512; } else { sd->sd_offset = (((off_t)pd->pd_meta[sdn]->disk_offset_high << 32) + pd->pd_meta[sdn]->disk_offset) * 512; sd->sd_size = (((off_t)pd->pd_meta[sdn]->disk_sectors_high << 32) + pd->pd_meta[sdn]->disk_sectors) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) { /* Failed disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta[sdn]->generation != meta->generation) sd->sd_rebuild_pos = 0; else { sd->sd_rebuild_pos = (((off_t)pd->pd_meta[sdn]->disk_rebuild_high << 32) + pd->pd_meta[sdn]->disk_rebuild) * 512; } } else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta[sdn]->generation != meta->generation || (meta->status & PROMISE_S_MARKED)) { /* Stale disk or dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); return (resurrection); } static void g_raid_md_promise_refill(struct g_raid_softc *sc) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; int update, updated, i, bad; md = sc->sc_md; restart: updated = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; /* Search for subdisk that needs replacement. */ bad = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) bad = 1; } if (!bad) continue; G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " "trying to refill.", vol->v_name); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { /* Skip failed. */ if (disk->d_state < G_RAID_DISK_S_SPARE) continue; /* Skip already used by this volume. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == disk) break; } if (i < vol->v_disks_count) continue; /* Try to use disk if it has empty extents. 
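   (Editorial aside: the state ladder above reduces to a small decision table.
   An invented summary -- flag values copied from the header section of this
   file -- returning the selected G_RAID_SUBDISK_S_* choice as a label:) */

#include <stdint.h>

#define PROMISE_F_ONLINE	0x02
#define PROMISE_F_REDIR		0x20
#define PROMISE_F_DOWN		0x40
#define PROMISE_S_MARKED	0x20

static const char *
classify_subdisk(int resurrection, uint8_t flags, int same_generation,
    uint8_t vol_status)
{
	if (resurrection)
		return ("NEW");		/* stale disk, almost same as new */
	if (flags & PROMISE_F_DOWN)
		return ("FAILED");
	if (flags & PROMISE_F_REDIR)
		return ("REBUILD");	/* position restored from metadata */
	if (!(flags & PROMISE_F_ONLINE))
		return ("NEW");
	if (!same_generation || (vol_status & PROMISE_S_MARKED))
		return ("STALE");	/* old metadata or unclean shutdown */
	return ("ACTIVE");
}

/* End of editorial sketch -- original refill logic resumes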
*/ pd = disk->d_md_data; if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) { update = g_raid_md_promise_start_disk(disk, -1, vol); } else update = 0; if (update) { updated = 1; g_raid_md_write_promise(md, vol, NULL, disk); break; } } } if (updated) goto restart; } static void g_raid_md_promise_start(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; u_int i; sc = vol->v_softc; md = sc->sc_md; pv = vol->v_md_data; meta = pv->pv_meta; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == PROMISE_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (meta->type == PROMISE_T_RAID1) { if (meta->array_width == 1) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (meta->type == PROMISE_T_RAID3) vol->v_raid_level = G_RAID_VOLUME_RL_RAID3; else if (meta->type == PROMISE_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; } else if (meta->type == PROMISE_T_SPAN) vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; else if (meta->type == PROMISE_T_JBOD) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ vol->v_disks_count = meta->total_disks; vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ if (meta->total_sectors_high < 256) /* If value looks sane. */ vol->v_mediasize += ((off_t)meta->total_sectors_high << 32) * 512; //ZZZ vol->v_sectorsize = 512 * meta->sector_size; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; sd->sd_offset = (((off_t)meta->disk_offset_high << 32) + meta->disk_offset) * 512; sd->sd_size = (((off_t)meta->disk_sectors_high << 32) + meta->disk_sectors) * 512; } g_raid_start_volume(vol); /* Make all disks found till the moment take their places. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i]->volume_id == meta->volume_id) g_raid_md_promise_start_disk(disk, i, vol); } } pv->pv_started = 1; callout_stop(&pv->pv_start_co); G_RAID_DEBUG1(0, sc, "Volume started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } static void g_raid_promise_go(void *arg) { struct g_raid_volume *vol; struct g_raid_softc *sc; struct g_raid_md_promise_pervolume *pv; vol = arg; pv = vol->v_md_data; sc = vol->v_softc; if (!pv->pv_started) { G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, G_RAID_EVENT_VOLUME); } } static void g_raid_md_promise_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct promise_raid_conf *pdmeta; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_raid_volume *vol; int i; char buf[33]; sc = disk->d_softc; md = sc->sc_md; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); g_raid_md_promise_refill(sc); return; } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. 
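   (Editorial aside: g_raid_md_promise_start() above recovers the volume
   geometry from a handful of metadata fields.  An invented restatement;
   sector_size counts 512-byte units and is clamped to 1..8 when read, so a
   4096-byte-sector volume stores 8 there:) */

#include <stdint.h>

struct pgeom {
	uint64_t strip, mediasize, sectorsize;
};

static struct pgeom
decode_promise_geom(uint8_t stripe_shift, uint32_t total_sectors,
    uint32_t total_sectors_high, uint8_t sector_size)
{
	struct pgeom g;

	g.strip = 512ULL << stripe_shift;
	g.mediasize = (uint64_t)total_sectors * 512;
	if (total_sectors_high < 256)		/* only if it looks sane */
		g.mediasize += ((uint64_t)total_sectors_high << 32) * 512;
	g.sectorsize = 512ULL * sector_size;
	return (g);
}

/* End of editorial sketch -- original code resumes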
*/ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) { promise_meta_get_name(pdmeta, buf); vol = g_raid_create_volume(sc, buf, pdmeta->array_number); pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); pv->pv_id = pdmeta->volume_id; vol->v_md_data = pv; callout_init(&pv->pv_start_co, 1); callout_reset(&pv->pv_start_co, g_raid_start_timeout * hz, g_raid_promise_go, vol); } else pv = vol->v_md_data; /* If we haven't started yet - check metadata freshness. */ if (pv->pv_meta == NULL || !pv->pv_started) { if (pv->pv_meta == NULL || ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = promise_meta_copy(pdmeta); pv->pv_generation = pv->pv_meta->generation; pv->pv_disks_present = 1; } else if (pdmeta->generation == pv->pv_generation) { pv->pv_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", pv->pv_disks_present, pv->pv_meta->total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } } } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. */ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) continue; pv = vol->v_md_data; if (pv->pv_started) { if (g_raid_md_promise_start_disk(disk, i, vol)) g_raid_md_write_promise(md, vol, NULL, NULL); } else { /* If we collected all needed disks - start array. */ if (pv->pv_disks_present == pv->pv_meta->total_disks) g_raid_md_promise_start(vol); } } } static int g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_geom *geom; struct g_raid_softc *sc; /* Search for existing node. */ LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } if (geom != NULL) { *gp = geom; return (G_RAID_MD_TASTE_EXISTING); } /* Create new one if not found. */ sc = g_raid_create_node(mp, "Promise", md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_softc *sc; struct g_raid_disk *disk; struct promise_raid_conf *meta, *metaarr[4]; struct g_raid_md_promise_perdisk *pd; struct g_geom *geom; int i, j, result, len, subdisks; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name); pp = cp->provider; /* Read metadata from device. */ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); subdisks = promise_meta_read(cp, metaarr); g_topology_lock(); if (subdisks == 0) { if (g_raid_aggressive_spare) { if (vendor == 0x105a || vendor == 0x1002) { G_RAID_DEBUG(1, "No Promise metadata, forcing spare."); goto search; } else { G_RAID_DEBUG(1, "Promise/ATI vendor mismatch " "0x%04x != 0x105a/0x1002", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Metadata valid. Print it. */ for (i = 0; i < subdisks; i++) g_raid_md_promise_print(metaarr[i]); /* Purge meaningless (empty/spare) records. 
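   (Editorial note: the freshness test above, (int16_t)(a - b) > 0, is
   wraparound-safe serial-number arithmetic on the 16-bit generation counter;
   it stays correct as long as the two counters are within 32767 steps of
   each other.  A self-checking restatement with an invented name:) */

#include <assert.h>
#include <stdint.h>

static int
gen_is_newer(uint16_t a, uint16_t b)
{
	return ((int16_t)(a - b) > 0);
}

int
main(void)
{
	assert(gen_is_newer(5, 4));	/* the ordinary case */
	assert(gen_is_newer(2, 65530));	/* still newer across the wrap */
	assert(!gen_is_newer(65530, 2));
	return (0);
}

/* End of editorial sketch -- original code resumes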
*/ for (i = 0; i < subdisks; ) { if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) { i++; continue; } free(metaarr[i], M_MD_PROMISE); for (j = i; j < subdisks - 1; j++) metaarr[i] = metaarr[j + 1]; metaarr[subdisks - 1] = NULL; subdisks--; } search: /* Search for matching node. */ sc = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; snprintf(name, sizeof(name), "Promise"); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); pd->pd_subdisks = subdisks; for (i = 0; i < subdisks; i++) pd->pd_meta[i] = metaarr[i]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_promise_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); } static int g_raid_md_event_promise(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = md->mdo_softc; if (disk == NULL) return (-1); switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* Delete disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to all disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left. 
*/ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (0); } return (-2); } static int g_raid_md_volume_event_promise(struct g_raid_md_object *md, struct g_raid_volume *vol, u_int event) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; switch (event) { case G_RAID_VOLUME_E_STARTMD: if (!pv->pv_started) g_raid_md_promise_start(vol); return (0); } return (-2); } static int g_raid_md_ctl_promise(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS]; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t esize, offs[PROMISE_MAX_DISKS], size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual; int error; sc = md->mdo_softc; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_promise_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. 
*/ size = INT64_MAX; sectorsize = 0; bzero(disks, sizeof(disks)); bzero(offs, sizeof(offs)); for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) continue; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk != NULL) { if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' is in a " "wrong state (%s).", diskname, g_raid_disk_state2str(disk->d_state)); error = -7; break; } pd = disk->d_md_data; if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) { gctl_error(req, "Disk '%s' already " "used by %d volumes.", diskname, pd->pd_subdisks); error = -7; break; } pp = disk->d_consumer->provider; disks[i] = disk; promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, pp->mediasize / pp->sectorsize, &offs[i], &esize); size = MIN(size, (off_t)esize * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); continue; } g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -8; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; disks[i] = disk; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Reserve some space for metadata. */ size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); } if (error != 0) { for (i = 0; i < numdisks; i++) { if (disks[i] != NULL && disks[i]->d_state == G_RAID_DISK_S_NONE) g_raid_destroy_disk(disks[i]); } return (error); } if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1 || level == G_RAID_VOLUME_RL_SINGLE || level == G_RAID_VOLUME_RL_CONCAT) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... 
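   (Editorial aside: the per-disk sizing above relies on
   promise_meta_unused_range() to find the largest extent not claimed by an
   existing subdisk.  A simplified sketch of that idea; unlike the original,
   it assumes the extents are already sorted and non-overlapping, and the
   names are invented.  The last 131072 sectors stay reserved for metadata:) */

#include <stdint.h>

/* ext[i][0] = start sector, ext[i][1] = length in sectors, sorted by start. */
static int64_t
largest_gap(const int64_t ext[][2], int n, int64_t sectors, int64_t *off)
{
	int64_t best = 0, coff = 0, csize;
	int i;

	sectors -= 131072;			/* metadata reservation */
	for (i = 0; i <= n; i++) {
		int64_t end = (i < n) ? ext[i][0] : sectors;

		csize = end - coff;
		if (csize > best) {
			best = csize;
			*off = coff;
		}
		if (i < n)
			coff = ext[i][0] + ext[i][1];
	}
	return (best);
}

/* End of editorial sketch -- original code resumes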
*/ pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0); pv->pv_generation = 0; pv->pv_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = disks[i]; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = (off_t)offs[i] * 512; sd->sd_size = size; if (disk == NULL) continue; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { gctl_error(req, "`add` command is not applicable, " "use `label` instead."); return (-99); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. 
*/ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_promise_purge_disks(sc); g_raid_md_write_promise(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_promise(md, NULL, disk); continue; } /* Erase metadata on deleting disk and destroy it. */ promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); } g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to remaining disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); promise_meta_write_spare(cp); g_raid_md_promise_refill(sc); } return (error); } return (-100); } static int g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t rebuild_lba64; int i, j, pos, rebuild; sc = md->mdo_softc; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Generate new per-volume metadata for affected volumes. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; /* Skip volumes not related to specified targets. 
*/ if (tvol != NULL && vol != tvol) continue; if (tsd != NULL && vol != tsd->sd_volume) continue; if (tdisk != NULL) { for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_disk == tdisk) break; } if (i >= vol->v_disks_count) continue; } pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; pv->pv_generation++; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); if (pv->pv_meta != NULL) memcpy(meta, pv->pv_meta, sizeof(*meta)); memcpy(meta->promise_id, PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->generation = pv->pv_generation; meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE | PROMISE_S_INITED | PROMISE_S_READY; if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) meta->status |= PROMISE_S_DEGRADED; if (vol->v_dirty) meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = PROMISE_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = PROMISE_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) meta->type = PROMISE_T_RAID3; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->type = PROMISE_T_RAID5; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) meta->type = PROMISE_T_SPAN; else meta->type = PROMISE_T_JBOD; meta->total_disks = vol->v_disks_count; meta->stripe_shift = ffs(vol->v_strip_size / 1024); meta->array_width = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->array_width /= 2; meta->array_number = vol->v_global_id; meta->total_sectors = vol->v_mediasize / 512; meta->total_sectors_high = (vol->v_mediasize / 512) >> 32; meta->sector_size = vol->v_sectorsize / 512; meta->cylinders = meta->total_sectors / (255 * 63) - 1; meta->heads = 254; meta->sectors = 63; meta->volume_id = pv->pv_id; rebuild_lba64 = UINT64_MAX; rebuild = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); meta->disks[pos].flags = PROMISE_F_VALID | PROMISE_F_ASSIGNED; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) { meta->disks[pos].flags |= 0; } else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) { meta->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) { meta->disks[pos].flags |= PROMISE_F_ONLINE | PROMISE_F_REDIR; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; rebuild = 1; } else { meta->disks[pos].flags |= PROMISE_F_ONLINE; if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) { meta->status |= PROMISE_S_MARKED; if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; } } if (pv->pv_meta != NULL) { meta->disks[pos].id = pv->pv_meta->disks[pos].id; } else { meta->disks[pos].number = i * 2; arc4rand(&meta->disks[pos].id, sizeof(meta->disks[pos].id), 0); } } promise_meta_put_name(meta, vol->v_name); /* Try to mimic AMD BIOS rebuild/resync behavior. */ if (rebuild_lba64 != UINT64_MAX) { if (rebuild) meta->magic_3 = 0x03040010UL; /* Rebuild? */ else meta->magic_3 = 0x03040008UL; /* Resync? */ /* Translate from per-disk to per-volume LBA. 
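   (Editorial note: the translation below scales the smallest per-disk rebuild
   position by the number of data disks, since Promise stores rebuild_lba64 in
   volume LBAs.  A restatement under invented names; array_width here is the
   value computed above, i.e. already halved for mirrored levels:) */

#include <stdint.h>

static uint64_t
volume_rebuild_lba(uint64_t disk_lba, int is_mirror, int is_parity,
    unsigned array_width)
{
	if (is_mirror)				/* RAID1 / RAID1E */
		return (disk_lba * array_width);
	if (is_parity)				/* RAID3 / RAID5 */
		return (disk_lba * (array_width - 1));
	return (0);				/* other levels: not tracked */
}

/* End of editorial sketch -- original code resumes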
*/ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { rebuild_lba64 *= meta->array_width; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) { rebuild_lba64 *= meta->array_width - 1; } else rebuild_lba64 = 0; } else meta->magic_3 = 0x03000000UL; meta->rebuild_lba64 = rebuild_lba64; meta->magic_4 = 0x04010101UL; /* Replace per-volume metadata with new. */ if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = meta; /* Copy new metadata to the disks, adding or replacing old. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; disk = sd->sd_disk; if (disk == NULL) continue; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (j = 0; j < pd->pd_subdisks; j++) { if (pd->pd_meta[j]->volume_id == meta->volume_id) break; } if (j == pd->pd_subdisks) pd->pd_subdisks++; if (pd->pd_meta[j] != NULL) free(pd->pd_meta[j], M_MD_PROMISE); pd->pd_meta[j] = promise_meta_copy(meta); pd->pd_meta[j]->disk = meta->disks[pos]; pd->pd_meta[j]->disk.number = pos; pd->pd_meta[j]->disk_offset_high = (sd->sd_offset / 512) >> 32; pd->pd_meta[j]->disk_offset = sd->sd_offset / 512; pd->pd_meta[j]->disk_sectors_high = (sd->sd_size / 512) >> 32; pd->pd_meta[j]->disk_sectors = sd->sd_size / 512; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { pd->pd_meta[j]->disk_rebuild_high = (sd->sd_rebuild_pos / 512) >> 32; pd->pd_meta[j]->disk_rebuild = sd->sd_rebuild_pos / 512; } else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD) { pd->pd_meta[j]->disk_rebuild_high = 0; pd->pd_meta[j]->disk_rebuild = 0; } else { pd->pd_meta[j]->disk_rebuild_high = UINT32_MAX; pd->pd_meta[j]->disk_rebuild = UINT32_MAX; } pd->pd_updated = 1; } } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (!pd->pd_updated) continue; G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(disk)); for (i = 0; i < pd->pd_subdisks; i++) g_raid_md_promise_print(pd->pd_meta[i]); promise_meta_write(disk->d_consumer, pd->pd_meta, pd->pd_subdisks); pd->pd_updated = 0; } return (0); } static int g_raid_md_fail_disk_promise(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_promise_perdisk *pd; struct g_raid_subdisk *sd; int i, pos; sc = md->mdo_softc; pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (tdisk->d_state != G_RAID_DISK_S_ACTIVE) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL) G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(tdisk)); for (i = 0; i < pd->pd_subdisks; i++) { pd->pd_meta[i]->disk.flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; pos = pd->pd_meta[i]->disk.number; if (pos >= 0 && pos < PROMISE_MAX_DISKS) { pd->pd_meta[i]->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } g_raid_md_promise_print(pd->pd_meta[i]); } if (tdisk->d_consumer != NULL) promise_meta_write(tdisk->d_consumer, pd->pd_meta, pd->pd_subdisks); /* Change states. 
*/ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_promise(md, NULL, NULL, tdisk); g_raid_md_promise_refill(sc); return (0); } static int g_raid_md_free_disk_promise(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_promise_perdisk *pd; int i; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i] != NULL) { free(pd->pd_meta[i], M_MD_PROMISE); pd->pd_meta[i] = NULL; } } free(pd, M_MD_PROMISE); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_promise(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; if (pv && pv->pv_meta != NULL) { free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = NULL; } if (pv && !pv->pv_started) { pv->pv_started = 1; callout_stop(&pv->pv_start_co); } free(pv, M_MD_PROMISE); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_promise(struct g_raid_md_object *md) { return (0); } G_RAID_MD_DECLARE(promise, "Promise"); Index: head/sys/geom/raid/md_sii.c =================================================================== --- head/sys/geom/raid/md_sii.c (revision 326269) +++ head/sys/geom/raid/md_sii.c (revision 326270) @@ -1,1671 +1,1673 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_SII, "md_sii_data", "GEOM_RAID SiI metadata"); struct sii_raid_conf { uint16_t ata_params_00_53[54]; uint64_t total_sectors; /* 54 - 57 */ uint16_t ata_params_58_81[72]; uint16_t product_id; /* 130 */ uint16_t vendor_id; /* 131 */ uint16_t version_minor; /* 132 */ uint16_t version_major; /* 133 */ uint8_t timestamp[6]; /* 134 - 136 */ uint16_t strip_sectors; /* 137 */ uint16_t dummy_2; uint8_t disk_number; /* 139 */ uint8_t type; #define SII_T_RAID0 0x00 #define SII_T_RAID1 0x01 #define SII_T_RAID01 0x02 #define SII_T_SPARE 0x03 #define SII_T_CONCAT 0x04 #define SII_T_RAID5 0x10 #define SII_T_RESERVED 0xfd #define SII_T_JBOD 0xff uint8_t raid0_disks; /* 140 */ uint8_t raid0_ident; uint8_t raid1_disks; /* 141 */ uint8_t raid1_ident; uint64_t rebuild_lba; /* 142 - 145 */ uint32_t generation; /* 146 - 147 */ uint8_t disk_status; /* 148 */ #define SII_S_CURRENT 0x01 #define SII_S_REBUILD 0x02 #define SII_S_DROPPED 0x03 #define SII_S_REMOVED 0x04 uint8_t raid_status; #define SII_S_ONLINE 0x01 #define SII_S_AVAILABLE 0x02 uint8_t raid_location; /* 149 */ uint8_t disk_location; uint8_t auto_rebuild; /* 150 */ #define SII_R_REBUILD 0x00 #define SII_R_NOREBUILD 0xff uint8_t dummy_3; uint8_t name[16]; /* 151 - 158 */ uint16_t checksum; /* 159 */ uint16_t ata_params_160_255[96]; } __packed; struct g_raid_md_sii_perdisk { struct sii_raid_conf *pd_meta; int pd_disk_pos; off_t pd_disk_size; }; struct g_raid_md_sii_object { struct g_raid_md_object mdio_base; uint8_t mdio_timestamp[6]; uint8_t mdio_location; uint32_t mdio_generation; struct sii_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
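[Editorial sketch] For reference, the packed structure above must overlay exactly one 512-byte anchor sector, with the RAID fields sitting at 16-bit words 130..159. A compile-time check of the same layout; field names are copied from the driver, and a C11 compiler with a GCC/Clang-style packed attribute is assumed:

#include <stddef.h>
#include <stdint.h>

struct sii_layout_check {
	uint16_t ata_params_00_53[54];
	uint64_t total_sectors;			/* words 54-57 */
	uint16_t ata_params_58_81[72];		/* words 58-129 */
	uint16_t product_id;			/* word 130 */
	uint16_t vendor_id;			/* word 131 */
	uint16_t version_minor;			/* word 132 */
	uint16_t version_major;			/* word 133 */
	uint8_t timestamp[6];			/* words 134-136 */
	uint16_t strip_sectors;			/* word 137 */
	uint16_t dummy_2;			/* word 138 */
	uint8_t disk_number, type;		/* word 139 */
	uint8_t raid0_disks, raid0_ident;	/* word 140 */
	uint8_t raid1_disks, raid1_ident;	/* word 141 */
	uint64_t rebuild_lba;			/* words 142-145 */
	uint32_t generation;			/* words 146-147 */
	uint8_t disk_status, raid_status;	/* word 148 */
	uint8_t raid_location, disk_location;	/* word 149 */
	uint8_t auto_rebuild, dummy_3;		/* word 150 */
	uint8_t name[16];			/* words 151-158 */
	uint16_t checksum;			/* word 159 */
	uint16_t ata_params_160_255[96];
} __attribute__((__packed__));

_Static_assert(sizeof(struct sii_layout_check) == 512,
    "must cover exactly one 512-byte sector");
_Static_assert(offsetof(struct sii_layout_check, vendor_id) == 131 * 2,
    "vendor ID at word 131");
_Static_assert(offsetof(struct sii_layout_check, checksum) == 159 * 2,
    "checksum at word 159");

int
main(void)
{
	return (0);
}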
*/ }; static g_raid_md_create_t g_raid_md_create_sii; static g_raid_md_taste_t g_raid_md_taste_sii; static g_raid_md_event_t g_raid_md_event_sii; static g_raid_md_ctl_t g_raid_md_ctl_sii; static g_raid_md_write_t g_raid_md_write_sii; static g_raid_md_fail_disk_t g_raid_md_fail_disk_sii; static g_raid_md_free_disk_t g_raid_md_free_disk_sii; static g_raid_md_free_t g_raid_md_free_sii; static kobj_method_t g_raid_md_sii_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_sii), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_sii), KOBJMETHOD(g_raid_md_event, g_raid_md_event_sii), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_sii), KOBJMETHOD(g_raid_md_write, g_raid_md_write_sii), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_sii), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_sii), KOBJMETHOD(g_raid_md_free, g_raid_md_free_sii), { 0, 0 } }; static struct g_raid_md_class g_raid_md_sii_class = { "SiI", g_raid_md_sii_methods, sizeof(struct g_raid_md_sii_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_sii_print(struct sii_raid_conf *meta) { if (g_raid_debug < 1) return; printf("********* ATA SiI RAID Metadata *********\n"); printf("total_sectors %llu\n", (long long unsigned)meta->total_sectors); printf("product_id 0x%04x\n", meta->product_id); printf("vendor_id 0x%04x\n", meta->vendor_id); printf("version_minor 0x%04x\n", meta->version_minor); printf("version_major 0x%04x\n", meta->version_major); printf("timestamp 0x%02x%02x%02x%02x%02x%02x\n", meta->timestamp[5], meta->timestamp[4], meta->timestamp[3], meta->timestamp[2], meta->timestamp[1], meta->timestamp[0]); printf("strip_sectors %d\n", meta->strip_sectors); printf("disk_number %d\n", meta->disk_number); printf("type 0x%02x\n", meta->type); printf("raid0_disks %d\n", meta->raid0_disks); printf("raid0_ident %d\n", meta->raid0_ident); printf("raid1_disks %d\n", meta->raid1_disks); printf("raid1_ident %d\n", meta->raid1_ident); printf("rebuild_lba %llu\n", (long long unsigned)meta->rebuild_lba); printf("generation %d\n", meta->generation); printf("disk_status %d\n", meta->disk_status); printf("raid_status %d\n", meta->raid_status); printf("raid_location %d\n", meta->raid_location); printf("disk_location %d\n", meta->disk_location); printf("auto_rebuild %d\n", meta->auto_rebuild); printf("name <%.16s>\n", meta->name); printf("checksum 0x%04x\n", meta->checksum); printf("=================================================\n"); } static struct sii_raid_conf * sii_meta_copy(struct sii_raid_conf *meta) { struct sii_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int sii_meta_total_disks(struct sii_raid_conf *meta) { switch (meta->type) { case SII_T_RAID0: case SII_T_RAID5: case SII_T_CONCAT: return (meta->raid0_disks); case SII_T_RAID1: return (meta->raid1_disks); case SII_T_RAID01: return (meta->raid0_disks * meta->raid1_disks); case SII_T_SPARE: case SII_T_JBOD: return (1); } return (0); } static int sii_meta_disk_pos(struct sii_raid_conf *meta, struct sii_raid_conf *pdmeta) { if (pdmeta->type == SII_T_SPARE) return (-3); if (memcmp(&meta->timestamp, &pdmeta->timestamp, 6) != 0) return (-1); switch (pdmeta->type) { case SII_T_RAID0: case SII_T_RAID1: case SII_T_RAID5: case SII_T_CONCAT: return (pdmeta->disk_number); case SII_T_RAID01: return (pdmeta->raid1_ident * pdmeta->raid1_disks + pdmeta->raid0_ident); case SII_T_JBOD: return (0); } return (-1); } static void sii_meta_get_name(struct sii_raid_conf *meta, char *buf) { int i; 
strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void sii_meta_put_name(struct sii_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static struct sii_raid_conf * sii_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct sii_raid_conf *meta; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct sii_raid_conf *)buf; /* Check vendor ID. */ if (meta->vendor_id != 0x1095) { G_RAID_DEBUG(1, "SiI vendor ID check failed on %s (0x%04x)", pp->name, meta->vendor_id); g_free(buf); return (NULL); } /* Check metadata major version. */ if (meta->version_major != 2) { G_RAID_DEBUG(1, "SiI version check failed on %s (%d.%d)", pp->name, meta->version_major, meta->version_minor); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i <= 159; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "SiI checksum check failed on %s", pp->name); free(meta, M_MD_SII); return (NULL); } /* Check raid type. */ if (meta->type != SII_T_RAID0 && meta->type != SII_T_RAID1 && meta->type != SII_T_RAID01 && meta->type != SII_T_SPARE && meta->type != SII_T_RAID5 && meta->type != SII_T_CONCAT && meta->type != SII_T_JBOD) { G_RAID_DEBUG(1, "SiI unknown RAID level on %s (0x%02x)", pp->name, meta->type); free(meta, M_MD_SII); return (NULL); } return (meta); } static int sii_meta_write(struct g_consumer *cp, struct sii_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 159; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); /* Write 4 copies of metadata. */ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); break; } } free(buf, M_MD_SII); return (error); } static int sii_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, i; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); /* Write 4 copies of metadata. 
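[Editorial sketch] The checksum scheme used by sii_meta_read() and sii_meta_write() above: the anchor is treated as 16-bit words; the reader sums words 0..159 and expects zero, while the writer zeroes the checksum field, sums the first 159 words, and stores the negated sum in word 159. A standalone model:

#include <stdint.h>
#include <stdio.h>

static void
sii_seal(uint16_t w[160])
{
	uint16_t sum = 0;
	int i;

	w[159] = 0;			/* checksum field, word 159 */
	for (i = 0; i < 159; i++)
		sum += w[i];
	w[159] = (uint16_t)-sum;	/* words 0..159 now sum to zero */
}

static int
sii_verify(const uint16_t w[160])
{
	uint16_t sum = 0;
	int i;

	for (i = 0; i < 160; i++)
		sum += w[i];
	return (sum == 0);
}

int
main(void)
{
	/* Hypothetical sector: vendor word 131 = 0x1095, major version 2. */
	uint16_t w[160] = { [131] = 0x1095, [133] = 2 };

	sii_seal(w);
	printf("verify: %d\n", sii_verify(w));	/* prints 1 */
	return (0);
}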
*/ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_SII); return (error); } static int sii_meta_write_spare(struct g_consumer *cp) { struct sii_raid_conf *meta; int error; meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); meta->total_sectors = cp->provider->mediasize / cp->provider->sectorsize - 0x800; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; meta->timestamp[0] = arc4random(); meta->timestamp[1] = arc4random(); meta->timestamp[2] = arc4random(); meta->timestamp[3] = arc4random(); meta->timestamp[4] = arc4random(); meta->timestamp[5] = arc4random(); meta->type = SII_T_SPARE; meta->generation = 1; meta->raid1_ident = 0xff; meta->raid_location = arc4random(); error = sii_meta_write(cp, meta); free(meta, M_MD_SII); return (error); } static struct g_raid_disk * g_raid_md_sii_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_sii_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_sii_supported(int level, int qual, int disks, int force) { if (disks > 8) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks < 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LS) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_sii_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd, *oldpd; struct sii_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) disk_pos = sii_meta_disk_pos(meta, pd->pd_meta); else disk_pos = -3; if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. 
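[Editorial sketch] Both the erase loop above and sii_meta_write() address the four metadata copies at mediasize - sectorsize * (1 + 0x200 * i), that is, spaced 0x200 sectors apart counting back from the last sector; together they occupy the 0x800-sector tail that sii_meta_write_spare() subtracts from total_sectors. A sketch of the resulting offsets, with a hypothetical disk size:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t mediasize = 500107862016ULL;	/* hypothetical 500 GB disk */
	uint64_t ss = 512;
	int i;

	for (i = 0; i < 4; i++) {
		uint64_t off = mediasize - ss * (1 + 0x200 * i);

		/* With 512-byte sectors the copies sit 256 KiB apart. */
		printf("copy %d: byte offset %ju (LBA %ju)\n", i,
		    (uintmax_t)off, (uintmax_t)(off / ss));
	}
	return (0);
}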
*/ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (disk_pos == -3 || pd->pd_disk_pos == -3) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_sii_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (pd->pd_meta->disk_status == SII_S_CURRENT || pd->pd_meta->disk_status == SII_S_REBUILD) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes, * in concat mode. Update from real disk size. */ if (meta->type == SII_T_CONCAT || meta->type == SII_T_JBOD) sd->sd_size = pd->pd_disk_size - 0x800 * 512; if (resurrection) { /* New or ex-spare disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta->disk_status == SII_S_REBUILD) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta->generation == meta->generation) sd->sd_rebuild_pos = pd->pd_meta->rebuild_lba * 512; else sd->sd_rebuild_pos = 0; } else if (pd->pd_meta->disk_status == SII_S_CURRENT) { if (pd->pd_meta->raid_status == SII_S_ONLINE || pd->pd_meta->generation != meta->generation) { /* Dirty or resyncing disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. 
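[Editorial sketch] When adopting a replacement disk, the loop above rejects the newcomer unless every subdisk the old disk carried still fits, leaving one 512-byte sector of headroom for the metadata anchor. The predicate in isolation; the arrays are hypothetical stand-ins for the sd_offset/sd_size walk:

#include <stdint.h>

static int
disk_fits(uint64_t disk_size, const uint64_t *offsets,
    const uint64_t *sizes, int nsd)
{
	int i;

	for (i = 0; i < nsd; i++) {
		if (offsets[i] + sizes[i] + 512 > disk_size)
			return (0);	/* candidate disk too small */
	}
	return (1);
}

int
main(void)
{
	uint64_t offs[1] = { 0 };
	uint64_t sizes[1] = { 1000000 };

	return (!disk_fits(1000512, offs, sizes, 1));	/* exit 0: fits */
}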
*/ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_sii_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_SII); } static void g_raid_md_sii_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; update = 0; do { /* Make sure we don't miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to make use of some STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to make use of some SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_sii(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste, hoping to find a spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_SII, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_sii_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_sii_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct sii_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *best; off_t size; int j, disk_pos; uint32_t gendiff, bestgendiff; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks.
*/ sii_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); vol->v_mediasize = (off_t)meta->total_sectors * 512; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == SII_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; size = vol->v_mediasize / mdi->mdio_total_disks; } else if (meta->type == SII_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; size = vol->v_mediasize; } else if (meta->type == SII_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; size = vol->v_mediasize / (mdi->mdio_total_disks / 2); } else if (meta->type == SII_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; size = 0; } else if (meta->type == SII_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LS; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else if (meta->type == SII_T_JBOD) { vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; size = 0; } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; size = 0; } vol->v_strip_size = meta->strip_sectors * 512; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = 0; sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* * Make all disks found till the moment take their places * in order of their generation numbers. */ do { best = NULL; bestgendiff = 0xffffffff; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_RAID_DISK_S_NONE) continue; pd = disk->d_md_data; if (pd->pd_meta == NULL) gendiff = 0xfffffffe; else gendiff = meta->generation - pd->pd_meta->generation; if (gendiff < bestgendiff) { best = disk; bestgendiff = gendiff; } } if (best != NULL) g_raid_md_sii_start_disk(best); } while (best != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. 
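[Editorial sketch] g_raid_md_sii_start() above sizes each subdisk from the volume mediasize according to the personality; CONCAT and JBOD are left at zero and later refreshed from the individual disk sizes. The mapping restated as a function, using the SII_T_* values from the structure definition:

#include <stdint.h>

#define	SII_T_RAID0	0x00
#define	SII_T_RAID1	0x01
#define	SII_T_RAID01	0x02
#define	SII_T_RAID5	0x10

static int64_t
sii_subdisk_size(uint8_t type, int64_t mediasize, int total_disks)
{
	switch (type) {
	case SII_T_RAID0:	return (mediasize / total_disks);
	case SII_T_RAID1:	return (mediasize);
	case SII_T_RAID01:	return (mediasize / (total_disks / 2));
	case SII_T_RAID5:	return (mediasize / (total_disks - 1));
	default:		return (0);	/* CONCAT/JBOD/unknown */
	}
}

int
main(void)
{
	/* Hypothetical 1 TB RAID5 volume on 5 disks: each subdisk
	 * carries a quarter of the volume. */
	return (sii_subdisk_size(SII_T_RAID5, 1000000000000LL, 5) !=
	    250000000000LL);
}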
*/ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_sii_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct sii_raid_conf *pdmeta; struct g_raid_md_sii_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_sii_start_disk(disk)) g_raid_md_write_sii(md, NULL, NULL, NULL); } else { if (mdi->mdio_meta == NULL || ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = sii_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_total_disks = sii_meta_total_disks(pdmeta); mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks) g_raid_md_sii_start(sc); } } static void g_raid_sii_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_sii_object *mdi; char name[32]; mdi = (struct g_raid_md_sii_object *)md; mdi->mdio_timestamp[5] = arc4random(); mdi->mdio_timestamp[4] = arc4random(); mdi->mdio_timestamp[3] = arc4random(); mdi->mdio_timestamp[2] = arc4random(); mdi->mdio_timestamp[1] = arc4random(); mdi->mdio_timestamp[0] = arc4random(); mdi->mdio_location = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_sii_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct sii_raid_conf *meta; struct g_raid_md_sii_perdisk *pd; struct g_geom *geom; int disk_pos, result, spare, len; char name[32]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting SiI on %s", cp->provider->name); mdi = (struct g_raid_md_sii_object *)md; pp = cp->provider; /* Read metadata from device. 
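[Editorial sketch] g_raid_md_sii_new_disk() above decides whether a disk carries newer metadata with (int32_t)(pdmeta->generation - mdio_generation) > 0, a wrap-safe serial-number comparison rather than a plain greater-than. A standalone illustration:

#include <stdint.h>
#include <stdio.h>

static int
gen_is_newer(uint32_t a, uint32_t b)
{
	/* Serial-number comparison: valid while the two counters are
	 * within 2^31 of each other, even across a 32-bit wrap. */
	return ((int32_t)(a - b) > 0);
}

int
main(void)
{
	printf("%d\n", gen_is_newer(5, 3));		/* 1 */
	printf("%d\n", gen_is_newer(2, 0xfffffffeU));	/* 1: wrapped */
	printf("%d\n", gen_is_newer(0xfffffffeU, 2));	/* 0 */
	return (0);
}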
*/ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = sii_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x1095) { G_RAID_DEBUG(1, "No SiI metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "SiI vendor mismatch 0x%04x != 0x1095", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = sii_meta_disk_pos(meta, meta); if (disk_pos == -1) { G_RAID_DEBUG(1, "SiI disk position not found"); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_sii_print(meta); G_RAID_DEBUG(1, "SiI disk position %d", disk_pos); spare = (meta->type == SII_T_SPARE) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_sii_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_location == meta->raid_location && memcmp(&mdi1->mdio_timestamp, &meta->timestamp, 6) == 0) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; memcpy(&mdi->mdio_timestamp, &meta->timestamp, 6); mdi->mdio_location = meta->raid_location; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_sii_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-SiI"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; } else { pd->pd_disk_pos = -1; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_sii_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_SII); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_sii(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_sii_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. 
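[Editorial sketch] The taste path above reads the anchor from the last sector and gates on vendor 0x1095 and major version 2. A userspace reader applying the same gates; the FreeBSD-specific DIOCGMEDIASIZE/DIOCGSECTORSIZE ioctls are used, the device path is a placeholder, and a little-endian host is assumed since the driver reads the fields in native byte order:

#include <sys/types.h>
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	uint8_t buf[65536];
	off_t ms;
	u_int ss;
	uint16_t vendor, vmajor;
	int fd;

	fd = open(argc > 1 ? argv[1] : "/dev/ada0", O_RDONLY);
	if (fd < 0 || ioctl(fd, DIOCGMEDIASIZE, &ms) != 0 ||
	    ioctl(fd, DIOCGSECTORSIZE, &ss) != 0 || ss > sizeof(buf))
		return (1);
	if (pread(fd, buf, ss, ms - ss) != (ssize_t)ss)	/* anchor sector */
		return (1);
	memcpy(&vendor, buf + 131 * 2, 2);	/* vendor ID, word 131 */
	memcpy(&vmajor, buf + 133 * 2, 2);	/* major version, word 133 */
	printf("vendor 0x%04x, version %u: %s\n", vendor, vmajor,
	    (vendor == 0x1095 && vmajor == 2) ? "SiI metadata" : "no match");
	close(fd);
	return (0);
}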
*/ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_sii(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LS"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_sii_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= 0x800 * sectorsize; /* Handle size argument. 
*/ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. 
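[Editorial sketch] The strip handling above constrains the strip because the on-disk strip_sectors field is a 16-bit sector count, which is where the 65535-sector ceiling comes from; note that the too-big test in the code compares `strip`, still holding the 131072-byte default at that point, rather than the requested `*striparg`. The sketch below applies all three constraints to the requested value:

#include <stdint.h>

static int
strip_ok(int64_t strip, int64_t sectorsize)
{
	if (strip < sectorsize)
		return (0);		/* too small */
	if (strip % sectorsize != 0)
		return (0);		/* not a sector multiple */
	if (strip > 65535 * sectorsize)
		return (0);		/* will not fit in strip_sectors */
	return (1);
}

int
main(void)
{
	/* 128 KiB strip on 512-byte sectors: accepted. */
	return (!strip_ok(131072, 512));
}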
*/ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) sii_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_sii(md, NULL, disk); continue; } pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ sii_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { sii_meta_write_spare(cp); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. 
*/ if (update) g_raid_md_write_sii(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_sii(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct sii_raid_conf *meta; u_int i; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); if (mdi->mdio_meta) memcpy(meta, mdi->mdio_meta, sizeof(*meta)); meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; memcpy(&meta->timestamp, &mdi->mdio_timestamp, 6); meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) { meta->type = SII_T_RAID0; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { meta->type = SII_T_RAID1; meta->raid0_disks = 0xff; meta->raid1_disks = vol->v_disks_count; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { meta->type = SII_T_RAID01; meta->raid0_disks = vol->v_disks_count / 2; meta->raid1_disks = 2; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) { meta->type = SII_T_JBOD; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else { meta->type = SII_T_RAID5; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } meta->generation = mdi->mdio_generation; meta->raid_status = vol->v_dirty ? SII_S_ONLINE : SII_S_AVAILABLE; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_STALE || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) meta->raid_status = SII_S_ONLINE; } meta->raid_location = mdi->mdio_location; sii_meta_put_name(meta, vol->v_name); /* We are done. Print meta data and store them to disks. 
*/ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_SII); pd->pd_meta = NULL; } pd->pd_meta = sii_meta_copy(meta); if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { if (sd->sd_state < G_RAID_SUBDISK_S_NEW) pd->pd_meta->disk_status = SII_S_DROPPED; else if (sd->sd_state < G_RAID_SUBDISK_S_STALE) { pd->pd_meta->disk_status = SII_S_REBUILD; pd->pd_meta->rebuild_lba = sd->sd_rebuild_pos / vol->v_sectorsize; } else pd->pd_meta->disk_status = SII_S_CURRENT; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0xff; pd->pd_meta->raid1_ident = 0; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { pd->pd_meta->disk_number = sd->sd_pos / meta->raid1_disks; pd->pd_meta->raid0_ident = sd->sd_pos % meta->raid1_disks; pd->pd_meta->raid1_ident = sd->sd_pos / meta->raid1_disks; } else { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0; pd->pd_meta->raid1_ident = 0xff; } } G_RAID_DEBUG(1, "Writing SiI metadata to %s", g_raid_get_diskname(disk)); g_raid_md_sii_print(pd->pd_meta); sii_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_sii(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_sii_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_sii_perdisk *)tdisk->d_md_data; /* We can't fail a disk that is not part of the array now. */ if (pd->pd_disk_pos < 0) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent its later resurrection as STALE. */ if (tdisk->d_consumer) { if (pd->pd_meta) { pd->pd_meta->disk_status = SII_S_REMOVED; sii_meta_write(tdisk->d_consumer, pd->pd_meta); } else sii_meta_erase(tdisk->d_consumer); } /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_sii(md, NULL, NULL, tdisk); /* Check if anything left except placeholders.
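[Editorial sketch] The per-disk fields written above split each RAID1E subdisk position across raid1_ident/raid0_ident with raid1_disks = 2, and sii_meta_disk_pos() earlier in this file reconstructs the position as raid1_ident * raid1_disks + raid0_ident. A quick round-trip check of that encoding:

#include <assert.h>

int
main(void)
{
	int raid1_disks = 2, pos;

	for (pos = 0; pos < 8; pos++) {
		int raid0_ident = pos % raid1_disks;
		int raid1_ident = pos / raid1_disks;

		/* The reconstruction must recover the original index. */
		assert(raid1_ident * raid1_disks + raid0_ident == pos);
	}
	return (0);
}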
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (0); } static int g_raid_md_free_disk_sii(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_sii_perdisk *pd; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_SII); pd->pd_meta = NULL; } free(pd, M_MD_SII); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_sii(struct g_raid_md_object *md) { struct g_raid_md_sii_object *mdi; mdi = (struct g_raid_md_sii_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(sii, "SiI"); Index: head/sys/geom/raid/tr_concat.c =================================================================== --- head/sys/geom/raid/tr_concat.c (revision 326269) +++ head/sys/geom/raid/tr_concat.c (revision 326270) @@ -1,353 +1,355 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_TR_CONCAT, "tr_concat_data", "GEOM_RAID CONCAT data"); struct g_raid_tr_concat_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopped; }; static g_raid_tr_taste_t g_raid_tr_taste_concat; static g_raid_tr_event_t g_raid_tr_event_concat; static g_raid_tr_start_t g_raid_tr_start_concat; static g_raid_tr_stop_t g_raid_tr_stop_concat; static g_raid_tr_iostart_t g_raid_tr_iostart_concat; static g_raid_tr_iodone_t g_raid_tr_iodone_concat; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_concat; static g_raid_tr_free_t g_raid_tr_free_concat; static kobj_method_t g_raid_tr_concat_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_concat), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_concat), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_concat), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_concat), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_concat), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_concat), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_concat), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_concat), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_concat_class = { "CONCAT", g_raid_tr_concat_methods, sizeof(struct g_raid_tr_concat_object), .trc_enable = 1, .trc_priority = 50, .trc_accept_unmapped = 1 }; static int g_raid_tr_taste_concat(struct g_raid_tr_object *tr, struct g_raid_volume *volume) { struct g_raid_tr_concat_object *trs; trs = (struct g_raid_tr_concat_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_SINGLE && tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_CONCAT && !(tr->tro_volume->v_disks_count == 1 && tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_UNKNOWN)) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_concat(struct g_raid_volume *vol) { struct g_raid_tr_concat_object *trs; struct g_raid_softc *sc; off_t size; u_int s; int i, n, f; sc = vol->v_softc; trs = (struct g_raid_tr_concat_object *)vol->v_tr; if (trs->trso_stopped) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED); if (n + f == vol->v_disks_count) { if (f == 0) s = G_RAID_VOLUME_S_OPTIMAL; else s = G_RAID_VOLUME_S_SUBOPTIMAL; } else s = G_RAID_VOLUME_S_BROKEN; } if (s != vol->v_state) { /* * Some metadata modules may not know CONCAT volume * mediasize until all disks connected. Recalculate. */ if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT && G_RAID_VOLUME_S_ALIVE(s) && !G_RAID_VOLUME_S_ALIVE(vol->v_state)) { size = 0; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) size += vol->v_subdisks[i].sd_size; } vol->v_mediasize = size; } g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 
G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopped) g_raid_write_metadata(sc, vol, NULL, NULL); } return (0); } static int g_raid_tr_event_concat(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { struct g_raid_tr_concat_object *trs; struct g_raid_softc *sc; struct g_raid_volume *vol; int state; trs = (struct g_raid_tr_concat_object *)tr; vol = tr->tro_volume; sc = vol->v_softc; state = sd->sd_state; if (state != G_RAID_SUBDISK_S_NONE && state != G_RAID_SUBDISK_S_FAILED && state != G_RAID_SUBDISK_S_ACTIVE) { G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, sd->sd_pos, g_raid_subdisk_state2str(sd->sd_state)); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } if (state != sd->sd_state && !trs->trso_starting && !trs->trso_stopped) g_raid_write_metadata(sc, vol, sd, NULL); g_raid_tr_update_state_concat(vol); return (0); } static int g_raid_tr_start_concat(struct g_raid_tr_object *tr) { struct g_raid_tr_concat_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_concat_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_concat(vol); return (0); } static int g_raid_tr_stop_concat(struct g_raid_tr_object *tr) { struct g_raid_tr_concat_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_concat_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopped = 1; g_raid_tr_update_state_concat(vol); return (0); } static void g_raid_tr_iostart_concat(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, length, remain; u_int no; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) { g_raid_iodone(bp, EIO); return; } if (bp->bio_cmd == BIO_FLUSH) { g_raid_tr_flush_common(tr, bp); return; } offset = bp->bio_offset; remain = bp->bio_length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; no = 0; while (no < vol->v_disks_count && offset >= vol->v_subdisks[no].sd_size) { offset -= vol->v_subdisks[no].sd_size; no++; } KASSERT(no < vol->v_disks_count, ("Request starts after volume end (%ju)", bp->bio_offset)); bioq_init(&queue); do { sd = &vol->v_subdisks[no]; length = MIN(sd->sd_size - offset, remain); cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0 && bp->bio_cmd != BIO_DELETE) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); remain -= length; if (bp->bio_cmd != BIO_DELETE) addr += length; offset = 0; no++; KASSERT(no < vol->v_disks_count || remain == 0, ("Request ends after volume end (%ju, %ju)", bp->bio_offset, bp->bio_length)); } while (remain > 0); while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static int g_raid_tr_kerneldump_concat(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t 
boffset, size_t blength) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; char *addr; off_t offset, length, remain; int error, no; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) return (ENXIO); offset = boffset; remain = blength; addr = virtual; no = 0; while (no < vol->v_disks_count && offset >= vol->v_subdisks[no].sd_size) { offset -= vol->v_subdisks[no].sd_size; no++; } KASSERT(no < vol->v_disks_count, ("Request starts after volume end (%ju)", boffset)); do { sd = &vol->v_subdisks[no]; length = MIN(sd->sd_size - offset, remain); error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no], addr, 0, offset, length); if (error != 0) return (error); remain -= length; addr += length; offset = 0; no++; KASSERT(no < vol->v_disks_count || remain == 0, ("Request ends after volume end (%ju, %zu)", boffset, blength)); } while (remain > 0); return (0); } static void g_raid_tr_iodone_concat(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd,struct bio *bp) { struct bio *pbp; pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, bp->bio_error); } } static int g_raid_tr_free_concat(struct g_raid_tr_object *tr) { return (0); } G_RAID_TR_DECLARE(concat, "CONCAT"); Index: head/sys/geom/raid/tr_raid0.c =================================================================== --- head/sys/geom/raid/tr_raid0.c (revision 326269) +++ head/sys/geom/raid/tr_raid0.c (revision 326270) @@ -1,335 +1,337 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
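[Editorial sketch] Both the CONCAT I/O start and kerneldump paths above use the same two-step mapping: walk the subdisk sizes to locate the starting disk, then split the request wherever it crosses a subdisk boundary. A userspace model with hypothetical sizes:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t sizes[3] = { 100, 200, 300 };	/* hypothetical subdisk sizes */
	uint64_t offset = 250, remain = 200;	/* hypothetical request */
	unsigned no = 0;

	while (offset >= sizes[no]) {		/* find the starting subdisk */
		offset -= sizes[no];
		no++;
	}
	while (remain > 0) {			/* split at subdisk boundaries */
		uint64_t length = sizes[no] - offset;

		if (length > remain)
			length = remain;
		/* Prints: subdisk 1 offset 150 length 50, then
		 * subdisk 2 offset 0 length 150. */
		printf("subdisk %u offset %ju length %ju\n", no,
		    (uintmax_t)offset, (uintmax_t)length);
		remain -= length;
		offset = 0;
		no++;
	}
	return (0);
}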
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_TR_RAID0, "tr_raid0_data", "GEOM_RAID RAID0 data"); struct g_raid_tr_raid0_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopped; }; static g_raid_tr_taste_t g_raid_tr_taste_raid0; static g_raid_tr_event_t g_raid_tr_event_raid0; static g_raid_tr_start_t g_raid_tr_start_raid0; static g_raid_tr_stop_t g_raid_tr_stop_raid0; static g_raid_tr_iostart_t g_raid_tr_iostart_raid0; static g_raid_tr_iodone_t g_raid_tr_iodone_raid0; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid0; static g_raid_tr_free_t g_raid_tr_free_raid0; static kobj_method_t g_raid_tr_raid0_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid0), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid0), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid0), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid0), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid0), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid0), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid0), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid0), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid0_class = { "RAID0", g_raid_tr_raid0_methods, sizeof(struct g_raid_tr_raid0_object), .trc_enable = 1, .trc_priority = 100, .trc_accept_unmapped = 1 }; static int g_raid_tr_taste_raid0(struct g_raid_tr_object *tr, struct g_raid_volume *volume) { struct g_raid_tr_raid0_object *trs; trs = (struct g_raid_tr_raid0_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID0 || tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid0(struct g_raid_volume *vol) { struct g_raid_tr_raid0_object *trs; struct g_raid_softc *sc; u_int s; int n, f; sc = vol->v_softc; trs = (struct g_raid_tr_raid0_object *)vol->v_tr; if (trs->trso_stopped) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED); if (n + f == vol->v_disks_count) { if (f == 0) s = G_RAID_VOLUME_S_OPTIMAL; else s = G_RAID_VOLUME_S_SUBOPTIMAL; } else s = G_RAID_VOLUME_S_BROKEN; } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 
G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopped) g_raid_write_metadata(sc, vol, NULL, NULL); } return (0); } static int g_raid_tr_event_raid0(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { struct g_raid_tr_raid0_object *trs; struct g_raid_softc *sc; struct g_raid_volume *vol; int state; trs = (struct g_raid_tr_raid0_object *)tr; vol = tr->tro_volume; sc = vol->v_softc; state = sd->sd_state; if (state != G_RAID_SUBDISK_S_NONE && state != G_RAID_SUBDISK_S_FAILED && state != G_RAID_SUBDISK_S_ACTIVE) { G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, sd->sd_pos, g_raid_subdisk_state2str(sd->sd_state)); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } if (state != sd->sd_state && !trs->trso_starting && !trs->trso_stopped) g_raid_write_metadata(sc, vol, sd, NULL); g_raid_tr_update_state_raid0(vol); return (0); } static int g_raid_tr_start_raid0(struct g_raid_tr_object *tr) { struct g_raid_tr_raid0_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid0_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid0(vol); return (0); } static int g_raid_tr_stop_raid0(struct g_raid_tr_object *tr) { struct g_raid_tr_raid0_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid0_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopped = 1; g_raid_tr_update_state_raid0(vol); return (0); } static void g_raid_tr_iostart_raid0(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, nstripe, remain; u_int no, strip_size; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) { g_raid_iodone(bp, EIO); return; } if (bp->bio_cmd == BIO_FLUSH) { g_raid_tr_flush_common(tr, bp); return; } if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; strip_size = vol->v_strip_size; /* Stripe number. */ nstripe = bp->bio_offset / strip_size; /* Start position in stripe. */ start = bp->bio_offset % strip_size; /* Disk number. */ no = nstripe % vol->v_disks_count; /* Stripe start position in disk. */ offset = (nstripe / vol->v_disks_count) * strip_size; /* Length of data to operate. 
*/ remain = bp->bio_length; bioq_init(&queue); do { length = MIN(strip_size - start, remain); cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0 && bp->bio_cmd != BIO_DELETE) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = &vol->v_subdisks[no]; bioq_insert_tail(&queue, cbp); if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } remain -= length; if (bp->bio_cmd != BIO_DELETE) addr += length; start = 0; } while (remain > 0); while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static int g_raid_tr_kerneldump_raid0(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t boffset, size_t blength) { struct g_raid_volume *vol; char *addr; off_t offset, start, length, nstripe, remain; u_int no, strip_size; int error; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) return (ENXIO); addr = virtual; strip_size = vol->v_strip_size; /* Stripe number. */ nstripe = boffset / strip_size; /* Start position in stripe. */ start = boffset % strip_size; /* Disk number. */ no = nstripe % vol->v_disks_count; /* Stripe start position in disk. */ offset = (nstripe / vol->v_disks_count) * strip_size; /* Length of data to operate. */ remain = blength; do { length = MIN(strip_size - start, remain); error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no], addr, 0, offset + start, length); if (error != 0) return (error); if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } remain -= length; addr += length; start = 0; } while (remain > 0); return (0); } static void g_raid_tr_iodone_raid0(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *pbp; pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, bp->bio_error); } } static int g_raid_tr_free_raid0(struct g_raid_tr_object *tr) { return (0); } G_RAID_TR_DECLARE(raid0, "RAID0"); Index: head/sys/geom/raid/tr_raid1.c =================================================================== --- head/sys/geom/raid/tr_raid1.c (revision 326269) +++ head/sys/geom/raid/tr_raid1.c (revision 326270) @@ -1,984 +1,986 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
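/*
 * Illustrative sketch (editor's example, not part of this revision): the
 * RAID0 iostart and kerneldump paths above share one address translation.
 * With strip size S and D disks, byte offset B lives in strip B / S, at
 * in-strip position B % S, on disk (B / S) % D, and the strip starts at
 * ((B / S) / D) * S on that disk. A stand-alone version (names hypothetical):
 */
#include <stdint.h>

static void
raid0_map(int64_t boffset, int64_t strip_size, int ndisks,
    int *disk, int64_t *doffset, int64_t *start)
{
	int64_t nstripe;

	nstripe = boffset / strip_size;		/* strip number */
	*start = boffset % strip_size;		/* position inside the strip */
	*disk = nstripe % ndisks;		/* disk holding the strip */
	*doffset = (nstripe / ndisks) * strip_size; /* strip start on disk */
}

/*
 * Example: 64K strips on 4 disks, boffset = 300K: strip 4, start 44K,
 * disk 0, on-disk offset 64K; the split loop then advances disk by disk,
 * moving the on-disk offset forward one strip per full revolution.
 */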
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" SYSCTL_DECL(_kern_geom_raid_raid1); #define RAID1_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN, &g_raid1_rebuild_slab, 0, "Amount of the disk to rebuild each read/write cycle of the rebuild."); #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN, &g_raid1_rebuild_fair_io, 0, "Fraction of the I/O bandwidth to use when disk busy for rebuild."); #define RAID1_REBUILD_CLUSTER_IDLE 100 static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN, &g_raid1_rebuild_cluster_idle, 0, "Number of slabs to do each time we trigger a rebuild cycle"); #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE; SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN, &g_raid1_rebuild_meta_update, 0, "When to update the meta data."); static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data"); #define TR_RAID1_NONE 0 #define TR_RAID1_REBUILD 1 #define TR_RAID1_RESYNC 2 #define TR_RAID1_F_DOING_SOME 0x1 #define TR_RAID1_F_LOCKED 0x2 #define TR_RAID1_F_ABORT 0x4 struct g_raid_tr_raid1_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopping; int trso_type; int trso_recover_slabs; /* slabs before rest */ int trso_fair_io; int trso_meta_update; int trso_flags; struct g_raid_subdisk *trso_failed_sd; /* like per volume */ void *trso_buffer; /* Buffer space */ struct bio trso_bio; }; static g_raid_tr_taste_t g_raid_tr_taste_raid1; static g_raid_tr_event_t g_raid_tr_event_raid1; static g_raid_tr_start_t g_raid_tr_start_raid1; static g_raid_tr_stop_t g_raid_tr_stop_raid1; static g_raid_tr_iostart_t g_raid_tr_iostart_raid1; static g_raid_tr_iodone_t g_raid_tr_iodone_raid1; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1; static g_raid_tr_locked_t g_raid_tr_locked_raid1; static g_raid_tr_idle_t g_raid_tr_idle_raid1; static g_raid_tr_free_t g_raid_tr_free_raid1; static kobj_method_t g_raid_tr_raid1_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1), 
KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1), KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1), KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid1_class = { "RAID1", g_raid_tr_raid1_methods, sizeof(struct g_raid_tr_raid1_object), .trc_enable = 1, .trc_priority = 100, .trc_accept_unmapped = 1 }; static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr); static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd); static int g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 || (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM && tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM)) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid1(struct g_raid_volume *vol, struct g_raid_subdisk *sd) { struct g_raid_tr_raid1_object *trs; struct g_raid_softc *sc; struct g_raid_subdisk *tsd, *bestsd; u_int s; int i, na, ns; sc = vol->v_softc; trs = (struct g_raid_tr_raid1_object *)vol->v_tr; if (trs->trso_stopping && (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { /* Make sure we have at least one ACTIVE disk. */ na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); if (na == 0) { /* * Critical situation! We have no any active disk! * Choose the best disk we have to make it active. */ bestsd = &vol->v_subdisks[0]; for (i = 1; i < vol->v_disks_count; i++) { tsd = &vol->v_subdisks[i]; if (tsd->sd_state > bestsd->sd_state) bestsd = tsd; else if (tsd->sd_state == bestsd->sd_state && (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD || tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) && tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = tsd; } if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) { /* We found reasonable candidate. */ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, bestsd->sd_pos, g_raid_subdisk_state2str(bestsd->sd_state)); g_raid_change_subdisk_state(bestsd, G_RAID_SUBDISK_S_ACTIVE); g_raid_write_metadata(sc, vol, bestsd, bestsd->sd_disk); } } na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); if (na == vol->v_disks_count) s = G_RAID_VOLUME_S_OPTIMAL; else if (na + ns == vol->v_disks_count) s = G_RAID_VOLUME_S_SUBOPTIMAL; else if (na > 0) s = G_RAID_VOLUME_S_DEGRADED; else s = G_RAID_VOLUME_S_BROKEN; g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd); } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopping) g_raid_write_metadata(sc, vol, NULL, NULL); } return (0); } static void g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { /* * We don't fail the last disk in the pack, since it still has decent * data on it and that's better than failing the disk if it is the root * file system. 
* * XXX should this be controlled via a tunable? It makes sense for * the volume that has / on it. I can't think of a case where we'd * want the volume to go away on this kind of event. */ if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 && g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd) return; g_raid_fail_disk(sc, sd, disk); } static void g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd, *good_sd; struct bio *bp; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_flags & TR_RAID1_F_DOING_SOME) return; sd = trs->trso_failed_sd; good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE); if (good_sd == NULL) { g_raid_tr_raid1_rebuild_abort(tr); return; } bp = &trs->trso_bio; memset(bp, 0, sizeof(*bp)); bp->bio_offset = sd->sd_rebuild_pos; bp->bio_length = MIN(g_raid1_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); bp->bio_data = trs->trso_buffer; bp->bio_cmd = BIO_READ; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_caller1 = good_sd; trs->trso_flags |= TR_RAID1_F_DOING_SOME; trs->trso_flags |= TR_RAID1_F_LOCKED; g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */ bp->bio_offset, bp->bio_length, NULL, bp); } static void g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; vol = trs->trso_base.tro_volume; sd = trs->trso_failed_sd; g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); free(trs->trso_buffer, M_TR_RAID1); trs->trso_buffer = NULL; trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; trs->trso_type = TR_RAID1_NONE; trs->trso_recover_slabs = 0; trs->trso_failed_sd = NULL; g_raid_tr_update_state_raid1(vol, NULL); } static void g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd; trs = (struct g_raid_tr_raid1_object *)tr; sd = trs->trso_failed_sd; G_RAID_DEBUG1(0, tr->tro_volume->v_softc, "Subdisk %s:%d-%s rebuild completed.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); sd->sd_rebuild_pos = 0; g_raid_tr_raid1_rebuild_done(trs); } static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd; struct g_raid_volume *vol; off_t len; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; sd = trs->trso_failed_sd; if (trs->trso_flags & TR_RAID1_F_DOING_SOME) { G_RAID_DEBUG1(1, vol->v_softc, "Subdisk %s:%d-%s rebuild is aborting.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags |= TR_RAID1_F_ABORT; } else { G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild aborted.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags &= ~TR_RAID1_F_ABORT; if (trs->trso_flags & TR_RAID1_F_LOCKED) { trs->trso_flags &= ~TR_RAID1_F_LOCKED; len = MIN(g_raid1_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); g_raid_unlock_range(tr->tro_volume, sd->sd_rebuild_pos, len); } g_raid_tr_raid1_rebuild_done(trs); } } static void g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd, *fsd; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_failed_sd) { G_RAID_DEBUG1(1, vol->v_softc, "Already rebuild in start rebuild. pos %jd\n", (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); return; } sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE); if (sd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No active disk to rebuild. night night."); return; } fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); if (fsd == NULL) fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); if (fsd == NULL) { fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); if (fsd != NULL) { fsd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_RESYNC); g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); } else { fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (fsd == NULL) fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_NEW); if (fsd != NULL) { fsd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_REBUILD); g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); } } } if (fsd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No failed disk to rebuild. night night."); return; } trs->trso_failed_sd = fsd; G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild start at %jd.", fsd->sd_volume->v_name, fsd->sd_pos, fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]", trs->trso_failed_sd->sd_rebuild_pos); trs->trso_type = TR_RAID1_REBUILD; trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK); trs->trso_meta_update = g_raid1_rebuild_meta_update; g_raid_tr_raid1_rebuild_some(tr); } static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; int na, nr; /* * If we're stopping, don't do anything. If we don't have at least one * good disk and one bad disk, we don't do anything. And if there's a * 'good disk' stored in the trs, then we're in progress and we punt. * If we make it past all these checks, we need to rebuild. 
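/*
 * Illustrative sketch (editor's example, not part of this revision): the
 * RAID1 rebuild above copies one slab per cycle -- lock the range, read it
 * from an ACTIVE subdisk, write it to the failed one, advance
 * sd_rebuild_pos, and refresh metadata every g_raid1_rebuild_meta_update
 * slabs. A user-space model of the per-cycle length (constant hypothetical):
 */
#include <stdint.h>

#define REBUILD_SLAB	(1 << 20)	/* bytes copied per rebuild cycle */

static int64_t
rebuild_slab_len(int64_t rebuild_pos, int64_t sd_size)
{
	int64_t len;

	/* Never read past the end of the rebuilding subdisk. */
	len = sd_size - rebuild_pos;
	if (len > REBUILD_SLAB)
		len = REBUILD_SLAB;
	return (len);	/* the range [pos, pos + len) stays locked meanwhile */
}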
*/ vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_stopping) return; na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); switch(trs->trso_type) { case TR_RAID1_NONE: if (na == 0) return; if (nr == 0) { nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (nr == 0) return; } g_raid_tr_raid1_rebuild_start(tr); break; case TR_RAID1_REBUILD: if (na == 0 || nr == 0 || trs->trso_failed_sd == sd) g_raid_tr_raid1_rebuild_abort(tr); break; case TR_RAID1_RESYNC: break; } } static int g_raid_tr_event_raid1(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { g_raid_tr_update_state_raid1(tr->tro_volume, sd); return (0); } static int g_raid_tr_start_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid1(vol, NULL); return (0); } static int g_raid_tr_stop_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopping = 1; g_raid_tr_update_state_raid1(vol, NULL); return (0); } /* * Select the disk to read from. Take into account: subdisk state, running * error recovery, average disk load, head position and possible cache hits. */ #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static struct g_raid_subdisk * g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp, u_int mask) { struct g_raid_subdisk *sd, *best; int i, prio, bestprio; best = NULL; bestprio = INT_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD && sd->sd_state != G_RAID_SUBDISK_S_RESYNC) || bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos)) continue; if ((mask & (1 << i)) != 0) continue; prio = G_RAID_SUBDISK_LOAD(sd); prio += min(sd->sd_recovery, 255) << 22; prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16; /* If disk head is precisely in position - highly prefer it. */ if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset) prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) < G_RAID_SUBDISK_TRACK_SIZE) prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; if (prio < bestprio) { best = sd; bestprio = prio; } } return (best); } static void g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_subdisk *sd; struct bio *cbp; sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0); KASSERT(sd != NULL, ("No active disks in volume %s.", tr->tro_volume->v_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { g_raid_iodone(bp, ENOMEM); return; } g_raid_subdisk_iostart(sd, cbp); } static void g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; int i; vol = tr->tro_volume; /* * Allocate all bios before sending any request, so we can return * ENOMEM in nice and clean way. 
*/ bioq_init(&queue); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_REBUILD: /* * When rebuilding, only part of this subdisk is * writable, the rest will be written as part of * that process. */ if (bp->bio_offset >= sd->sd_rebuild_pos) continue; break; case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: /* * Resyncing still writes on the theory that the * resync'd disk is very close and writing it will * keep it that way better if we keep up while * resyncing. */ break; default: continue; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && vol->v_state != G_RAID_VOLUME_S_DEGRADED) { g_raid_iodone(bp, EIO); return; } /* * If we're rebuilding, squeeze in rebuild activity every so often, * even when the disk is busy. Be sure to only count real I/O * to the disk. All 'SPECIAL' I/O is traffic generated to the disk * by this module. */ if (trs->trso_failed_sd != NULL && !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { /* Cut the new or currently running round short. */ trs->trso_recover_slabs = 0; if (--trs->trso_fair_io <= 0) { trs->trso_fair_io = g_raid1_rebuild_fair_io; g_raid_tr_raid1_rebuild_some(tr); } } switch (bp->bio_cmd) { case BIO_READ: g_raid_tr_iostart_raid1_read(tr, bp); break; case BIO_WRITE: case BIO_DELETE: g_raid_tr_iostart_raid1_write(tr, bp); break; case BIO_FLUSH: g_raid_tr_flush_common(tr, bp); break; default: KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", bp->bio_cmd, vol->v_name)); break; } } static void g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *cbp; struct g_raid_subdisk *nsd; struct g_raid_volume *vol; struct bio *pbp; struct g_raid_tr_raid1_object *trs; uintptr_t *mask; int error, do_write; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { /* * This operation is part of a rebuild or resync operation. * See what work just got done, then schedule the next bit of * work, if any. Rebuild/resync is done a little bit at a * time. Either when a timeout happens, or after we get a * bunch of I/Os to the disk (to make sure an active system * will complete in a sane amount of time). * * We are set up to do differing amounts of work for each of * these cases. So long as the slab is smallish (less than * 50 or so, I'd guess, but that's just a WAG), we shouldn't * have any bio starvation issues. For active disks, we do * 5MB of data, for inactive ones, we do 50MB. */ if (trs->trso_type == TR_RAID1_REBUILD) { if (bp->bio_cmd == BIO_READ) { /* Immediately abort rebuild, if requested. */ if (trs->trso_flags & TR_RAID1_F_ABORT) { trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } /* On read error, skip and cross fingers.
*/ if (bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Read error during rebuild (%d), " "possible data loss!", bp->bio_error); goto rebuild_round_done; } /* * The read operation finished, queue the * write and get out. */ G_RAID_LOGREQ(4, bp, "rebuild read done. %d", bp->bio_error); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; G_RAID_LOGREQ(4, bp, "Queueing rebuild write."); g_raid_subdisk_iostart(trs->trso_failed_sd, bp); } else { /* * The write operation just finished. Do * another. We keep cloning the master bio * since it has the right buffers allocated to * it. */ G_RAID_LOGREQ(4, bp, "rebuild write done. Error %d", bp->bio_error); nsd = trs->trso_failed_sd; if (bp->bio_error != 0 || trs->trso_flags & TR_RAID1_F_ABORT) { if ((trs->trso_flags & TR_RAID1_F_ABORT) == 0) { g_raid_tr_raid1_fail_disk(sd->sd_softc, nsd, nsd->sd_disk); } trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } rebuild_round_done: nsd = trs->trso_failed_sd; trs->trso_flags &= ~TR_RAID1_F_LOCKED; g_raid_unlock_range(sd->sd_volume, bp->bio_offset, bp->bio_length); nsd->sd_rebuild_pos += bp->bio_length; if (nsd->sd_rebuild_pos >= nsd->sd_size) { g_raid_tr_raid1_rebuild_finish(tr); return; } /* Abort rebuild if we are stopping */ if (trs->trso_stopping) { trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } if (--trs->trso_meta_update <= 0) { g_raid_write_metadata(vol->v_softc, vol, nsd, nsd->sd_disk); trs->trso_meta_update = g_raid1_rebuild_meta_update; } trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; if (--trs->trso_recover_slabs <= 0) return; g_raid_tr_raid1_rebuild_some(tr); } } else if (trs->trso_type == TR_RAID1_RESYNC) { /* * read good sd, read bad sd in parallel. when both * done, compare the buffers. write good to the bad * if different. do the next bit of work. */ panic("Somehow, we think we're doing a resync"); } return; } pbp = bp->bio_parent; pbp->bio_inbed++; if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { /* * Read failed on first drive. Retry the read error on * another disk drive, if available, before erroring out the * read. */ sd->sd_disk->d_read_errs++; G_RAID_LOGREQ(0, bp, "Read error (%d), %d read errors total", bp->bio_error, sd->sd_disk->d_read_errs); /* * If there are too many read errors, we move to degraded. * XXX Do we want to FAIL the drive (eg, make the user redo * everything to get it back in sync), or just degrade the * drive, which kicks off a resync? */ do_write = 1; if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) { g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); if (pbp->bio_children == 1) do_write = 0; } /* * Find the other disk, and try to do the I/O to it. */ mask = (uintptr_t *)(&pbp->bio_driver2); if (pbp->bio_children == 1) { /* Save original subdisk. */ pbp->bio_driver1 = do_write ? sd : NULL; *mask = 0; } *mask |= 1 << sd->sd_pos; nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask); if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) { g_destroy_bio(bp); G_RAID_LOGREQ(2, cbp, "Retrying read from %d", nsd->sd_pos); if (pbp->bio_children == 2 && do_write) { sd->sd_recovery++; cbp->bio_caller1 = nsd; pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, cbp->bio_offset, cbp->bio_length, pbp, cbp); } else { g_raid_subdisk_iostart(nsd, cbp); } return; } /* * We can't retry. Return the original error by falling * through. This will happen when there's only one good disk. 
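/*
 * Illustrative sketch (editor's example, not part of this revision): the
 * read-retry path above keeps a bitmask of already-tried subdisks in the
 * parent bio (bio_driver2) and hands it to the disk selector so the same
 * mirror is never asked twice. A simplified model, with select_read_disk
 * standing in for g_raid_tr_raid1_select_read_disk:
 */
#include <stdint.h>

static int
select_read_disk(int ndisks, uint32_t mask)
{
	int i;

	for (i = 0; i < ndisks; i++)
		if ((mask & (1u << i)) == 0)
			return (i);	/* first copy not yet tried */
	return (-1);			/* every copy already failed */
}

static int
retry_read(int failed_pos, int ndisks, uint32_t *mask)
{
	*mask |= 1u << failed_pos;	/* remember the copy that failed */
	return (select_read_disk(ndisks, *mask));
}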
* We don't need to fail the raid, since its actual state is * based on the state of the subdisks. */ G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); } if (bp->bio_cmd == BIO_READ && bp->bio_error == 0 && pbp->bio_children > 1 && pbp->bio_driver1 != NULL) { /* * If it was a read, and bio_children is >1, then we just * recovered the data from the second drive. We should try to * write that data to the first drive if sector remapping is * enabled. A write should put the data in a new place on the * disk, remapping the bad sector. Do we need to do that by * queueing a request to the main worker thread? It doesn't * affect the return code of this current read, and can be * done at our leisure. However, to make the code simpler, it * is done synchronously. */ G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); cbp = g_clone_bio(pbp); if (cbp != NULL) { g_destroy_bio(bp); cbp->bio_cmd = BIO_WRITE; cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; G_RAID_LOGREQ(2, cbp, "Attempting bad sector remap on failing drive."); g_raid_subdisk_iostart(pbp->bio_driver1, cbp); return; } } if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) { /* * We're done with a recovery, mark the range as unlocked. * For any write errors, we aggressively fail the disk since * there was both a READ and a WRITE error at this location. * Both types of errors generally indicates the drive is on * the verge of total failure anyway. Better to stop trusting * it now. However, we need to reset error to 0 in that case * because we're not failing the original I/O which succeeded. */ if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { G_RAID_LOGREQ(0, bp, "Remap write failed: " "failing subdisk."); g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); bp->bio_error = 0; } if (pbp->bio_driver1 != NULL) { ((struct g_raid_subdisk *)pbp->bio_driver1) ->sd_recovery--; } G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); g_raid_unlock_range(sd->sd_volume, bp->bio_offset, bp->bio_length); } if (pbp->bio_cmd != BIO_READ) { if (pbp->bio_inbed == 1 || pbp->bio_error != 0) pbp->bio_error = bp->bio_error; if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); } error = pbp->bio_error; } else error = bp->bio_error; g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, error); } } static int g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; int error, i, ok; vol = tr->tro_volume; error = 0; ok = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_REBUILD: /* * When rebuilding, only part of this subdisk is * writable, the rest will be written as part of the * that process. */ if (offset >= sd->sd_rebuild_pos) continue; break; case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: /* * Resyncing still writes on the theory that the * resync'd disk is very close and writing it will * keep it that way better if we keep up while * resyncing. */ break; default: continue; } error = g_raid_subdisk_kerneldump(sd, virtual, physical, offset, length); if (error == 0) ok++; } return (ok > 0 ? 
0 : error); } static int g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp) { struct bio *bp; struct g_raid_subdisk *sd; bp = (struct bio *)argp; sd = (struct g_raid_subdisk *)bp->bio_caller1; g_raid_subdisk_iostart(sd, bp); return (0); } static int g_raid_tr_idle_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; trs->trso_fair_io = g_raid1_rebuild_fair_io; trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle; if (trs->trso_type == TR_RAID1_REBUILD) g_raid_tr_raid1_rebuild_some(tr); return (0); } static int g_raid_tr_free_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_buffer != NULL) { free(trs->trso_buffer, M_TR_RAID1); trs->trso_buffer = NULL; } return (0); } G_RAID_TR_DECLARE(raid1, "RAID1"); Index: head/sys/geom/raid/tr_raid1e.c =================================================================== --- head/sys/geom/raid/tr_raid1e.c (revision 326269) +++ head/sys/geom/raid/tr_raid1e.c (revision 326270) @@ -1,1242 +1,1244 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" #define N 2 SYSCTL_DECL(_kern_geom_raid_raid1e); #define RAID1E_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN, &g_raid1e_rebuild_slab, 0, "Amount of the disk to rebuild each read/write cycle of the rebuild."); #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN, &g_raid1e_rebuild_fair_io, 0, "Fraction of the I/O bandwidth to use when disk busy for rebuild."); #define RAID1E_REBUILD_CLUSTER_IDLE 100 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN, &g_raid1e_rebuild_cluster_idle, 0, "Number of slabs to do each time we trigger a rebuild cycle"); #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE; SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN, &g_raid1e_rebuild_meta_update, 0, "When to update the meta data."); static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data"); #define TR_RAID1E_NONE 0 #define TR_RAID1E_REBUILD 1 #define TR_RAID1E_RESYNC 2 #define TR_RAID1E_F_DOING_SOME 0x1 #define TR_RAID1E_F_LOCKED 0x2 #define TR_RAID1E_F_ABORT 0x4 struct g_raid_tr_raid1e_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopping; int trso_type; int trso_recover_slabs; /* slabs before rest */ int trso_fair_io; int trso_meta_update; int trso_flags; struct g_raid_subdisk *trso_failed_sd; /* like per volume */ void *trso_buffer; /* Buffer space */ off_t trso_lock_pos; /* Locked range start. */ off_t trso_lock_len; /* Locked range length. 
*/ struct bio trso_bio; }; static g_raid_tr_taste_t g_raid_tr_taste_raid1e; static g_raid_tr_event_t g_raid_tr_event_raid1e; static g_raid_tr_start_t g_raid_tr_start_raid1e; static g_raid_tr_stop_t g_raid_tr_stop_raid1e; static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e; static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e; static g_raid_tr_locked_t g_raid_tr_locked_raid1e; static g_raid_tr_idle_t g_raid_tr_idle_raid1e; static g_raid_tr_free_t g_raid_tr_free_raid1e; static kobj_method_t g_raid_tr_raid1e_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1e), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1e), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1e), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1e), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1e), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1e), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e), KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1e), KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1e), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1e), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid1e_class = { "RAID1E", g_raid_tr_raid1e_methods, sizeof(struct g_raid_tr_raid1e_object), .trc_enable = 1, .trc_priority = 200, .trc_accept_unmapped = 1 }; static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr); static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd); static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, int no, off_t off, off_t len, u_int mask); static inline void V2P(struct g_raid_volume *vol, off_t virt, int *disk, off_t *offset, off_t *start) { off_t nstrip; u_int strip_size; strip_size = vol->v_strip_size; /* Strip number. */ nstrip = virt / strip_size; /* Start position in strip. */ *start = virt % strip_size; /* Disk number. */ *disk = (nstrip * N) % vol->v_disks_count; /* Strip start position in disk. */ *offset = ((nstrip * N) / vol->v_disks_count) * strip_size; } static inline void P2V(struct g_raid_volume *vol, int disk, off_t offset, off_t *virt, int *copy) { off_t nstrip, start; u_int strip_size; strip_size = vol->v_strip_size; /* Start position in strip. */ start = offset % strip_size; /* Physical strip number. */ nstrip = (offset / strip_size) * vol->v_disks_count + disk; /* Number of physical strip (copy) inside virtual strip. */ *copy = nstrip % N; /* Offset in virtual space. 
*/ *virt = (nstrip / N) * strip_size + start; } static int g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol) { struct g_raid_tr_raid1e_object *trs; trs = (struct g_raid_tr_raid1e_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E || tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *bestsd, *worstsd; int i, j, state, sstate; sc = vol->v_softc; state = G_RAID_VOLUME_S_OPTIMAL; for (i = 0; i < vol->v_disks_count / N; i++) { bestsd = &vol->v_subdisks[i * N]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[i * N + j]; if (sd->sd_state > bestsd->sd_state) bestsd = sd; else if (sd->sd_state == bestsd->sd_state && (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = sd; } if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED && bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { /* We found reasonable candidate. */ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, bestsd->sd_pos, g_raid_subdisk_state2str(bestsd->sd_state)); g_raid_change_subdisk_state(bestsd, G_RAID_SUBDISK_S_ACTIVE); g_raid_write_metadata(sc, vol, bestsd, bestsd->sd_disk); } worstsd = &vol->v_subdisks[i * N]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[i * N + j]; if (sd->sd_state < worstsd->sd_state) worstsd = sd; } if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_OPTIMAL; else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_SUBOPTIMAL; else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_DEGRADED; else sstate = G_RAID_VOLUME_S_BROKEN; if (sstate < state) state = sstate; } return (state); } static int g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *bestsd, *worstsd; int i, j, state, sstate; sc = vol->v_softc; if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) == vol->v_disks_count) return (G_RAID_VOLUME_S_OPTIMAL); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) { /* We found reasonable candidate. 
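/*
 * Illustrative sketch (editor's example, not part of this revision): with
 * N = 2 copies, the V2P/P2V helpers above interleave virtual strips across
 * the disks in pairs. Stand-alone versions of the same arithmetic plus a
 * worked example:
 */
#include <stdint.h>

#define NCOPY	2	/* N: physical copies of every virtual strip */

static void
v2p(int64_t strip, int ndisks, int64_t virt,
    int *disk, int64_t *offset, int64_t *start)
{
	int64_t nstrip = virt / strip;		/* virtual strip number */

	*start = virt % strip;
	*disk = (nstrip * NCOPY) % ndisks;	/* first copy's disk */
	*offset = ((nstrip * NCOPY) / ndisks) * strip;
}

static void
p2v(int64_t strip, int ndisks, int disk, int64_t offset,
    int64_t *virt, int *copy)
{
	int64_t start = offset % strip;
	int64_t nstrip = (offset / strip) * ndisks + disk; /* physical strip */

	*copy = nstrip % NCOPY;			/* which copy this is */
	*virt = (nstrip / NCOPY) * strip + start;
}

/*
 * Example: 3 disks, 64K strips, virtual offset 200K: virtual strip 3,
 * start 8K, first copy on disk 0 at on-disk offset 128K; p2v(disk 0, 136K)
 * recovers virtual offset 200K, copy 0.
 */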
*/ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to STALE.", vol->v_name, sd->sd_pos, g_raid_subdisk_state2str(sd->sd_state)); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); g_raid_write_metadata(sc, vol, sd, sd->sd_disk); } } state = G_RAID_VOLUME_S_OPTIMAL; for (i = 0; i < vol->v_disks_count; i++) { bestsd = &vol->v_subdisks[i]; worstsd = &vol->v_subdisks[i]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[(i + j) % vol->v_disks_count]; if (sd->sd_state > bestsd->sd_state) bestsd = sd; else if (sd->sd_state == bestsd->sd_state && (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = sd; if (sd->sd_state < worstsd->sd_state) worstsd = sd; } if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_OPTIMAL; else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_SUBOPTIMAL; else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_DEGRADED; else sstate = G_RAID_VOLUME_S_BROKEN; if (sstate < state) state = sstate; } return (state); } static int g_raid_tr_update_state_raid1e(struct g_raid_volume *vol, struct g_raid_subdisk *sd) { struct g_raid_tr_raid1e_object *trs; struct g_raid_softc *sc; u_int s; sc = vol->v_softc; trs = (struct g_raid_tr_raid1e_object *)vol->v_tr; if (trs->trso_stopping && (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { if ((vol->v_disks_count % N) == 0) s = g_raid_tr_update_state_raid1e_even(vol); else s = g_raid_tr_update_state_raid1e_odd(vol); } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopping) g_raid_write_metadata(sc, vol, NULL, NULL); } if (!trs->trso_starting && !trs->trso_stopping) g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd); return (0); } static void g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { struct g_raid_volume *vol; vol = sd->sd_volume; /* * We don't fail the last disk in the pack, since it still has decent * data on it and that's better than failing the disk if it is the root * file system. * * XXX should this be controlled via a tunable? It makes sense for * the volume that has / on it. I can't think of a case where we'd * want the volume to go away on this kind of event. 
if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) < vol->v_disks_count) && (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED)) return; g_raid_fail_disk(sc, sd, disk); } static void g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; vol = trs->trso_base.tro_volume; sd = trs->trso_failed_sd; g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); free(trs->trso_buffer, M_TR_RAID1E); trs->trso_buffer = NULL; trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; trs->trso_type = TR_RAID1E_NONE; trs->trso_recover_slabs = 0; trs->trso_failed_sd = NULL; g_raid_tr_update_state_raid1e(vol, NULL); } static void g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; trs = (struct g_raid_tr_raid1e_object *)tr; sd = trs->trso_failed_sd; G_RAID_DEBUG1(0, tr->tro_volume->v_softc, "Subdisk %s:%d-%s rebuild completed.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); sd->sd_rebuild_pos = 0; g_raid_tr_raid1e_rebuild_done(trs); } static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; struct g_raid_volume *vol; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; sd = trs->trso_failed_sd; if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) { G_RAID_DEBUG1(1, vol->v_softc, "Subdisk %s:%d-%s rebuild is aborting.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags |= TR_RAID1E_F_ABORT; } else { G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild aborted.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags &= ~TR_RAID1E_F_ABORT; if (trs->trso_flags & TR_RAID1E_F_LOCKED) { trs->trso_flags &= ~TR_RAID1E_F_LOCKED; g_raid_unlock_range(tr->tro_volume, trs->trso_lock_pos, trs->trso_lock_len); } g_raid_tr_raid1e_rebuild_done(trs); } } static void g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio *bp; off_t len, virtual, vend, offset, start; int disk, copy, best; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) return; vol = tr->tro_volume; sc = vol->v_softc; sd = trs->trso_failed_sd; while (1) { if (sd->sd_rebuild_pos >= sd->sd_size) { g_raid_tr_raid1e_rebuild_finish(tr); return; } /* Get virtual offset from physical rebuild position. */ P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy); /* Get physical offset back to get first stripe position. */ V2P(vol, virtual, &disk, &offset, &start); /* Calculate contiguous data length. */ len = MIN(g_raid1e_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); if ((vol->v_disks_count % N) != 0) len = MIN(len, vol->v_strip_size - start); /* Find disk with most accurate data. */ best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset + start, len, 0); if (best < 0) { /* There is no valid disk. */ g_raid_tr_raid1e_rebuild_abort(tr); return; } else if (best != copy) { /* Some other disk has better data. */ break; } /* We have the most accurate data. Skip the range.
*/ G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju", sd->sd_rebuild_pos, sd->sd_rebuild_pos + len); sd->sd_rebuild_pos += len; } bp = &trs->trso_bio; memset(bp, 0, sizeof(*bp)); bp->bio_offset = offset + start + ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0); bp->bio_length = len; bp->bio_data = trs->trso_buffer; bp->bio_cmd = BIO_READ; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count]; G_RAID_LOGREQ(3, bp, "Queueing rebuild read"); /* * If we are crossing a stripe boundary, correct the affected virtual * range we should lock. */ if (start + len > vol->v_strip_size) { P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy); len = vend - virtual; } trs->trso_flags |= TR_RAID1E_F_DOING_SOME; trs->trso_flags |= TR_RAID1E_F_LOCKED; trs->trso_lock_pos = virtual; trs->trso_lock_len = len; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp); } static void g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_failed_sd) { G_RAID_DEBUG1(1, vol->v_softc, "Already rebuild in start rebuild. pos %jd\n", (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); return; } sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); if (sd == NULL) sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); if (sd == NULL) { sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); if (sd != NULL) { sd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); g_raid_write_metadata(vol->v_softc, vol, sd, NULL); } else { sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (sd == NULL) sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_NEW); if (sd != NULL) { sd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); g_raid_write_metadata(vol->v_softc, vol, sd, NULL); } } } if (sd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No failed disk to rebuild. night night."); return; } trs->trso_failed_sd = sd; G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild start at %jd.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ?
g_raid_get_diskname(sd->sd_disk) : "[none]", trs->trso_failed_sd->sd_rebuild_pos); trs->trso_type = TR_RAID1E_REBUILD; trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK); trs->trso_meta_update = g_raid1e_rebuild_meta_update; g_raid_tr_raid1e_rebuild_some(tr); } static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; int nr; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_stopping) return; nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); switch(trs->trso_type) { case TR_RAID1E_NONE: if (vol->v_state < G_RAID_VOLUME_S_DEGRADED) return; if (nr == 0) { nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (nr == 0) return; } g_raid_tr_raid1e_rebuild_start(tr); break; case TR_RAID1E_REBUILD: if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 || trs->trso_failed_sd == sd) g_raid_tr_raid1e_rebuild_abort(tr); break; case TR_RAID1E_RESYNC: break; } } static int g_raid_tr_event_raid1e(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { g_raid_tr_update_state_raid1e(tr->tro_volume, sd); return (0); } static int g_raid_tr_start_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid1e(vol, NULL); return (0); } static int g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopping = 1; g_raid_tr_update_state_raid1e(vol, NULL); return (0); } /* * Select the disk to read from. Take into account: subdisk state, running * error recovery, average disk load, head position and possible cache hits. */ #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, int no, off_t off, off_t len, u_int mask) { struct g_raid_subdisk *sd; off_t offset; int i, best, prio, bestprio; best = -1; bestprio = INT_MAX; for (i = 0; i < N; i++) { sd = &vol->v_subdisks[(no + i) % vol->v_disks_count]; offset = off; if (no + i >= vol->v_disks_count) offset += vol->v_strip_size; prio = G_RAID_SUBDISK_LOAD(sd); if ((mask & (1 << sd->sd_pos)) != 0) continue; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_RESYNC: if (offset + off < sd->sd_rebuild_pos) break; /* FALLTHROUGH */ case G_RAID_SUBDISK_S_STALE: prio += i << 24; break; case G_RAID_SUBDISK_S_REBUILD: if (offset + off < sd->sd_rebuild_pos) break; /* FALLTHROUGH */ default: continue; } prio += min(sd->sd_recovery, 255) << 16; /* If disk head is precisely in position - highly prefer it. */ if (G_RAID_SUBDISK_POS(sd) == offset) prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; else /* If disk head is close to position - prefer it. 
*/ if (ABS(G_RAID_SUBDISK_POS(sd) - offset) < G_RAID_SUBDISK_TRACK_SIZE) prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; if (prio < bestprio) { bestprio = prio; best = i; } } return (best); } static void g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int best; vol = tr->tro_volume; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; strip_size = vol->v_strip_size; V2P(vol, bp->bio_offset, &no, &offset, &start); remain = bp->bio_length; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); best = g_raid_tr_raid1e_select_read_disk(vol, no, offset, length, 0); KASSERT(best >= 0, ("No readable disk in volume %s!", vol->v_name)); no += best; if (no >= vol->v_disks_count) { no -= vol->v_disks_count; offset += strip_size; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = &vol->v_subdisks[no]; bioq_insert_tail(&queue, cbp); no += N - best; if (no >= vol->v_disks_count) { no -= vol->v_disks_count; offset += strip_size; } remain -= length; addr += length; start = 0; } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int i; vol = tr->tro_volume; if ((bp->bio_flags & BIO_UNMAPPED) != 0) addr = NULL; else addr = bp->bio_data; strip_size = vol->v_strip_size; V2P(vol, bp->bio_offset, &no, &offset, &start); remain = bp->bio_length; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); for (i = 0; i < N; i++) { sd = &vol->v_subdisks[no]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: break; case G_RAID_SUBDISK_S_REBUILD: if (offset + start >= sd->sd_rebuild_pos) goto nextdisk; break; default: goto nextdisk; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0 && bp->bio_cmd != BIO_DELETE) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); nextdisk: if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } } remain -= length; if (bp->bio_cmd != BIO_DELETE) addr += length; start = 0; } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) 
g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && vol->v_state != G_RAID_VOLUME_S_DEGRADED) { g_raid_iodone(bp, EIO); return; } /* * If we're rebuilding, squeeze in rebuild activity every so often, * even when the disk is busy. Be sure to only count real I/O * to the disk. All 'SPECIAL' I/O is traffic generated to the disk * by this module. */ if (trs->trso_failed_sd != NULL && !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { /* Make this new or running now round short. */ trs->trso_recover_slabs = 0; if (--trs->trso_fair_io <= 0) { trs->trso_fair_io = g_raid1e_rebuild_fair_io; g_raid_tr_raid1e_rebuild_some(tr); } } switch (bp->bio_cmd) { case BIO_READ: g_raid_tr_iostart_raid1e_read(tr, bp); break; case BIO_WRITE: case BIO_DELETE: g_raid_tr_iostart_raid1e_write(tr, bp); break; case BIO_FLUSH: g_raid_tr_flush_common(tr, bp); break; default: KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", bp->bio_cmd, vol->v_name)); break; } } static void g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *cbp; struct g_raid_subdisk *nsd; struct g_raid_volume *vol; struct bio *pbp; struct g_raid_tr_raid1e_object *trs; off_t virtual, offset, start; uintptr_t mask; int error, do_write, copy, disk, best; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { if (trs->trso_type == TR_RAID1E_REBUILD) { nsd = trs->trso_failed_sd; if (bp->bio_cmd == BIO_READ) { /* Immediately abort rebuild, if requested. */ if (trs->trso_flags & TR_RAID1E_F_ABORT) { trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } /* On read error, skip and cross fingers. */ if (bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Read error during rebuild (%d), " "possible data loss!", bp->bio_error); goto rebuild_round_done; } /* * The read operation finished, queue the * write and get out. */ G_RAID_LOGREQ(3, bp, "Rebuild read done: %d", bp->bio_error); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_offset = nsd->sd_rebuild_pos; G_RAID_LOGREQ(3, bp, "Queueing rebuild write."); g_raid_subdisk_iostart(nsd, bp); } else { /* * The write operation just finished. Do * another. We keep cloning the master bio * since it has the right buffers allocated to * it. 
*/ G_RAID_LOGREQ(3, bp, "Rebuild write done: %d", bp->bio_error); if (bp->bio_error != 0 || trs->trso_flags & TR_RAID1E_F_ABORT) { if ((trs->trso_flags & TR_RAID1E_F_ABORT) == 0) { g_raid_tr_raid1e_fail_disk(sd->sd_softc, nsd, nsd->sd_disk); } trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } rebuild_round_done: trs->trso_flags &= ~TR_RAID1E_F_LOCKED; g_raid_unlock_range(tr->tro_volume, trs->trso_lock_pos, trs->trso_lock_len); nsd->sd_rebuild_pos += bp->bio_length; if (nsd->sd_rebuild_pos >= nsd->sd_size) { g_raid_tr_raid1e_rebuild_finish(tr); return; } /* Abort rebuild if we are stopping */ if (trs->trso_stopping) { trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } if (--trs->trso_meta_update <= 0) { g_raid_write_metadata(vol->v_softc, vol, nsd, nsd->sd_disk); trs->trso_meta_update = g_raid1e_rebuild_meta_update; /* Compensate short rebuild I/Os. */ if ((vol->v_disks_count % N) != 0 && vol->v_strip_size < g_raid1e_rebuild_slab) { trs->trso_meta_update *= g_raid1e_rebuild_slab; trs->trso_meta_update /= vol->v_strip_size; } } trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; if (--trs->trso_recover_slabs <= 0) return; /* Run next rebuild iteration. */ g_raid_tr_raid1e_rebuild_some(tr); } } else if (trs->trso_type == TR_RAID1E_RESYNC) { /* * read good sd, read bad sd in parallel. when both * done, compare the buffers. write good to the bad * if different. do the next bit of work. */ panic("Somehow, we think we're doing a resync"); } return; } pbp = bp->bio_parent; pbp->bio_inbed++; mask = (intptr_t)bp->bio_caller2; if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { /* * Read failed on first drive. Retry the read error on * another disk drive, if available, before erroring out the * read. */ sd->sd_disk->d_read_errs++; G_RAID_LOGREQ(0, bp, "Read error (%d), %d read errors total", bp->bio_error, sd->sd_disk->d_read_errs); /* * If there are too many read errors, we move to degraded. * XXX Do we want to FAIL the drive (eg, make the user redo * everything to get it back in sync), or just degrade the * drive, which kicks off a resync? */ do_write = 0; if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); else if (mask == 0) do_write = 1; /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); V2P(vol, virtual, &disk, &offset, &start); /* Find the other disk, and try to do the I/O to it. */ mask |= 1 << copy; best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset, start, mask); if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { disk += best; if (disk >= vol->v_disks_count) { disk -= vol->v_disks_count; offset += vol->v_strip_size; } cbp->bio_offset = offset + start; cbp->bio_length = bp->bio_length; cbp->bio_data = bp->bio_data; cbp->bio_ma = bp->bio_ma; cbp->bio_ma_offset = bp->bio_ma_offset; cbp->bio_ma_n = bp->bio_ma_n; g_destroy_bio(bp); nsd = &vol->v_subdisks[disk]; G_RAID_LOGREQ(2, cbp, "Retrying read from %d", nsd->sd_pos); if (do_write) mask |= 1 << 31; if ((mask & (1U << 31)) != 0) sd->sd_recovery++; cbp->bio_caller2 = (void *)mask; if (do_write) { cbp->bio_caller1 = nsd; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, virtual, cbp->bio_length, pbp, cbp); } else { g_raid_subdisk_iostart(nsd, cbp); } return; } /* * We can't retry. Return the original error by falling * through. This will happen when there's only one good disk. 
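 * (With every copy already marked in the retry mask, the disk
 * selection above returns a negative index and no new clone is made.)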
* We don't need to fail the raid, since its actual state is * based on the state of the subdisks. */ G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); } if (bp->bio_cmd == BIO_READ && bp->bio_error == 0 && (mask & (1U << 31)) != 0) { G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); V2P(vol, virtual, &disk, &offset, &start); /* Find best disk to write. */ best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset, start, ~mask); if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { disk += best; if (disk >= vol->v_disks_count) { disk -= vol->v_disks_count; offset += vol->v_strip_size; } cbp->bio_offset = offset + start; cbp->bio_cmd = BIO_WRITE; cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; cbp->bio_caller2 = (void *)mask; g_destroy_bio(bp); G_RAID_LOGREQ(2, cbp, "Attempting bad sector remap on failing drive."); g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp); return; } } if ((mask & (1U << 31)) != 0) { /* * We're done with a recovery, mark the range as unlocked. * For any write errors, we aggressively fail the disk since * there was both a READ and a WRITE error at this location. * Both types of errors generally indicates the drive is on * the verge of total failure anyway. Better to stop trusting * it now. However, we need to reset error to 0 in that case * because we're not failing the original I/O which succeeded. */ /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); V2P(vol, virtual, &disk, &offset, &start); for (copy = 0; copy < N; copy++) { if ((mask & (1 << copy) ) != 0) vol->v_subdisks[(disk + copy) % vol->v_disks_count].sd_recovery--; } if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { G_RAID_LOGREQ(0, bp, "Remap write failed: " "failing subdisk."); g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); bp->bio_error = 0; } G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length); } if (pbp->bio_cmd != BIO_READ) { if (pbp->bio_inbed == 1 || pbp->bio_error != 0) pbp->bio_error = bp->bio_error; if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); } error = pbp->bio_error; } else error = bp->bio_error; g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, error); } } static int g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t boffset, size_t blength) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int i, error; vol = tr->tro_volume; addr = virtual; strip_size = vol->v_strip_size; V2P(vol, boffset, &no, &offset, &start); remain = blength; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); for (i = 0; i < N; i++) { sd = &vol->v_subdisks[no]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: break; case G_RAID_SUBDISK_S_REBUILD: if (offset + start >= sd->sd_rebuild_pos) goto nextdisk; break; default: goto nextdisk; } error = g_raid_subdisk_kerneldump(sd, addr, 0, offset + start, length); if (error != 0) return (error); nextdisk: if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } } remain -= length; addr += length; start = 0; } return (0); } static int 
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp) { struct bio *bp; struct g_raid_subdisk *sd; bp = (struct bio *)argp; sd = (struct g_raid_subdisk *)bp->bio_caller1; g_raid_subdisk_iostart(sd, bp); return (0); } static int g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; trs->trso_fair_io = g_raid1e_rebuild_fair_io; trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle; /* Compensate short rebuild I/Os. */ if ((vol->v_disks_count % N) != 0 && vol->v_strip_size < g_raid1e_rebuild_slab) { trs->trso_recover_slabs *= g_raid1e_rebuild_slab; trs->trso_recover_slabs /= vol->v_strip_size; } if (trs->trso_type == TR_RAID1E_REBUILD) g_raid_tr_raid1e_rebuild_some(tr); return (0); } static int g_raid_tr_free_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_buffer != NULL) { free(trs->trso_buffer, M_TR_RAID1E); trs->trso_buffer = NULL; } return (0); } G_RAID_TR_DECLARE(raid1e, "RAID1E"); Index: head/sys/geom/raid/tr_raid5.c =================================================================== --- head/sys/geom/raid/tr_raid5.c (revision 326269) +++ head/sys/geom/raid/tr_raid5.c (revision 326270) @@ -1,421 +1,423 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_TR_RAID5, "tr_raid5_data", "GEOM_RAID RAID5 data"); #define TR_RAID5_NONE 0 #define TR_RAID5_REBUILD 1 #define TR_RAID5_RESYNC 2 #define TR_RAID5_F_DOING_SOME 0x1 #define TR_RAID5_F_LOCKED 0x2 #define TR_RAID5_F_ABORT 0x4 struct g_raid_tr_raid5_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopping; int trso_type; int trso_recover_slabs; /* slabs before rest */ int trso_fair_io; int trso_meta_update; int trso_flags; struct g_raid_subdisk *trso_failed_sd; /* like per volume */ void *trso_buffer; /* Buffer space */ struct bio trso_bio; }; static g_raid_tr_taste_t g_raid_tr_taste_raid5; static g_raid_tr_event_t g_raid_tr_event_raid5; static g_raid_tr_start_t g_raid_tr_start_raid5; static g_raid_tr_stop_t g_raid_tr_stop_raid5; static g_raid_tr_iostart_t g_raid_tr_iostart_raid5; static g_raid_tr_iodone_t g_raid_tr_iodone_raid5; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid5; static g_raid_tr_locked_t g_raid_tr_locked_raid5; static g_raid_tr_free_t g_raid_tr_free_raid5; static kobj_method_t g_raid_tr_raid5_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid5), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid5), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid5), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid5), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid5), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid5), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid5), KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid5), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid5), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid5_class = { "RAID5", g_raid_tr_raid5_methods, sizeof(struct g_raid_tr_raid5_object), .trc_enable = 1, .trc_priority = 100 }; static int g_raid_tr_taste_raid5(struct g_raid_tr_object *tr, struct g_raid_volume *vol) { struct g_raid_tr_raid5_object *trs; u_int qual; trs = (struct g_raid_tr_raid5_object *)tr; qual = tr->tro_volume->v_raid_level_qualifier; if (tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID4 && (qual == G_RAID_VOLUME_RLQ_R4P0 || qual == G_RAID_VOLUME_RLQ_R4PN)) { /* RAID4 */ } else if ((tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5 || tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5E || tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5EE || tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5R || tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID6 || tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAIDMDF) && (qual == G_RAID_VOLUME_RLQ_R5RA || qual == G_RAID_VOLUME_RLQ_R5RS || qual == G_RAID_VOLUME_RLQ_R5LA || qual == G_RAID_VOLUME_RLQ_R5LS)) { /* RAID5/5E/5EE/5R/6/MDF */ } else return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid5(struct g_raid_volume *vol, struct g_raid_subdisk *sd) { struct g_raid_tr_raid5_object *trs; struct g_raid_softc *sc; u_int s; int na, ns, nu; sc = vol->v_softc; trs = (struct g_raid_tr_raid5_object *)vol->v_tr; if (trs->trso_stopping && (trs->trso_flags & TR_RAID5_F_DOING_SOME) == 0) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); nu = 
g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (na == vol->v_disks_count) s = G_RAID_VOLUME_S_OPTIMAL; else if (na + ns == vol->v_disks_count || na + ns + nu == vol->v_disks_count /* XXX: Temporary. */) s = G_RAID_VOLUME_S_SUBOPTIMAL; else if (na == vol->v_disks_count - 1 || na + ns + nu == vol->v_disks_count) s = G_RAID_VOLUME_S_DEGRADED; else s = G_RAID_VOLUME_S_BROKEN; } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopping) g_raid_write_metadata(sc, vol, NULL, NULL); } return (0); } static int g_raid_tr_event_raid5(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { g_raid_tr_update_state_raid5(tr->tro_volume, sd); return (0); } static int g_raid_tr_start_raid5(struct g_raid_tr_object *tr) { struct g_raid_tr_raid5_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid5_object *)tr; trs->trso_starting = 0; vol = tr->tro_volume; vol->v_read_only = 1; g_raid_tr_update_state_raid5(vol, NULL); return (0); } static int g_raid_tr_stop_raid5(struct g_raid_tr_object *tr) { struct g_raid_tr_raid5_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid5_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopping = 1; g_raid_tr_update_state_raid5(vol, NULL); return (0); } static void g_raid_tr_iostart_raid5_read(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, nstripe, remain; int no, pno, ddisks, pdisks, protate, pleft; u_int strip_size, lvl, qual; vol = tr->tro_volume; addr = bp->bio_data; strip_size = vol->v_strip_size; lvl = tr->tro_volume->v_raid_level; qual = tr->tro_volume->v_raid_level_qualifier; protate = tr->tro_volume->v_rotate_parity; /* Stripe number. */ nstripe = bp->bio_offset / strip_size; /* Start position in stripe. */ start = bp->bio_offset % strip_size; /* Number of data and parity disks. */ if (lvl == G_RAID_VOLUME_RL_RAIDMDF) pdisks = tr->tro_volume->v_mdf_pdisks; else if (lvl == G_RAID_VOLUME_RL_RAID5EE || lvl == G_RAID_VOLUME_RL_RAID6) pdisks = 2; else pdisks = 1; ddisks = vol->v_disks_count - pdisks; /* Parity disk number. */ if (lvl == G_RAID_VOLUME_RL_RAID4) { if (qual == 0) /* P0 */ pno = 0; else /* PN */ pno = ddisks; pleft = -1; } else { pno = (nstripe / (ddisks * protate)) % vol->v_disks_count; pleft = protate - (nstripe / ddisks) % protate; if (qual >= 2) { /* PN/Left */ pno = ddisks - pno; if (pno < 0) pno += vol->v_disks_count; } } /* Data disk number. */ no = nstripe % ddisks; if (lvl == G_RAID_VOLUME_RL_RAID4) { if (qual == 0) no += pdisks; } else if (qual & 1) { /* Continuation/Symmetric */ no = (pno + pdisks + no) % vol->v_disks_count; } else if (no >= pno) /* Restart/Asymmetric */ no += pdisks; else no += imax(0, pno + pdisks - vol->v_disks_count); /* Stripe start position in disk. */ offset = (nstripe / ddisks) * strip_size; /* Length of data to operate. 
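 * (Recapping the mapping above with an illustrative, assumed geometry:
 * 4 disks, plain RAID5 left-symmetric, protate = 1, so ddisks = 3 and
 * pdisks = 1.  For nstripe = 4: pno = (4 / 3) % 4 = 1, mirrored to
 * 3 - 1 = 2 for "Left"; no = (pno + pdisks + 4 % 3) % 4 = 0 for
 * "Symmetric"; offset = (4 / 3) * strip_size.  Stripe 4 thus sits in
 * strip row 1 on disk 0, with its parity on disk 2.)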
*/ remain = bp->bio_length; bioq_init(&queue); do { length = MIN(strip_size - start, remain); cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_data = addr; cbp->bio_length = length; cbp->bio_caller1 = &vol->v_subdisks[no]; bioq_insert_tail(&queue, cbp); no++; if (lvl == G_RAID_VOLUME_RL_RAID4) { no %= vol->v_disks_count; if (no == pno) no = (no + pdisks) % vol->v_disks_count; } else if (qual & 1) { /* Continuation/Symmetric */ no %= vol->v_disks_count; if (no == pno) { if ((--pleft) <= 0) { pleft += protate; if (qual < 2) /* P0/Right */ pno++; else /* PN/Left */ pno += vol->v_disks_count - 1; pno %= vol->v_disks_count; } no = (pno + pdisks) % vol->v_disks_count; offset += strip_size; } } else { /* Restart/Asymmetric */ if (no == pno) no += pdisks; if (no >= vol->v_disks_count) { no -= vol->v_disks_count; if ((--pleft) <= 0) { pleft += protate; if (qual < 2) /* P0/Right */ pno++; else /* PN/Left */ pno += vol->v_disks_count - 1; pno %= vol->v_disks_count; } if (no == pno) no += pdisks; else no += imax(0, pno + pdisks - vol->v_disks_count); offset += strip_size; } } remain -= length; addr += length; start = 0; } while (remain > 0); while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid5(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_tr_raid5_object *trs; vol = tr->tro_volume; trs = (struct g_raid_tr_raid5_object *)tr; if (vol->v_state < G_RAID_VOLUME_S_SUBOPTIMAL) { g_raid_iodone(bp, EIO); return; } switch (bp->bio_cmd) { case BIO_READ: g_raid_tr_iostart_raid5_read(tr, bp); break; case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: g_raid_iodone(bp, ENODEV); break; default: KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", bp->bio_cmd, vol->v_name)); break; } } static void g_raid_tr_iodone_raid5(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *pbp; int error; pbp = bp->bio_parent; pbp->bio_inbed++; error = bp->bio_error; g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, error); } } static int g_raid_tr_kerneldump_raid5(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length) { return (ENODEV); } static int g_raid_tr_locked_raid5(struct g_raid_tr_object *tr, void *argp) { struct bio *bp; struct g_raid_subdisk *sd; bp = (struct bio *)argp; sd = (struct g_raid_subdisk *)bp->bio_caller1; g_raid_subdisk_iostart(sd, bp); return (0); } static int g_raid_tr_free_raid5(struct g_raid_tr_object *tr) { struct g_raid_tr_raid5_object *trs; trs = (struct g_raid_tr_raid5_object *)tr; if (trs->trso_buffer != NULL) { free(trs->trso_buffer, M_TR_RAID5); trs->trso_buffer = NULL; } return (0); } G_RAID_TR_DECLARE(raid5, "RAID5"); Index: head/sys/geom/raid3/g_raid3.c =================================================================== --- head/sys/geom/raid3/g_raid3.c (revision 326269) +++ head/sys/geom/raid3/g_raid3.c (revision 326270) @@ -1,3583 +1,3585 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_raid3, "GEOM RAID-3 functionality"); static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff"); u_int g_raid3_debug = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid3_debug, 0, "Debug level"); static u_int g_raid3_timeout = 4; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_raid3_timeout, 0, "Time to wait on all raid3 components"); static u_int g_raid3_idletime = 5; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_raid3_idletime, 0, "Mark components as clean when idling"); static u_int g_raid3_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_raid3_syncreqs = 2; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests."); static u_int g_raid3_use_malloc = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN, &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9)."); static u_int g_raid3_n64k = 50; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RDTUN, &g_raid3_n64k, 0, "Maximum number of 64kB allocations"); static u_int g_raid3_n16k = 200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RDTUN, &g_raid3_n16k, 0, "Maximum number of 16kB allocations"); static u_int g_raid3_n4k = 1200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RDTUN, &g_raid3_n4k, 0, "Maximum number of 4kB allocations"); static SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0, "GEOM_RAID3 statistics"); static u_int g_raid3_parity_mismatch = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD, &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode"); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), 
(timeout)); \ G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_raid3_post_sync = NULL; static int g_raid3_shutdown = 0; static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid3_taste; static void g_raid3_init(struct g_class *mp); static void g_raid3_fini(struct g_class *mp); struct g_class g_raid3_class = { .name = G_RAID3_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid3_config, .taste = g_raid3_taste, .destroy_geom = g_raid3_destroy_geom, .init = g_raid3_init, .fini = g_raid3_fini }; static void g_raid3_destroy_provider(struct g_raid3_softc *sc); static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state); static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force); static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type); static int g_raid3_register_request(struct bio *pbp); static void g_raid3_sync_release(struct g_raid3_softc *sc); static const char * g_raid3_disk_state2str(int state) { switch (state) { case G_RAID3_DISK_STATE_NODISK: return ("NODISK"); case G_RAID3_DISK_STATE_NONE: return ("NONE"); case G_RAID3_DISK_STATE_NEW: return ("NEW"); case G_RAID3_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_RAID3_DISK_STATE_STALE: return ("STALE"); case G_RAID3_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_RAID3_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } static const char * g_raid3_device_state2str(int state) { switch (state) { case G_RAID3_DEVICE_STATE_STARTING: return ("STARTING"); case G_RAID3_DEVICE_STATE_DEGRADED: return ("DEGRADED"); case G_RAID3_DEVICE_STATE_COMPLETE: return ("COMPLETE"); default: return ("INVALID"); } } const char * g_raid3_get_diskname(struct g_raid3_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } static void * g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags) { void *ptr; enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) ptr = malloc(size, M_RAID3, flags); else { ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone, &sc->sc_zones[zone], flags); sc->sc_zones[zone].sz_requested++; if (ptr == NULL) sc->sc_zones[zone].sz_failed++; } return (ptr); } static void g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size) { enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) free(ptr, M_RAID3); else { uma_zfree_arg(sc->sc_zones[zone].sz_zone, ptr, &sc->sc_zones[zone]); } } static int g_raid3_uma_ctor(void *mem, int size, void *arg, int flags) { struct g_raid3_zone *sz = arg; if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max) return (ENOMEM); sz->sz_inuse++; return (0); } static void g_raid3_uma_dtor(void *mem, int size, void *arg) { struct g_raid3_zone *sz = arg; sz->sz_inuse--; } #define g_raid3_xor(src, dst, size) \ _g_raid3_xor((uint64_t *)(src), \ (uint64_t *)(dst), (size_t)size) static void _g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size) { KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); for (; size > 0; size -= 128) { *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); 
*dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); } } static int g_raid3_is_zero(struct bio *bp) { static const uint64_t zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; u_char *addr; ssize_t size; size = bp->bio_length; addr = (u_char *)bp->bio_data; for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) { if (bcmp(addr, zeros, sizeof(zeros)) != 0) return (0); } return (1); } /* * --- Events handling functions --- * Events in geom_raid3 are used to maintain disks and device status * from one thread to simplify locking. */ static void g_raid3_event_free(struct g_raid3_event *ep) { free(ep, M_RAID3); } int g_raid3_event_send(void *arg, int state, int flags) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_raid3_event *ep; int error; ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_RAID3_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", hz * 5); } error = ep->e_error; g_raid3_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_raid3_event * g_raid3_event_get(struct g_raid3_softc *sc) { struct g_raid3_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_raid3_event_cancel(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in the given state. * If state is equal to -1, count all connected disks. 
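 * (Slots in the NODISK state are not connected and are never counted,
 * even when state is -1.)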
*/ u_int g_raid3_ndisks(struct g_raid3_softc *sc, int state) { struct g_raid3_disk *disk;
u_int n, ndisks; sx_assert(&sc->sc_lock, SX_LOCKED); for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue;
if (state == -1 || disk->d_state == state) ndisks++; } return (ndisks); } static u_int
g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) { struct bio *bp;
u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int
g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) { if (cp->index > 0) {
G_RAID3_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name);
return (1); } if (g_raid3_nrequests(sc, cp) > 0) { G_RAID3_DEBUG(2,
"I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); }
return (0); } static void g_raid3_destroy_consumer(void *arg, int flags __unused) {
struct g_consumer *cp; g_topology_assert(); cp = arg; G_RAID3_DEBUG(1, "Consumer %s destroyed.",
cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { struct g_provider *pp;
int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_raid3_is_busy(sc, cp)) return;
G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); pp = cp->provider;
retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0)
retaste_wait = 1; } G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw,
-cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /*
 * After the retaste event was sent (inside g_access()), we can send
 * the event to detach and destroy the consumer.
 * A class which has a consumer connected to the given provider
 * will not receive a retaste event for that provider.
 * This is how I ignore retaste events when I close consumers
 * opened for write: I detach and destroy the consumer after the
 * retaste event is sent.
 */ g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL); return; }
G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); }
static int g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) {
struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL,
("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock();
cp = g_new_consumer(disk->d_softc->sc_geom); error = g_attach(cp, pp); if (error != 0) {
g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1);
g_topology_unlock(); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); G_RAID3_DEBUG(0,
"Cannot open consumer %s (error=%d).", pp->name, error); return (error); }
disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0;
G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); return (0); } static void
g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) {
g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL)
g_raid3_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /*
 * Initialize disk. This means allocate memory, create consumer, attach it
 * to the provider and open access (r1w1e1) to it.
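 * Here r1w1e1 means one read, one write and one exclusive-access
 * reference on the consumer, matching the g_access(cp, 1, 1, 1) call in
 * g_raid3_connect_disk() above.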
*/ static struct g_raid3_disk * g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md, int *errorp) { struct g_raid3_disk *disk; int error; disk = &sc->sc_disks[md->md_no]; error = g_raid3_connect_disk(disk, pp); if (error != 0) { if (errorp != NULL) *errorp = error; return (NULL); } disk->d_state = G_RAID3_DISK_STATE_NONE; disk->d_flags = md->md_dflags; if (md->md_provider[0] != '\0') disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); } static void g_raid3_destroy_disk(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); if (disk->d_state == G_RAID3_DISK_STATE_NODISK) return; g_raid3_event_cancel(disk); switch (disk->d_state) { case G_RAID3_DISK_STATE_SYNCHRONIZING: if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); /* FALLTHROUGH */ case G_RAID3_DISK_STATE_NEW: case G_RAID3_DISK_STATE_STALE: case G_RAID3_DISK_STATE_ACTIVE: g_topology_lock(); g_raid3_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); disk->d_consumer = NULL; break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } disk->d_state = G_RAID3_DISK_STATE_NODISK; } static void g_raid3_destroy_device(struct g_raid3_softc *sc) { struct g_raid3_event *ep; struct g_raid3_disk *disk; struct g_geom *gp; struct g_consumer *cp; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state != G_RAID3_DISK_STATE_NODISK) { disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); g_raid3_destroy_disk(disk); } } while ((ep = g_raid3_event_get(sc)) != NULL) { g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer); g_topology_lock(); if (cp != NULL) g_raid3_disconnect_consumer(sc, cp); g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); g_topology_unlock(); if (!g_raid3_use_malloc) { uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); } mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); } static void g_raid3_orphan(struct g_consumer *cp) { struct g_raid3_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } static int g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = 
disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO); if (md != NULL) raid3_metadata_encode(md, sector); error = g_write_data(cp, offset, sector, length); free(sector, M_RAID3); if (error != 0) { if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { G_RAID3_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_raid3_get_diskname(disk), sc->sc_name, error); disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; } else { G_RAID3_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_raid3_get_diskname(disk), sc->sc_name, error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } return (error); } int g_raid3_clear_metadata(struct g_raid3_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); error = g_raid3_write_metadata(disk, NULL); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s cleared.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } return (error); } void g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_provider *pp; sc = disk->d_softc; strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic)); md->md_version = G_RAID3_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_id = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK); md->md_no = disk->d_no; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK); if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = 0; else { md->md_sync_offset = disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1); } if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL) pp = disk->d_consumer->provider; else pp = NULL; if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL) strlcpy(md->md_provider, pp->name, sizeof(md->md_provider)); else bzero(md->md_provider, sizeof(md->md_provider)); if (pp != NULL) md->md_provsize = pp->mediasize; else md->md_provsize = 0; } void g_raid3_update_metadata(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); g_raid3_fill_metadata(disk, &md); error = g_raid3_write_metadata(disk, &md); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s updated.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } } static void g_raid3_bump_syncid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_RAID3_DEBUG(1, "Device 
%s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_raid3_update_metadata(disk); } } } static void g_raid3_bump_genid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_raid3_update_metadata(disk); } } } static int g_raid3_idle(struct g_raid3_softc *sc, int acw) { struct g_raid3_disk *disk; u_int i; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write); if (!g_raid3_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } return (0); } static void g_raid3_unidle(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int i; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } } /* * Treat bio_driver1 field in parent bio as list head and field bio_caller1 * in child bio as pointer to the next element on the list. 
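 * Illustratively: G_RAID3_HEAD_BIO(pbp) points to the first clone,
 * each clone's G_RAID3_NEXT_BIO() points to the next one, and NULL
 * terminates the chain; the macros below wrap these two fields as
 * list-style accessors.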
*/ #define G_RAID3_HEAD_BIO(pbp) (pbp)->bio_driver1 #define G_RAID3_NEXT_BIO(cbp) (cbp)->bio_caller1 #define G_RAID3_FOREACH_BIO(pbp, bp) \ for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL; \ (bp) = G_RAID3_NEXT_BIO(bp)) #define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp) \ for ((bp) = G_RAID3_HEAD_BIO(pbp); \ (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1); \ (bp) = (tmpbp)) static void g_raid3_init_bio(struct bio *pbp) { G_RAID3_HEAD_BIO(pbp) = NULL; } static void g_raid3_remove_bio(struct bio *cbp) { struct bio *pbp, *bp; pbp = cbp->bio_parent; if (G_RAID3_HEAD_BIO(pbp) == cbp) G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == cbp) { G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); break; } } } G_RAID3_NEXT_BIO(cbp) = NULL; } static void g_raid3_replace_bio(struct bio *sbp, struct bio *dbp) { struct bio *pbp, *bp; g_raid3_remove_bio(sbp); pbp = dbp->bio_parent; G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp); if (G_RAID3_HEAD_BIO(pbp) == dbp) G_RAID3_HEAD_BIO(pbp) = sbp; else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == dbp) { G_RAID3_NEXT_BIO(bp) = sbp; break; } } } G_RAID3_NEXT_BIO(dbp) = NULL; } static void g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp) { struct bio *bp, *pbp; size_t size; pbp = cbp->bio_parent; pbp->bio_children--; KASSERT(cbp->bio_data != NULL, ("NULL bio_data")); size = pbp->bio_length / (sc->sc_ndisks - 1); g_raid3_free(sc, cbp->bio_data, size); if (G_RAID3_HEAD_BIO(pbp) == cbp) { G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); G_RAID3_NEXT_BIO(cbp) = NULL; g_destroy_bio(cbp); } else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == cbp) break; } if (bp != NULL) { KASSERT(G_RAID3_NEXT_BIO(bp) != NULL, ("NULL bp->bio_driver1")); G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); G_RAID3_NEXT_BIO(cbp) = NULL; } g_destroy_bio(cbp); } } static struct bio * g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp) { struct bio *bp, *cbp; size_t size; int memflag; cbp = g_clone_bio(pbp); if (cbp == NULL) return (NULL); size = pbp->bio_length / (sc->sc_ndisks - 1); if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) memflag = M_WAITOK; else memflag = M_NOWAIT; cbp->bio_data = g_raid3_alloc(sc, size, memflag); if (cbp->bio_data == NULL) { pbp->bio_children--; g_destroy_bio(cbp); return (NULL); } G_RAID3_NEXT_BIO(cbp) = NULL; if (G_RAID3_HEAD_BIO(pbp) == NULL) G_RAID3_HEAD_BIO(pbp) = cbp; else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == NULL) { G_RAID3_NEXT_BIO(bp) = cbp; break; } } } return (cbp); } static void g_raid3_scatter(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *bp, *cbp, *tmpbp; off_t atom, cadd, padd, left; int first; sc = pbp->bio_to->geom->softc; bp = NULL; if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { /* * Find bio for which we should calculate data. */ G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { bp = cbp; break; } } KASSERT(bp != NULL, ("NULL parity bio.")); } atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); cadd = padd = 0; for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { G_RAID3_FOREACH_BIO(pbp, cbp) { if (cbp == bp) continue; bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom); padd += atom; } cadd += atom; } if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { /* * Calculate parity. 
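 * The parity strip is the XOR of all data strips: the first data
 * buffer is copied into the parity buffer and every following one is
 * XORed in (the first/g_raid3_xor() logic below).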
*/ first = 1; G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { if (cbp == bp) continue; if (first) { bcopy(cbp->bio_data, bp->bio_data, bp->bio_length); first = 0; } else { g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_length); } if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0) g_raid3_destroy_bio(sc, cbp); } } G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { struct g_consumer *cp; disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } } static void g_raid3_gather(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *xbp, *fbp, *cbp; off_t atom, cadd, padd, left; sc = pbp->bio_to->geom->softc; /* * Find bio for which we have to calculate data. * While going through this path, check if all requests * succeeded, if not, deny whole request. * If we're in COMPLETE mode, we allow one request to fail, * so if we find one, we're sending it to the parity consumer. * If there are more failed requests, we deny whole request. */ xbp = fbp = NULL; G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { KASSERT(xbp == NULL, ("More than one parity bio.")); xbp = cbp; } if (cbp->bio_error == 0) continue; /* * Found failed request. */ if (fbp == NULL) { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) { /* * We are already in degraded mode, so we can't * accept any failures. */ if (pbp->bio_error == 0) pbp->bio_error = cbp->bio_error; } else { fbp = cbp; } } else { /* * Next failed request, that's too many. */ if (pbp->bio_error == 0) pbp->bio_error = fbp->bio_error; } disk = cbp->bio_caller2; if (disk == NULL) continue; if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).", cbp->bio_error); } else { G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).", cbp->bio_error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } if (pbp->bio_error != 0) goto finish; if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY; if (xbp != fbp) g_raid3_replace_bio(xbp, fbp); g_raid3_destroy_bio(sc, fbp); } else if (fbp != NULL) { struct g_consumer *cp; /* * One request failed, so send the same request to * the parity consumer. */ disk = pbp->bio_driver2; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { pbp->bio_error = fbp->bio_error; goto finish; } pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_inbed--; fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR); if (disk->d_no == sc->sc_ndisks - 1) fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; fbp->bio_error = 0; fbp->bio_completed = 0; fbp->bio_children = 0; fbp->bio_inbed = 0; cp = disk->d_consumer; fbp->bio_caller2 = disk; fbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, fbp, "Sending request (recover)."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(fbp, cp); return; } if (xbp != NULL) { /* * Calculate parity. 
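 * Here the XOR runs the other way: XORing all surviving data
 * components into xbp reconstructs the missing component, or, in
 * VERIFY mode, must leave the parity buffer all zeroes if the
 * stripe is consistent.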
*/ G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) continue; g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_length); } xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY; if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { if (!g_raid3_is_zero(xbp)) { g_raid3_parity_mismatch++; pbp->bio_error = EIO; goto finish; } g_raid3_destroy_bio(sc, xbp); } } atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); cadd = padd = 0; for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { G_RAID3_FOREACH_BIO(pbp, cbp) { bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom); pbp->bio_completed += atom; padd += atom; } cadd += atom; } finish: if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) G_RAID3_LOGREQ(1, pbp, "Verification error."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); } pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK; while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); g_io_deliver(pbp, pbp->bio_error); } static void g_raid3_done(struct bio *bp) { struct g_raid3_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR; G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); } static void g_raid3_regular_request(struct bio *cbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *pbp; g_topology_assert_not(); pbp = cbp->bio_parent; sc = pbp->bio_to->geom->softc; cbp->bio_from->index--; if (cbp->bio_cmd == BIO_WRITE) sc->sc_writes--; disk = cbp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_raid3_kill_consumer(sc, cbp->bio_from); g_topology_unlock(); } G_RAID3_LOGREQ(3, cbp, "Request finished."); pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (pbp->bio_inbed != pbp->bio_children) return; switch (pbp->bio_cmd) { case BIO_READ: g_raid3_gather(pbp); break; case BIO_WRITE: case BIO_DELETE: { int error = 0; pbp->bio_completed = pbp->bio_length; while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) { if (cbp->bio_error == 0) { g_raid3_destroy_bio(sc, cbp); continue; } if (error == 0) error = cbp->bio_error; else if (pbp->bio_error == 0) { /* * Next failed request, that's too many. */ pbp->bio_error = error; } disk = cbp->bio_caller2; if (disk == NULL) { g_raid3_destroy_bio(sc, cbp); continue; } if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).", cbp->bio_error); } else { G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).", cbp->bio_error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } g_raid3_destroy_bio(sc, cbp); } if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY; bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. 
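 * (A sync request that was delayed because it overlapped this
 * just-completed write may no longer collide with anything left in
 * sc_inflight.)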
*/ g_raid3_sync_release(sc); g_io_deliver(pbp, pbp->bio_error); break; } } }
static void g_raid3_sync_done(struct bio *bp) { struct g_raid3_softc *sc;
G_RAID3_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc;
bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC; mtx_lock(&sc->sc_queue_mtx);
bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc);
wakeup(&sc->sc_queue); } static void g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp) {
struct bio_queue_head queue; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp;
u_int i; bioq_init(&queue); for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i];
if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp);
if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM;
g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp);
cbp->bio_done = g_std_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; }
for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp);
G_RAID3_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL;
cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace));
g_io_request(cbp, disk->d_consumer); } } static void g_raid3_start(struct bio *bp) {
struct g_raid3_softc *sc; sc = bp->bio_to->geom->softc; /*
 * If sc == NULL or there are no valid disks, the provider's error
 * should be set and g_raid3_start() should not be called at all.
 */ KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name));
G_RAID3_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ:
case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_raid3_flush(sc, bp); return;
case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx);
bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx);
G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /*
 * Return TRUE if the given request is colliding with an in-progress
 * synchronization request.
 */ static int g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp) {
struct g_raid3_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; int i;
disk = sc->sc_syncdisk; if (disk == NULL) return (0); rstart = bp->bio_offset;
rend = bp->bio_offset + bp->bio_length; for (i = 0; i < g_raid3_syncreqs; i++) {
sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset;
send = sbp->bio_length; if (sbp->bio_cmd == BIO_WRITE) { sstart *= sc->sc_ndisks - 1;
send *= sc->sc_ndisks - 1; } send += sstart; if (rend > sstart && rstart < send) return (1); }
return (0); } /*
 * Return TRUE if the given sync request is colliding with an in-progress
 * regular request.
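 * (The same interval-overlap test as above with the roles swapped; the
 * sync READ request's offset is already expressed in parent-provider
 * units, as set in g_raid3_sync_request() below, so no scaling by
 * sc_ndisks - 1 is needed here.)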
*/ static int g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp) {
off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_syncdisk == NULL) return (0);
sstart = sbp->bio_offset; send = sstart + sbp->bio_length;
TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { rstart = bp->bio_offset;
rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (1); }
return (0); } /*
 * Puts a request onto the delayed queue.
 */ static void g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp) {
G_RAID3_LOGREQ(2, bp, "Delaying request."); bioq_insert_head(&sc->sc_regular_delayed, bp); } /*
 * Puts a synchronization request onto the delayed queue.
 */ static void g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp) {
G_RAID3_LOGREQ(2, bp, "Delaying synchronization request.");
bioq_insert_tail(&sc->sc_sync_delayed, bp); } /*
 * Releases delayed regular requests which no longer collide with sync
 * requests.
 */ static void g_raid3_regular_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2;
TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
if (g_raid3_sync_collision(sc, bp)) continue; bioq_remove(&sc->sc_regular_delayed, bp);
G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); mtx_lock(&sc->sc_queue_mtx);
bioq_insert_head(&sc->sc_queue, bp);
#if 0
 /* * wakeup() is not needed, because this function is called from * the worker thread. */ wakeup(&sc->sc_queue);
#endif
mtx_unlock(&sc->sc_queue_mtx); } } /*
 * Releases delayed sync requests which no longer collide with regular
 * requests.
 */ static void g_raid3_sync_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2;
TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) {
if (g_raid3_regular_collision(sc, bp)) continue; bioq_remove(&sc->sc_sync_delayed, bp);
G_RAID3_LOGREQ(2, bp, "Releasing delayed synchronization request.");
g_io_request(bp, bp->bio_from); } } /*
 * Handle synchronization requests.
 * Every synchronization request is a two-step process: first, a READ
 * request is sent to the active provider and then a WRITE request (with
 * the read data) to the provider being synchronized. When the WRITE is
 * finished, a new synchronization request is sent.
 */ static void g_raid3_sync_request(struct bio *bp) { struct g_raid3_softc *sc;
struct g_raid3_disk *disk; bp->bio_from->index--; sc = bp->bio_from->geom->softc;
disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
g_topology_lock(); g_raid3_kill_consumer(sc, bp->bio_from); g_topology_unlock();
free(bp->bio_data, M_RAID3); g_destroy_bio(bp); sx_xlock(&sc->sc_lock); return; } /*
 * Synchronization request.
 */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; u_char *dst, *src;
off_t left; u_int atom; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp,
"Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); return; }
G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); dst = src = bp->bio_data;
if (disk->d_no == sc->sc_ndisks - 1) { u_int n; /* Parity component. */
for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom);
src += atom; for (n = 1; n < sc->sc_ndisks - 1; n++) { g_raid3_xor(src, dst, atom);
src += atom; } dst += atom; } } else { /* Regular component.
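 * Each parent-sized sector read contributes one atom-sized slice to
 * this component: src is first advanced to this disk's slice
 * (atom * d_no), then stepped by a full sector while dst steps by one
 * atom.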
*/ src += atom * disk->d_no; for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += sc->sc_sectorsize; dst += atom; } } bp->bio_driver1 = bp->bio_driver2 = NULL; bp->bio_pflags = 0; bp->bio_offset /= sc->sc_ndisks - 1; bp->bio_length /= sc->sc_ndisks - 1; bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; bp->bio_children = bp->bio_inbed = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { struct g_raid3_disk_sync *sync; off_t boffset, moffset; void *data; int i; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) || sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; if (sync->ds_bios != NULL) { i = (int)(uintptr_t)bp->bio_caller1; sync->ds_bios[i] = NULL; } free(bp->bio_data, M_RAID3); g_destroy_bio(bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { return; } /* * Disk up-to-date, activate it. */ g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE, G_RAID3_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ data = bp->bio_data; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_done = g_raid3_sync_done; bp->bio_data = data; bp->bio_from = sync->ds_consumer; bp->bio_to = sc->sc_provider; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_raid3_regular_collision(sc, bp)) g_raid3_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Release delayed requests if possible. */ g_raid3_regular_release(sc); /* Find the smallest offset. */ moffset = sc->sc_mediasize; for (i = 0; i < g_raid3_syncreqs; i++) { bp = sync->ds_bios[i]; boffset = bp->bio_offset; if (bp->bio_cmd == BIO_WRITE) boffset *= sc->sc_ndisks - 1; if (boffset < moffset) moffset = boffset; } if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) { /* Update offset_done on every 100 blocks. 
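 *
 * ("Blocks" here means MAXPHYS-sized sync requests: the checkpoint is
 * persisted only after the slowest in-flight request has advanced by
 * about 100 * MAXPHYS bytes.)  A reduced restatement of the rule with
 * hypothetical names; WRITE offsets are scaled back to provider units
 * first:
 *
 *	off_t moffset = mediasize;
 *	for (int i = 0; i < nreqs; i++) {
 *		off_t o = bios[i]->bio_offset;
 *		if (bios[i]->bio_cmd == BIO_WRITE)
 *			o *= ndisks - 1;
 *		if (o < moffset)
 *			moffset = o;
 *	}
 *	if (done + (off_t)MAXPHYS * 100 < moffset)
 *		done = moffset;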
*/ sync->ds_offset_done = moffset; g_raid3_update_metadata(disk); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_raid3_register_request(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp, *tmpbp; off_t offset, length; u_int n, ndisks; int round_robin, verify; ndisks = 0; sc = pbp->bio_to->geom->softc; if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 && sc->sc_syncdisk == NULL) { g_io_deliver(pbp, EIO); return (0); } g_raid3_init_bio(pbp); length = pbp->bio_length / (sc->sc_ndisks - 1); offset = pbp->bio_offset / (sc->sc_ndisks - 1); round_robin = verify = 0; switch (pbp->bio_cmd) { case BIO_READ: if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY; verify = 1; ndisks = sc->sc_ndisks; } else { verify = 0; ndisks = sc->sc_ndisks - 1; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { round_robin = 1; } else { round_robin = 0; } KASSERT(!round_robin || !verify, ("ROUND-ROBIN and VERIFY are mutually exclusive.")); pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1]; break; case BIO_WRITE: case BIO_DELETE: /* * Delay the request if it is colliding with a synchronization * request. */ if (g_raid3_sync_collision(sc, pbp)) { g_raid3_regular_delay(sc, pbp); return (0); } if (sc->sc_idle) g_raid3_unidle(sc); else sc->sc_last_write = time_uptime; ndisks = sc->sc_ndisks; break; } for (n = 0; n < ndisks; n++) { disk = &sc->sc_disks[n]; cbp = g_raid3_clone_bio(sc, pbp); if (cbp == NULL) { while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); /* * To prevent deadlock, we must run back up * with the ENOMEM for failed requests of any * of our consumers. Our own sync requests * can stick around, as they are finite. */ if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) { g_io_deliver(pbp, ENOMEM); return (0); } return (ENOMEM); } cbp->bio_offset = offset; cbp->bio_length = length; cbp->bio_done = g_raid3_done; switch (pbp->bio_cmd) { case BIO_READ: if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { /* * Replace invalid component with the parity * component. */ disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; } else if (round_robin && disk->d_no == sc->sc_round_robin) { /* * In round-robin mode skip one data component * and use parity component when reading. */ pbp->bio_driver2 = disk; disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; sc->sc_round_robin++; round_robin = 0; } else if (verify && disk->d_no == sc->sc_ndisks - 1) { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } break; case BIO_WRITE: case BIO_DELETE: if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { if (n == ndisks - 1) { /* * Active parity component, mark it as such. */ cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } } else { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; if (n == ndisks - 1) { /* * Parity component is not connected, * so destroy its request. 
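 *
 * For the read path the same degraded logic relies on RAID3's XOR
 * identity: any single missing data column equals the XOR of the
 * surviving data columns and the parity column.  The actual rebuild
 * happens in the request-completion path (not shown in this hunk); a
 * self-contained sketch of the core operation:
 *
 *	static void
 *	xor_blocks(unsigned char *dst, const unsigned char *src, size_t len)
 *	{
 *		while (len-- > 0)
 *			*dst++ ^= *src++;
 *	}
 *
 * Starting from a copy of one surviving column and accumulating
 * xor_blocks() over the rest (parity included) leaves the missing
 * column's contents in dst.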
*/ pbp->bio_pflags |= G_RAID3_BIO_PFLAG_NOPARITY; g_raid3_destroy_bio(sc, cbp); cbp = NULL; } else { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_NODISK; disk = NULL; } } break; } if (cbp != NULL) cbp->bio_caller2 = disk; } switch (pbp->bio_cmd) { case BIO_READ: if (round_robin) { /* * If we are in round-robin mode and 'round_robin' is * still 1, it means that we skipped the parity component * for this read and must reset the sc_round_robin field. */ sc->sc_round_robin = 0; } G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } break; case BIO_WRITE: case BIO_DELETE: /* * Put the request onto the inflight queue, so we can check * whether new synchronization requests collide with it. */ bioq_insert_tail(&sc->sc_inflight, pbp); /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; g_raid3_bump_syncid(sc); } g_raid3_scatter(pbp); break; } return (0); } static int g_raid3_can_destroy(struct g_raid3_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_raid3_try_destroy(struct g_raid3_softc *sc) { g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_raid3_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) { g_topology_unlock(); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); } return (1); } /* * Worker thread. */ static void g_raid3_worker(void *arg) { struct g_raid3_softc *sc; struct g_raid3_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_RAID3_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_raid3_event_get(sc); if (ep != NULL) { g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) { /* Update only device status. */ G_RAID3_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_raid3_update_device(sc, 1); } else { /* Update disk status.
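 *
 * The DONTWAIT/DONE handling just below pairs with the waiting side in
 * g_raid3_event_send() (not shown in this hunk).  A sketch of what that
 * waiting side plausibly looks like, assuming the same events mutex and
 * DONE flag (the wait-message string is our own):
 *
 *	mtx_lock(&sc->sc_events_mtx);
 *	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0)
 *		msleep(ep, &sc->sc_events_mtx, PRIBIO, "r3:event", 0);
 *	mtx_unlock(&sc->sc_events_mtx);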
*/ G_RAID3_DEBUG(3, "Running event for disk %s.", g_raid3_get_diskname(ep->e_disk)); ep->e_error = g_raid3_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_raid3_update_device(sc, 0); } if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_raid3_event_free(ep); } else { ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { if (g_raid3_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_RAID3_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_raid3_idle(sc, -1); /* * Now I/O requests. */ /* Get first request from the queue. */ mtx_lock(&sc->sc_queue_mtx); bp = bioq_first(&sc->sc_queue); if (bp == NULL) { if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_raid3_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_RAID3_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); } sx_xunlock(&sc->sc_lock); /* * XXX: We can miss an event here, because an event * can be added without sx-device-lock and without * mtx-queue-lock. Maybe I should just stop using * dedicated mutex for events synchronization and * stick with the queue lock? * The event will hang here until next I/O request * or next event is received. */ MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__); continue; } process: bioq_remove(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) { g_raid3_sync_request(bp); /* READ */ } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) g_raid3_regular_request(bp); else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) g_raid3_sync_request(bp); /* WRITE */ else { KASSERT(0, ("Invalid request cflags=0x%hx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else if (g_raid3_register_request(bp) != 0) { mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); /* * We are short in memory, let see if there are finished * request we can free. */ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) goto process; } /* * No finished regular request, so at least keep * synchronization running. 
*/ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) goto process; } sx_xunlock(&sc->sc_lock); MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:lowmem", hz / 10); sx_xlock(&sc->sc_lock); } G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; } } static void g_raid3_sync_start(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *bp; int error; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Device not in DEGRADED state (%s, %u).", sc->sc_name, sc->sc_state)); KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).", sc->sc_name, sc->sc_state)); disk = NULL; for (n = 0; n < sc->sc_ndisks; n++) { if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) continue; disk = &sc->sc_disks[n]; break; } if (disk == NULL) return; sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_raid3_get_diskname(disk)); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_raid3_get_diskname(disk))); disk->d_sync.ds_consumer = cp; disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; sc->sc_syncdisk = disk; /* * Allocate memory for synchronization bios and initialize them. */ disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs, M_RAID3, M_WAITOK); for (n = 0; n < g_raid3_syncreqs; n++) { bp = g_alloc_bio(); disk->d_sync.ds_bios[n] = bp; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK); bp->bio_cflags = 0; bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_done = g_raid3_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)n; } /* Set the number of in-flight synchronization requests. */ disk->d_sync.ds_inflight = g_raid3_syncreqs; /* * Fire off first synchronization requests. */ for (n = 0; n < g_raid3_syncreqs; n++) { bp = disk->d_sync.ds_bios[n]; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. 
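 *
 * For reference, the sizing arithmetic used when these requests were
 * built above, restated with hypothetical names: ds_offset advances in
 * per-disk units, while bio_offset and bio_length are in provider
 * units, which are (ndisks - 1) times larger:
 *
 *	off_t prov_off = ds_offset * (ndisks - 1);
 *	off_t len = MIN((off_t)MAXPHYS, mediasize - prov_off);
 *	ds_offset += len / (ndisks - 1);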
*/ if (g_raid3_regular_collision(sc, bp)) g_raid3_sync_delay(sc, bp); else g_io_request(bp, disk->d_sync.ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type) { struct g_raid3_disk *disk; struct g_consumer *cp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Device not in DEGRADED state (%s, %u).", sc->sc_name, sc->sc_state)); disk = sc->sc_syncdisk; sc->sc_syncdisk = NULL; KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name)); KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_raid3_get_diskname(disk)); } else /* if (type == 1) */ { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_raid3_get_diskname(disk)); } free(disk->d_sync.ds_bios, M_RAID3); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_raid3_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_raid3_launch_provider(struct g_raid3_softc *sc) { struct g_provider *pp; struct g_raid3_disk *disk; int n; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name); pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_consumer && disk->d_consumer->provider && disk->d_consumer->provider->stripesize > pp->stripesize) { pp->stripesize = disk->d_consumer->provider->stripesize; pp->stripeoffset = disk->d_consumer->provider->stripeoffset; } } pp->stripesize *= sc->sc_ndisks - 1; pp->stripeoffset *= sc->sc_ndisks - 1; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks); if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED) g_raid3_sync_start(sc); } static void g_raid3_destroy_provider(struct g_raid3_softc *sc) { struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_first(&sc->sc_queue)) != NULL) { bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, sc->sc_provider->name); g_wither_provider(sc->sc_provider, ENXIO); g_topology_unlock(); sc->sc_provider = NULL; if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); } static void g_raid3_go(void *arg) { struct g_raid3_softc *sc; sc = arg; G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_raid3_event_send(sc, 0, G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE); } static u_int g_raid3_determine_state(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) { /* Disk 
does not need synchronization. */ state = G_RAID3_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because even if it was synchronized, it was * synchronized to disks with a different syncid. */ disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that the device was started on stale disks * and a fresher disk has just arrived. * If there were writes, the device is broken, sorry. * I think the best choice here is not to touch * this disk and to inform the user loudly. */ G_RAID3_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrived! It will not be connected to the " "running device.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); state = G_RAID3_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_RAID3_DEBUG(3, "State for %s disk: %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force) { struct g_raid3_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_RAID3_DEVICE_STATE_STARTING: { u_int n, ndirty, ndisks, genid, syncid; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * one disk is missing and 'force' is true. */ if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) { if (!force) callout_drain(&sc->sc_callout); } else { if (force) { /* * Timeout expired, so destroy device. */ sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } return; } /* * Find the biggest genid. */ genid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid < genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", g_raid3_get_diskname(disk), sc->sc_name); g_raid3_destroy_disk(disk); } } /* * There must be at least 'sc->sc_ndisks - 1' components * with the same syncid and without SYNCHRONIZING flag. */ /* * Find the biggest syncid, number of valid components and * number of dirty components.
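 *
 * A reduced sketch of the one-pass election below (DIRTY counting
 * omitted), over a hypothetical array of connected components:
 *
 *	u_int syncid = 0, nvalid = 0;
 *	for (u_int n = 0; n < ndisks; n++) {
 *		if (ds[n].syncid > syncid) {
 *			syncid = ds[n].syncid;
 *			nvalid = 0;
 *		} else if (ds[n].syncid < syncid)
 *			continue;
 *		if (!ds[n].synchronizing)
 *			nvalid++;
 *	}
 *
 * Seeing a larger syncid restarts the count, so nvalid ends up counting
 * only non-SYNCHRONIZING components that carry the largest syncid.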
*/ ndirty = ndisks = syncid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) ndirty++; if (disk->d_sync.ds_syncid > syncid) { syncid = disk->d_sync.ds_syncid; ndisks = 0; } else if (disk->d_sync.ds_syncid < syncid) { continue; } if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; } /* * Do we have enough valid components? */ if (ndisks + 1 < sc->sc_ndisks) { G_RAID3_DEBUG(0, "Device %s is broken, too few valid components.", sc->sc_name); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } /* * If there is one DIRTY component and all disks are present, * mark it for synchronization. If there is more than one DIRTY * component, mark the parity component for synchronization. */ if (ndisks == sc->sc_ndisks && ndirty == 1) { for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } } else if (ndisks == sc->sc_ndisks && ndirty > 1) { disk = &sc->sc_disks[sc->sc_ndisks - 1]; disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } sc->sc_syncid = syncid; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } if (ndisks == sc->sc_ndisks) state = G_RAID3_DEVICE_STATE_COMPLETE; else /* if (ndisks == sc->sc_ndisks - 1) */ state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; state = g_raid3_determine_state(disk); g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT); if (state == G_RAID3_DISK_STATE_STALE) sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } break; } case G_RAID3_DEVICE_STATE_DEGRADED: /* * Genid needs to be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks) { state = G_RAID3_DEVICE_STATE_COMPLETE; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; case G_RAID3_DEVICE_STATE_COMPLETE: /* * Genid needs to be bumped immediately, so do it here.
*/ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >= sc->sc_ndisks - 1, ("Too few ACTIVE components in COMPLETE state (device %s).", sc->sc_name)); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks - 1) { state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_RAID3_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_raid3_get_diskname(disk), \ g_raid3_disk_state2str(disk->d_state), \ g_raid3_disk_state2str(state), sc->sc_name) static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state) { struct g_raid3_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state), g_raid3_disk_state2str(state)); switch (state) { case G_RAID3_DISK_STATE_NEW: /* * Possible scenarios: * 1. A new disk arrives. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; G_RAID3_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_raid3_get_diskname(disk)); if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); state = g_raid3_determine_state(disk); if (state != G_RAID3_DISK_STATE_NONE) goto again; break; case G_RAID3_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. A new disk does not need synchronization. * 2. The synchronization process finished successfully. */ KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING.
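 *
 * For orientation, the full set of disk state transitions enforced by
 * the KASSERTs in this function:
 *
 *	NONE          -> NEW
 *	NEW           -> ACTIVE | STALE | SYNCHRONIZING | DISCONNECTED
 *	SYNCHRONIZING -> ACTIVE | DISCONNECTED
 *	ACTIVE        -> DISCONNECTED
 *	STALE         -> DISCONNECTED
 *
 * (NEW -> DISCONNECTED is only legal while the device is STARTING.)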
*/ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC; g_raid3_sync_stop(sc, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_raid3_update_idle(sc, disk); g_raid3_update_metadata(disk); G_RAID3_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_STALE: /* * Possible scenarios: * 1. A stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * STALE state is only possible if the device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; g_raid3_update_metadata(disk); G_RAID3_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. A disk which needs synchronization was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_NEW) disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_raid3_sync_start(sc); g_raid3_update_metadata(disk); } break; case G_RAID3_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. The device wasn't running yet, but a disk disappeared. * 2. A disk was active and disappeared. * 3. A disk disappeared during the synchronization process. */ if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_STALE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state.
*/ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); break; default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = raid3_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0) return (EINVAL); if (md->md_version > G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } if (md->md_sectorsize > MAXPHYS) { G_RAID3_DEBUG(0, "The blocksize is too big."); return (EINVAL); } return (0); } static int g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { if (md->md_no >= sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.", pp->name, md->md_no); return (EINVAL); } if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) { G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.", pp->name, md->md_no); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % md->md_sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != " "0) on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_mediasize != sc->sc_mediasize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { G_RAID3_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) { 
G_RAID3_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { /* * VERIFY and ROUND-ROBIN options are mutally exclusive. */ G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on " "disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) { G_RAID3_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { struct g_raid3_disk *disk; int error; g_topology_assert_not(); G_RAID3_DEBUG(2, "Adding disk %s.", pp->name); error = g_raid3_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING && md->md_genid < sc->sc_genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_raid3_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW, G_RAID3_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_RAID3_VERSION); g_raid3_update_metadata(disk); } return (0); } static void g_raid3_destroy_delayed(void *arg, int flag) { struct g_raid3_softc *sc; int error; if (flag == EV_CANCEL) { G_RAID3_DEBUG(1, "Destroying canceled."); return; } sc = arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0, ("DESTROYING flag not set on %s.", sc->sc_name)); G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name); error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT); if (error != 0) { G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_raid3_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_raid3_softc *sc; int dcr, dcw, dce, error = 0; g_topology_assert(); G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->geom->softc; if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0) return (0); KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 || g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } if (dcw == 0) g_raid3_idle(sc, dcw); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) { if (acr > 0 || acw > 0 || ace > 0) { error = ENXIO; goto end; } if (dcr == 0 && dcw == 0 && dce == 0) { g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK, sc, NULL); } } end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static struct g_geom * g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_geom *gp; int error, timeout; u_int n; g_topology_assert(); G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. 
*/ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO); sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3, M_WAITOK | M_ZERO); gp->start = g_raid3_start; gp->orphan = g_raid3_orphan; gp->access = g_raid3_access; gp->dumpconf = g_raid3_dumpconf; sc->sc_id = md->md_id; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_round_robin = 0; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; for (n = 0; n < sc->sc_ndisks; n++) { sc->sc_disks[n].d_softc = sc; sc->sc_disks[n].d_no = n; sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK; } sx_init(&sc->sc_lock, "graid3:lock"); bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF); bioq_init(&sc->sc_regular_delayed); bioq_init(&sc->sc_inflight); bioq_init(&sc->sc_sync_delayed); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, 1); sc->sc_state = G_RAID3_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; /* * Synchronization geom. */ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_raid3_orphan; sc->sc_sync.ds_geom = gp; if (!g_raid3_use_malloc) { sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k", 65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k; sc->sc_zones[G_RAID3_ZONE_64K].sz_requested = sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0; sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k", 16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k; sc->sc_zones[G_RAID3_ZONE_16K].sz_requested = sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0; sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k", 4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k; sc->sc_zones[G_RAID3_ZONE_4K].sz_requested = sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0; } error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0, "g_raid3 %s", md->md_name); if (error != 0) { G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); if (!g_raid3_use_malloc) { uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); } g_destroy_geom(sc->sc_sync.ds_geom); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (NULL); } G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GRAID3"); G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. 
*/ timeout = atomic_load_acq_int(&g_raid3_timeout); callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc); return (sc->sc_geom); } int g_raid3_destroy(struct g_raid3_softc *sc, int how) { struct g_provider *pp; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { switch (how) { case G_RAID3_DESTROY_SOFT: G_RAID3_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); case G_RAID3_DESTROY_DELAYED: G_RAID3_DEBUG(1, "Device %s will be destroyed on last close.", pp->name); if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING; return (EBUSY); case G_RAID3_DESTROY_HARD: G_RAID3_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", pp->name); break; } } g_topology_lock(); if (sc->sc_geom->softc == NULL) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; g_topology_unlock(); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5); G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (0); } static void g_raid3_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_raid3_metadata md; struct g_raid3_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_RAID3_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "raid3:taste"); /* This orphan function should never be called. */ gp->orphan = g_raid3_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_raid3_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_raid3_debug >= 2) raid3_metadata_dump(&md); /* * Let's check if the device already exists.
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) { G_RAID3_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_raid3_create(mp, &md); if (gp == NULL) { G_RAID3_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); error = g_raid3_add_disk(sc, pp, &md); if (error != 0) { G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) == sc->sc_ndisks) { g_cancel_event(sc); g_raid3_destroy(sc, G_RAID3_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static int g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_raid3_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_raid3_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { struct g_raid3_disk *disk; disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s", indent); if (disk->d_no == sc->sc_ndisks - 1) sbuf_printf(sb, "PARITY"); else sbuf_printf(sb, "DATA"); sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_no); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_printf(sb, "0%%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / (sc->sc_mediasize / (sc->sc_ndisks - 1)))); } sbuf_printf(sb, "\n"); if (disk->d_sync.ds_offset > 0) { sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%s\n", indent, g_raid3_disk_state2str(disk->d_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); if (!g_raid3_use_malloc) { sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_failed); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_16K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, 
sc->sc_zones[G_RAID3_ZONE_16K].sz_failed); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_failed); } sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN, "ROUND-ROBIN"); ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s%s\n", indent, g_raid3_device_state2str(sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_raid3_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_raid3_softc *sc; int error; mp = arg; g_topology_lock(); g_raid3_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_raid3_idle(sc, -1); g_cancel_event(sc); error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); } static void g_raid3_init(struct g_class *mp) { g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_raid3_post_sync == NULL) G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_raid3_fini(struct g_class *mp) { if (g_raid3_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync); } DECLARE_GEOM_CLASS(g_raid3_class, g_raid3); Index: head/sys/geom/raid3/g_raid3.h =================================================================== --- head/sys/geom/raid3/g_raid3.h (revision 326269) +++ head/sys/geom/raid3/g_raid3.h (revision 326270) @@ -1,476 +1,478 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_RAID3_H_ #define _G_RAID3_H_ #include #include #define G_RAID3_CLASS_NAME "RAID3" #define G_RAID3_MAGIC "GEOM::RAID3" /* * Version history: * 0 - Initial version number. * 1 - Added 'round-robin reading' algorithm. * 2 - Added 'verify reading' algorithm. * 3 - Added md_genid field to metadata. * 4 - Added md_provsize field to metadata. * 5 - Added 'no failure synchronization' flag. */ #define G_RAID3_VERSION 5 #define G_RAID3_DISK_FLAG_DIRTY 0x0000000000000001ULL #define G_RAID3_DISK_FLAG_SYNCHRONIZING 0x0000000000000002ULL #define G_RAID3_DISK_FLAG_FORCE_SYNC 0x0000000000000004ULL #define G_RAID3_DISK_FLAG_HARDCODED 0x0000000000000008ULL #define G_RAID3_DISK_FLAG_BROKEN 0x0000000000000010ULL #define G_RAID3_DISK_FLAG_MASK (G_RAID3_DISK_FLAG_DIRTY | \ G_RAID3_DISK_FLAG_SYNCHRONIZING | \ G_RAID3_DISK_FLAG_FORCE_SYNC) #define G_RAID3_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL #define G_RAID3_DEVICE_FLAG_ROUND_ROBIN 0x0000000000000002ULL #define G_RAID3_DEVICE_FLAG_VERIFY 0x0000000000000004ULL #define G_RAID3_DEVICE_FLAG_NOFAILSYNC 0x0000000000000008ULL #define G_RAID3_DEVICE_FLAG_MASK (G_RAID3_DEVICE_FLAG_NOAUTOSYNC | \ G_RAID3_DEVICE_FLAG_ROUND_ROBIN | \ G_RAID3_DEVICE_FLAG_VERIFY | \ G_RAID3_DEVICE_FLAG_NOFAILSYNC) #ifdef _KERNEL extern u_int g_raid3_debug; #define G_RAID3_DEBUG(lvl, ...) do { \ if (g_raid3_debug >= (lvl)) { \ printf("GEOM_RAID3"); \ if (g_raid3_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_RAID3_LOGREQ(lvl, bp, ...) do { \ if (g_raid3_debug >= (lvl)) { \ printf("GEOM_RAID3"); \ if (g_raid3_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) #define G_RAID3_BIO_CFLAG_REGULAR 0x01 #define G_RAID3_BIO_CFLAG_SYNC 0x02 #define G_RAID3_BIO_CFLAG_PARITY 0x04 #define G_RAID3_BIO_CFLAG_NODISK 0x08 #define G_RAID3_BIO_CFLAG_REGSYNC 0x10 #define G_RAID3_BIO_CFLAG_MASK (G_RAID3_BIO_CFLAG_REGULAR | \ G_RAID3_BIO_CFLAG_SYNC | \ G_RAID3_BIO_CFLAG_PARITY | \ G_RAID3_BIO_CFLAG_NODISK | \ G_RAID3_BIO_CFLAG_REGSYNC) #define G_RAID3_BIO_PFLAG_DEGRADED 0x01 #define G_RAID3_BIO_PFLAG_NOPARITY 0x02 #define G_RAID3_BIO_PFLAG_VERIFY 0x04 #define G_RAID3_BIO_PFLAG_MASK (G_RAID3_BIO_PFLAG_DEGRADED | \ G_RAID3_BIO_PFLAG_NOPARITY | \ G_RAID3_BIO_PFLAG_VERIFY) /* * Informations needed for synchronization. */ struct g_raid3_disk_sync { struct g_consumer *ds_consumer; /* Consumer connected to our device. */ off_t ds_offset; /* Offset of next request to send. */ off_t ds_offset_done; /* Offset of already synchronized region. */ off_t ds_resync; /* Resynchronize from this offset. */ u_int ds_syncid; /* Disk's synchronization ID. */ u_int ds_inflight; /* Number of in-flight sync requests. */ struct bio **ds_bios; /* BIOs for synchronization I/O. */ }; /* * Informations needed for synchronization. */ struct g_raid3_device_sync { struct g_geom *ds_geom; /* Synchronization geom. 
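 *
 * Synchronization I/O is issued through a consumer that hangs off this
 * separate geom, which is how the worker tells sync requests apart from
 * regular ones, e.g. (verbatim from g_raid3_worker()):
 *
 *	if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
 *	    (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
 *		g_raid3_sync_request(bp);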
*/ }; #define G_RAID3_DISK_STATE_NODISK 0 #define G_RAID3_DISK_STATE_NONE 1 #define G_RAID3_DISK_STATE_NEW 2 #define G_RAID3_DISK_STATE_ACTIVE 3 #define G_RAID3_DISK_STATE_STALE 4 #define G_RAID3_DISK_STATE_SYNCHRONIZING 5 #define G_RAID3_DISK_STATE_DISCONNECTED 6 #define G_RAID3_DISK_STATE_DESTROY 7 struct g_raid3_disk { u_int d_no; /* Disk number. */ struct g_consumer *d_consumer; /* Consumer. */ struct g_raid3_softc *d_softc; /* Back-pointer to softc. */ int d_state; /* Disk state. */ uint64_t d_flags; /* Additional flags. */ u_int d_genid; /* Disk's generation ID. */ struct g_raid3_disk_sync d_sync; /* Sync information. */ LIST_ENTRY(g_raid3_disk) d_next; }; #define d_name d_consumer->provider->name #define G_RAID3_EVENT_DONTWAIT 0x1 #define G_RAID3_EVENT_WAIT 0x2 #define G_RAID3_EVENT_DEVICE 0x4 #define G_RAID3_EVENT_DONE 0x8 struct g_raid3_event { struct g_raid3_disk *e_disk; int e_state; int e_flags; int e_error; TAILQ_ENTRY(g_raid3_event) e_next; }; #define G_RAID3_DEVICE_FLAG_DESTROY 0x0100000000000000ULL #define G_RAID3_DEVICE_FLAG_WAIT 0x0200000000000000ULL #define G_RAID3_DEVICE_FLAG_DESTROYING 0x0400000000000000ULL #define G_RAID3_DEVICE_STATE_STARTING 0 #define G_RAID3_DEVICE_STATE_DEGRADED 1 #define G_RAID3_DEVICE_STATE_COMPLETE 2 /* Bump syncid on first write. */ #define G_RAID3_BUMP_SYNCID 0x1 /* Bump genid immediately. */ #define G_RAID3_BUMP_GENID 0x2 enum g_raid3_zones { G_RAID3_ZONE_64K, G_RAID3_ZONE_16K, G_RAID3_ZONE_4K, G_RAID3_NUM_ZONES }; static __inline enum g_raid3_zones g_raid3_zone(size_t nbytes) { if (nbytes > 65536) return (G_RAID3_NUM_ZONES); else if (nbytes > 16384) return (G_RAID3_ZONE_64K); else if (nbytes > 4096) return (G_RAID3_ZONE_16K); else return (G_RAID3_ZONE_4K); }; struct g_raid3_softc { u_int sc_state; /* Device state. */ uint64_t sc_mediasize; /* Device size. */ uint32_t sc_sectorsize; /* Sector size. */ uint64_t sc_flags; /* Additional flags. */ struct g_geom *sc_geom; struct g_provider *sc_provider; uint32_t sc_id; /* Device unique ID. */ struct sx sc_lock; struct bio_queue_head sc_queue; struct mtx sc_queue_mtx; struct proc *sc_worker; struct bio_queue_head sc_regular_delayed; /* Delayed I/O requests due collision with sync requests. */ struct bio_queue_head sc_inflight; /* In-flight regular write requests. */ struct bio_queue_head sc_sync_delayed; /* Delayed sync requests due collision with regular requests. */ struct g_raid3_disk *sc_disks; u_int sc_ndisks; /* Number of disks. */ u_int sc_round_robin; struct g_raid3_disk *sc_syncdisk; struct g_raid3_zone { uma_zone_t sz_zone; size_t sz_inuse; size_t sz_max; u_int sz_requested; u_int sz_failed; } sc_zones[G_RAID3_NUM_ZONES]; u_int sc_genid; /* Generation ID. */ u_int sc_syncid; /* Synchronization ID. */ int sc_bump_id; struct g_raid3_device_sync sc_sync; int sc_idle; /* DIRTY flags removed. 
*/ time_t sc_last_write; u_int sc_writes; TAILQ_HEAD(, g_raid3_event) sc_events; struct mtx sc_events_mtx; struct callout sc_callout; struct root_hold_token *sc_rootmount; }; #define sc_name sc_geom->name const char *g_raid3_get_diskname(struct g_raid3_disk *disk); u_int g_raid3_ndisks(struct g_raid3_softc *sc, int state); #define G_RAID3_DESTROY_SOFT 0 #define G_RAID3_DESTROY_DELAYED 1 #define G_RAID3_DESTROY_HARD 2 int g_raid3_destroy(struct g_raid3_softc *sc, int how); int g_raid3_event_send(void *arg, int state, int flags); struct g_raid3_metadata; int g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md); int g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md); void g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md); int g_raid3_clear_metadata(struct g_raid3_disk *disk); void g_raid3_update_metadata(struct g_raid3_disk *disk); g_ctl_req_t g_raid3_config; #endif /* _KERNEL */ struct g_raid3_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Device name. */ uint32_t md_id; /* Device unique ID. */ uint16_t md_no; /* Component number. */ uint16_t md_all; /* Number of disks in device. */ uint32_t md_genid; /* Generation ID. */ uint32_t md_syncid; /* Synchronization ID. */ uint64_t md_mediasize; /* Size of whole device. */ uint32_t md_sectorsize; /* Sector size. */ uint64_t md_sync_offset; /* Synchronized offset. */ uint64_t md_mflags; /* Additional device flags. */ uint64_t md_dflags; /* Additional disk flags. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ u_char md_hash[16]; /* MD5 hash. */ }; static __inline void raid3_metadata_encode(struct g_raid3_metadata *md, u_char *data) { MD5_CTX ctx; bcopy(md->md_magic, data, 16); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, 16); le32enc(data + 36, md->md_id); le16enc(data + 40, md->md_no); le16enc(data + 42, md->md_all); le32enc(data + 44, md->md_genid); le32enc(data + 48, md->md_syncid); le64enc(data + 52, md->md_mediasize); le32enc(data + 60, md->md_sectorsize); le64enc(data + 64, md->md_sync_offset); le64enc(data + 72, md->md_mflags); le64enc(data + 80, md->md_dflags); bcopy(md->md_provider, data + 88, 16); le64enc(data + 104, md->md_provsize); MD5Init(&ctx); MD5Update(&ctx, data, 112); MD5Final(md->md_hash, &ctx); bcopy(md->md_hash, data + 112, 16); } static __inline int raid3_metadata_decode_v0v1v2(const u_char *data, struct g_raid3_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); md->md_syncid = le32dec(data + 44); md->md_mediasize = le64dec(data + 48); md->md_sectorsize = le32dec(data + 56); md->md_sync_offset = le64dec(data + 60); md->md_mflags = le64dec(data + 68); md->md_dflags = le64dec(data + 76); bcopy(data + 84, md->md_provider, 16); bcopy(data + 100, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 100); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 100, 16) != 0) return (EINVAL); /* New fields. 
*/ md->md_genid = 0; md->md_provsize = 0; return (0); } static __inline int raid3_metadata_decode_v3(const u_char *data, struct g_raid3_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); md->md_genid = le32dec(data + 44); md->md_syncid = le32dec(data + 48); md->md_mediasize = le64dec(data + 52); md->md_sectorsize = le32dec(data + 60); md->md_sync_offset = le64dec(data + 64); md->md_mflags = le64dec(data + 72); md->md_dflags = le64dec(data + 80); bcopy(data + 88, md->md_provider, 16); bcopy(data + 104, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 104); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 104, 16) != 0) return (EINVAL); /* New fields. */ md->md_provsize = 0; return (0); } static __inline int raid3_metadata_decode_v4v5(const u_char *data, struct g_raid3_metadata *md) { MD5_CTX ctx; bcopy(data + 20, md->md_name, 16); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); md->md_genid = le32dec(data + 44); md->md_syncid = le32dec(data + 48); md->md_mediasize = le64dec(data + 52); md->md_sectorsize = le32dec(data + 60); md->md_sync_offset = le64dec(data + 64); md->md_mflags = le64dec(data + 72); md->md_dflags = le64dec(data + 80); bcopy(data + 88, md->md_provider, 16); md->md_provsize = le64dec(data + 104); bcopy(data + 112, md->md_hash, 16); MD5Init(&ctx); MD5Update(&ctx, data, 112); MD5Final(md->md_hash, &ctx); if (bcmp(md->md_hash, data + 112, 16) != 0) return (EINVAL); return (0); } static __inline int raid3_metadata_decode(const u_char *data, struct g_raid3_metadata *md) { int error; bcopy(data, md->md_magic, 16); md->md_version = le32dec(data + 16); switch (md->md_version) { case 0: case 1: case 2: error = raid3_metadata_decode_v0v1v2(data, md); break; case 3: error = raid3_metadata_decode_v3(data, md); break; case 4: case 5: error = raid3_metadata_decode_v4v5(data, md); break; default: error = EINVAL; break; } return (error); } static __inline void raid3_metadata_dump(const struct g_raid3_metadata *md) { static const char hex[] = "0123456789abcdef"; char hash[16 * 2 + 1]; u_int i; printf(" magic: %s\n", md->md_magic); printf(" version: %u\n", (u_int)md->md_version); printf(" name: %s\n", md->md_name); printf(" id: %u\n", (u_int)md->md_id); printf(" no: %u\n", (u_int)md->md_no); printf(" all: %u\n", (u_int)md->md_all); printf(" genid: %u\n", (u_int)md->md_genid); printf(" syncid: %u\n", (u_int)md->md_syncid); printf(" mediasize: %jd\n", (intmax_t)md->md_mediasize); printf("sectorsize: %u\n", (u_int)md->md_sectorsize); printf("syncoffset: %jd\n", (intmax_t)md->md_sync_offset); printf(" mflags:"); if (md->md_mflags == 0) printf(" NONE"); else { if ((md->md_mflags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0) printf(" NOAUTOSYNC"); if ((md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) printf(" ROUND-ROBIN"); if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0) printf(" VERIFY"); if ((md->md_mflags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) printf(" NOFAILSYNC"); } printf("\n"); printf(" dflags:"); if (md->md_dflags == 0) printf(" NONE"); else { if ((md->md_dflags & G_RAID3_DISK_FLAG_DIRTY) != 0) printf(" DIRTY"); if ((md->md_dflags & G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) printf(" SYNCHRONIZING"); if ((md->md_dflags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) printf(" FORCE_SYNC"); } printf("\n"); printf("hcprovider: %s\n", md->md_provider); printf(" provsize: %ju\n", (uintmax_t)md->md_provsize); bzero(hash, 
sizeof(hash)); for (i = 0; i < 16; i++) { hash[i * 2] = hex[md->md_hash[i] >> 4]; hash[i * 2 + 1] = hex[md->md_hash[i] & 0x0f]; } printf(" MD5 hash: %s\n", hash); } #endif /* !_G_RAID3_H_ */ Index: head/sys/geom/raid3/g_raid3_ctl.c =================================================================== --- head/sys/geom/raid3/g_raid3_ctl.c (revision 326269) +++ head/sys/geom/raid3/g_raid3_ctl.c (revision 326270) @@ -1,641 +1,643 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct g_raid3_softc * g_raid3_find_device(struct g_class *mp, const char *name) { struct g_raid3_softc *sc; struct g_geom *gp; g_topology_lock(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) continue; if (strcmp(gp->name, name) == 0 || strcmp(sc->sc_name, name) == 0) { g_topology_unlock(); sx_xlock(&sc->sc_lock); return (sc); } } g_topology_unlock(); return (NULL); } static struct g_raid3_disk * g_raid3_find_disk(struct g_raid3_softc *sc, const char *name) { struct g_raid3_disk *disk; u_int n; sx_assert(&sc->sc_lock, SX_XLOCKED); if (strncmp(name, "/dev/", 5) == 0) name += 5; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_consumer == NULL) continue; if (disk->d_consumer->provider == NULL) continue; if (strcmp(disk->d_consumer->provider->name, name) == 0) return (disk); } return (NULL); } static void g_raid3_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; const char *name; int *nargs, do_sync = 0, dirty = 1; int *autosync, *noautosync; int *failsync, *nofailsync; int *round_robin, *noround_robin; int *verify, *noverify; u_int n; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } autosync = gctl_get_paraml(req, "autosync", sizeof(*autosync)); if (autosync == NULL) { gctl_error(req, "No '%s' argument.", "autosync"); return; } noautosync = gctl_get_paraml(req, "noautosync", sizeof(*noautosync)); if (noautosync == NULL) { gctl_error(req, "No '%s' argument.", "noautosync"); return; } if (*autosync && *noautosync) { gctl_error(req, "'%s' and '%s' specified.", "autosync", "noautosync"); return; } failsync = gctl_get_paraml(req, "failsync", sizeof(*failsync)); if (failsync == NULL) { gctl_error(req, "No '%s' argument.", "failsync"); return; } nofailsync = gctl_get_paraml(req, "nofailsync", sizeof(*nofailsync)); if (nofailsync == NULL) { gctl_error(req, "No '%s' argument.", "nofailsync"); return; } if (*failsync && *nofailsync) { gctl_error(req, "'%s' and '%s' specified.", "failsync", "nofailsync"); return; } round_robin = gctl_get_paraml(req, "round_robin", sizeof(*round_robin)); if (round_robin == NULL) { gctl_error(req, "No '%s' argument.", "round_robin"); return; } noround_robin = gctl_get_paraml(req, "noround_robin", sizeof(*noround_robin)); if (noround_robin == NULL) { gctl_error(req, "No '%s' argument.", "noround_robin"); return; } if (*round_robin && *noround_robin) { gctl_error(req, "'%s' and '%s' specified.", "round_robin", "noround_robin"); return; } verify = gctl_get_paraml(req, "verify", sizeof(*verify)); if (verify == NULL) { gctl_error(req, "No '%s' argument.", "verify"); return; } noverify = gctl_get_paraml(req, "noverify", sizeof(*noverify)); if (noverify == NULL) { gctl_error(req, "No '%s' argument.", "noverify"); return; } if (*verify && *noverify) { gctl_error(req, "'%s' and '%s' specified.", "verify", "noverify"); return; } if (!*autosync && !*noautosync && !*failsync && !*nofailsync && !*round_robin && !*noround_robin && !*verify && !*noverify) { gctl_error(req, "Nothing has changed."); return; } name = 
gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_raid3_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (g_raid3_ndisks(sc, -1) < sc->sc_ndisks) { gctl_error(req, "Not all disks connected."); sx_xunlock(&sc->sc_lock); return; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0) { if (*autosync) { sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_NOAUTOSYNC; do_sync = 1; } } else { if (*noautosync) sc->sc_flags |= G_RAID3_DEVICE_FLAG_NOAUTOSYNC; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) { if (*failsync) sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_NOFAILSYNC; } else { if (*nofailsync) { sc->sc_flags |= G_RAID3_DEVICE_FLAG_NOFAILSYNC; dirty = 0; } } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0) { if (*noverify) sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_VERIFY; } else { if (*verify) sc->sc_flags |= G_RAID3_DEVICE_FLAG_VERIFY; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { if (*noround_robin) sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_ROUND_ROBIN; } else { if (*round_robin) sc->sc_flags |= G_RAID3_DEVICE_FLAG_ROUND_ROBIN; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && (sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { /* * VERIFY and ROUND-ROBIN options are mutually exclusive. */ sc->sc_flags &= ~G_RAID3_DEVICE_FLAG_ROUND_ROBIN; } for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (do_sync) { if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC; } if (!dirty) disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); if (do_sync) { if (disk->d_state == G_RAID3_DISK_STATE_STALE) { /* * XXX: It is possible that this * component will not be retasted. */ g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } } sx_xunlock(&sc->sc_lock); } static void g_raid3_ctl_rebuild(struct gctl_req *req, struct g_class *mp) { struct g_raid3_metadata md; struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_provider *pp; const char *name; int error, *nargs; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_raid3_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 1); sx_xunlock(&sc->sc_lock); return; } disk = g_raid3_find_disk(sc, name); if (disk == NULL) { gctl_error(req, "No such provider: %s.", name); sx_xunlock(&sc->sc_lock); return; } if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE && g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks) { gctl_error(req, "There is one stale disk already."); sx_xunlock(&sc->sc_lock); return; } /* * Do rebuild by resetting syncid and disconnecting disk. * It'll be retasted, connected to the device and synchronized.
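 * An illustrative userland invocation handled here (device and
 * provider names are hypothetical):
 *	graid3 rebuild data da2
 * arg0 names the device and arg1 the component provider, matching
 * the two-argument check above.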
*/ disk->d_sync.ds_syncid = 0; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0) disk->d_flags |= G_RAID3_DISK_FLAG_FORCE_SYNC; g_raid3_update_metadata(disk); pp = disk->d_consumer->provider; g_topology_lock(); error = g_raid3_read_metadata(disk->d_consumer, &md); g_topology_unlock(); g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_WAIT); if (error != 0) { gctl_error(req, "Cannot read metadata from %s.", pp->name); sx_xunlock(&sc->sc_lock); return; } error = g_raid3_add_disk(sc, pp, &md); if (error != 0) gctl_error(req, "Cannot reconnect component %s.", pp->name); sx_xunlock(&sc->sc_lock); } static void g_raid3_ctl_stop(struct gctl_req *req, struct g_class *mp) { struct g_raid3_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; int how; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } if (*force) how = G_RAID3_DESTROY_HARD; else how = G_RAID3_DESTROY_SOFT; for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_raid3_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } g_cancel_event(sc); error = g_raid3_destroy(sc, how); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_geom->name, error); sx_xunlock(&sc->sc_lock); return; } /* No need to unlock, because lock is already dead. */ } } static void g_raid3_ctl_insert_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while inserting %s.", __func__, cp->provider->name)); } static void g_raid3_ctl_insert(struct gctl_req *req, struct g_class *mp) { struct g_raid3_metadata md; struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; const char *name; u_char *sector; off_t compsize; intmax_t *no; int *hardcode, *nargs, error, autono; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 2) { gctl_error(req, "Invalid number of arguments."); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "No '%s' argument.", "hardcode"); return; } name = gctl_get_asciiparam(req, "arg1"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 1); return; } if (gctl_get_param(req, "number", NULL) != NULL) no = gctl_get_paraml(req, "number", sizeof(*no)); else no = NULL; if (strncmp(name, "/dev/", 5) == 0) name += 5; g_topology_lock(); pp = g_provider_by_name(name); if (pp == NULL) { g_topology_unlock(); gctl_error(req, "Invalid provider."); return; } gp = g_new_geomf(mp, "raid3:insert"); gp->orphan = g_raid3_ctl_insert_orphan; cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { g_topology_unlock(); gctl_error(req, "Cannot attach to %s.", pp->name); goto end; } error = g_access(cp, 0, 1, 1); if (error != 0) { g_topology_unlock(); gctl_error(req, "Cannot access %s.", pp->name); goto end; } g_topology_unlock(); name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); goto end; } sc = g_raid3_find_device(mp, name); if 
(sc == NULL) { gctl_error(req, "No such device: %s.", name); goto end; } if (no != NULL) { if (*no < 0 || *no >= sc->sc_ndisks) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Invalid component number."); goto end; } disk = &sc->sc_disks[*no]; if (disk->d_state != G_RAID3_DISK_STATE_NODISK) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Component %jd is already connected.", *no); goto end; } } else { disk = NULL; for (autono = 0; autono < sc->sc_ndisks && disk == NULL; autono++) if (sc->sc_disks[autono].d_state == G_RAID3_DISK_STATE_NODISK) disk = &sc->sc_disks[autono]; if (disk == NULL) { sx_xunlock(&sc->sc_lock); gctl_error(req, "No disconnected components."); goto end; } } if (((sc->sc_sectorsize / (sc->sc_ndisks - 1)) % pp->sectorsize) != 0) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Cannot insert provider %s, because of its sector size.", pp->name); goto end; } compsize = sc->sc_mediasize / (sc->sc_ndisks - 1); if (compsize > pp->mediasize - pp->sectorsize) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Provider %s too small.", pp->name); goto end; } if (compsize < pp->mediasize - pp->sectorsize) { gctl_error(req, "warning: %s: only %jd bytes from %jd bytes used.", pp->name, (intmax_t)compsize, (intmax_t)(pp->mediasize - pp->sectorsize)); } g_raid3_fill_metadata(disk, &md); sx_xunlock(&sc->sc_lock); md.md_syncid = 0; md.md_dflags = 0; if (*hardcode) strlcpy(md.md_provider, pp->name, sizeof(md.md_provider)); else bzero(md.md_provider, sizeof(md.md_provider)); md.md_provsize = pp->mediasize; sector = g_malloc(pp->sectorsize, M_WAITOK); raid3_metadata_encode(&md, sector); error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector, pp->sectorsize); g_free(sector); if (error != 0) gctl_error(req, "Cannot store metadata on %s.", pp->name); end: g_topology_lock(); if (cp->acw > 0) g_access(cp, 0, -1, -1); if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); g_topology_unlock(); } static void g_raid3_ctl_remove(struct gctl_req *req, struct g_class *mp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; const char *name; intmax_t *no; int *nargs; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs != 1) { gctl_error(req, "Invalid number of arguments."); return; } no = gctl_get_paraml(req, "number", sizeof(*no)); if (no == NULL) { gctl_error(req, "No '%s' argument.", "no"); return; } name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } sc = g_raid3_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } if (*no >= sc->sc_ndisks) { sx_xunlock(&sc->sc_lock); gctl_error(req, "Invalid component number."); return; } disk = &sc->sc_disks[*no]; switch (disk->d_state) { case G_RAID3_DISK_STATE_ACTIVE: /* * When replacing an ACTIVE component, all the others must also be * ACTIVE.
*/ if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks) { gctl_error(req, "Cannot replace component number %jd.", *no); break; } /* FALLTHROUGH */ case G_RAID3_DISK_STATE_STALE: case G_RAID3_DISK_STATE_SYNCHRONIZING: if (g_raid3_clear_metadata(disk) != 0) { gctl_error(req, "Cannot clear metadata on %s.", g_raid3_get_diskname(disk)); } else { g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } break; case G_RAID3_DISK_STATE_NODISK: break; default: gctl_error(req, "Cannot replace component number %jd.", *no); break; } sx_xunlock(&sc->sc_lock); } void g_raid3_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_RAID3_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } g_topology_unlock(); if (strcmp(verb, "configure") == 0) g_raid3_ctl_configure(req, mp); else if (strcmp(verb, "insert") == 0) g_raid3_ctl_insert(req, mp); else if (strcmp(verb, "rebuild") == 0) g_raid3_ctl_rebuild(req, mp); else if (strcmp(verb, "remove") == 0) g_raid3_ctl_remove(req, mp); else if (strcmp(verb, "stop") == 0) g_raid3_ctl_stop(req, mp); else gctl_error(req, "Unknown verb."); g_topology_lock(); } Index: head/sys/geom/sched/g_sched.c =================================================================== --- head/sys/geom/sched/g_sched.c (revision 326269) +++ head/sys/geom/sched/g_sched.c (revision 326270) @@ -1,1726 +1,1728 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009-2010 Fabio Checconi * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * Main control module for geom-based disk schedulers ('sched'). * * USER VIEW * A 'sched' node is typically inserted transparently between * an existing provider pp and its original geom gp * * [pp --> gp ..] * * using the command "geom sched insert <provider>" and * resulting in the following topology * * [pp --> sched_gp --> cp] [new_pp --> gp ... ] * * Deletion "geom sched destroy <provider>.sched." restores the * original chain. The normal "geom sched create <provider>" * is also supported.
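 *
 * An illustrative session (provider name hypothetical):
 *	geom sched insert da0		# creates da0.sched.
 *	geom sched destroy da0.sched.	# restores the original chain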
* * INTERNALS * Internally, the 'sched' uses the following data structures * * geom{} g_sched_softc{} g_gsched{} * +----------+ +---------------+ +-------------+ * | softc *-|--->| sc_gsched *-|-->| gs_init | * | ... | | | | gs_fini | * | | | [ hash table] | | gs_start | * +----------+ | | | ... | * | | +-------------+ * | | * | | g_*_softc{} * | | +-------------+ * | sc_data *-|-->| | * +---------------+ | algorithm- | * | specific | * +-------------+ * * A g_sched_softc{} is created with a "geom sched insert" call. * In turn this instantiates a specific scheduling algorithm, * which sets sc_gsched to point to the algorithm callbacks, * and calls gs_init() to create the g_*_softc{}. * The other callbacks (gs_start, gs_next, ...) are invoked * as needed. * * g_sched_softc{} is defined in g_sched.h and mostly used here; * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h; * g_*_softc{} is defined/implemented by each algorithm (gs_*.c) * * DATA MOVING * When a bio is received on the provider, it goes to the * g_sched_start() which calls gs_start() to initially queue it; * then we call g_sched_dispatch() that loops around gs_next() * to select zero or more bio's to be sent downstream. * * g_sched_dispatch() can also be called as a result of a timeout, * e.g. when doing anticipation or pacing requests. * * When a bio comes back, it goes to g_sched_done() which in turn * calls gs_done(). The latter does any necessary housekeeping in * the scheduling algorithm, and may decide to call g_sched_dispatch() * to send more bio's downstream. * * If an algorithm needs per-flow queues, these are created by * calling gs_init_class() and destroyed with gs_fini_class(), * and they are also inserted in the hash table implemented in * the g_sched_softc{}. * * If an algorithm is replaced, or a transparently-inserted node is * removed with "geom sched destroy", we need to remove all references * to the g_*_softc{} and g_sched_softc from the bio's still in * the scheduler. g_sched_forced_dispatch() helps with this. * XXX need to explain better. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* we access curthread */ #include #include "gs_scheduler.h" #include "g_sched.h" /* geom hooks */ /* * Size of the per-geom hash table storing traffic classes. * We may decide to change it at a later time; it has no ABI * implications as it is only used for run-time allocations.
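 * Each bucket heads a list of g_sched_class entries linked through
 * gsc_clist and keyed by the classifier value (see g_sched_get_class()
 * below), so this constant only bounds the expected chain length.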
*/ #define G_SCHED_HASH_SIZE 32 static int g_sched_destroy(struct g_geom *gp, boolean_t force); static int g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb); static struct g_geom *g_sched_taste(struct g_class *mp, struct g_provider *pp, int flags __unused); static void g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_sched_init(struct g_class *mp); static void g_sched_fini(struct g_class *mp); static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td); struct g_class g_sched_class = { .name = G_SCHED_CLASS_NAME, .version = G_VERSION, .ctlreq = g_sched_config, .taste = g_sched_taste, .destroy_geom = g_sched_destroy_geom, .init = g_sched_init, .ioctl = g_sched_ioctl, .fini = g_sched_fini }; MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures"); /* * Global variables describing the state of the geom_sched module. * There is only one static instance of this structure. */ LIST_HEAD(gs_list, g_gsched); /* type, link field */ struct geom_sched_vars { struct mtx gs_mtx; struct gs_list gs_scheds; /* list of algorithms */ u_int gs_debug; u_int gs_sched_count; /* how many algorithms ? */ u_int gs_patched; /* g_io_request was patched */ u_int gs_initialized; u_int gs_expire_secs; /* expiration of hash entries */ struct bio_queue_head gs_pending; u_int gs_npending; /* The following are for stats, usually protected by gs_mtx. */ u_long gs_requests; /* total requests */ u_long gs_done; /* total done */ u_int gs_in_flight; /* requests in flight */ u_int gs_writes_in_flight; u_int gs_bytes_in_flight; u_int gs_write_bytes_in_flight; char gs_names[256]; /* names of schedulers */ }; static struct geom_sched_vars me = { .gs_expire_secs = 10, }; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0, "GEOM_SCHED stuff"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD, &me.gs_write_bytes_in_flight, 0, "Write bytes in flight"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD, &me.gs_bytes_in_flight, 0, "Bytes in flight"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD, &me.gs_writes_in_flight, 0, "Write Requests in flight"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD, &me.gs_in_flight, 0, "Requests in flight"); SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD, &me.gs_done, 0, "Total done"); SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD, &me.gs_requests, 0, "Total requests"); SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD, &me.gs_names, 0, "Algorithm names"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD, &me.gs_sched_count, 0, "Number of algorithms"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW, &me.gs_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW, &me.gs_expire_secs, 0, "Expire time in seconds"); /* * g_sched calls the scheduler algorithms with this lock held. * The locking functions are exposed so the scheduler algorithms can also * protect themselves e.g. when running a callout handler. 
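 * A sketch of how an algorithm's timeout handler might bracket its
 * work (gp being the scheduler's geom):
 *	g_sched_lock(gp);
 *	g_sched_dispatch(gp);
 *	g_sched_unlock(gp);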
*/ void g_sched_lock(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; mtx_lock(&sc->sc_mtx); } void g_sched_unlock(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; mtx_unlock(&sc->sc_mtx); } /* * Support functions to handle references to the module, * which are coming from devices using this scheduler. */ static inline void g_gsched_ref(struct g_gsched *gsp) { atomic_add_int(&gsp->gs_refs, 1); } static inline void g_gsched_unref(struct g_gsched *gsp) { atomic_add_int(&gsp->gs_refs, -1); } /* * Update the stats when this request is done. */ static void g_sched_update_stats(struct bio *bio) { me.gs_done++; me.gs_in_flight--; me.gs_bytes_in_flight -= bio->bio_length; if (bio->bio_cmd == BIO_WRITE) { me.gs_writes_in_flight--; me.gs_write_bytes_in_flight -= bio->bio_length; } } /* * Dispatch any pending request. */ static void g_sched_forced_dispatch(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; struct bio *bp; KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during forced dispatch")); while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL) g_io_request(bp, LIST_FIRST(&gp->consumer)); } /* * The main dispatch loop, called either here after the start * routine, or by scheduling algorithms when they receive a timeout * or a 'done' notification. Does not share code with the forced * dispatch path, since the gs_done() callback can call us. */ void g_sched_dispatch(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; struct bio *bp; KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch")); if ((sc->sc_flags & G_SCHED_FLUSHING)) return; while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL) g_io_request(bp, LIST_FIRST(&gp->consumer)); } /* * Recent (8.0 and above) versions of FreeBSD have support to * register classifiers of disk requests. The classifier is * invoked by g_io_request(), and stores the information into * bp->bio_classifier1. * * Support for older versions, which is left here only for * documentation purposes, relies on two hacks: * 1. classification info is written into the bio_caller1 * field of the topmost node in the bio chain. This field * is rarely used, but this module is incompatible with * those that use bio_caller1 for other purposes, * such as ZFS and gjournal; * 2. g_io_request() is patched in-memory when the module is * loaded, so that the function calls a classifier as its * first thing. g_io_request() is restored when the module * is unloaded. This functionality is only supported for * x86 and amd64, other architectures need source code changes. */ /* * Lookup the identity of the issuer of the original request. * In the current implementation we use the curthread of the * issuer, but different mechanisms may be implemented later * so we do not make assumptions on the return value which for * us is just an opaque identifier. */ static inline u_long g_sched_classify(struct bio *bp) { /* we have classifier fields in the struct bio */ return ((u_long)bp->bio_classifier1); } /* Return the hash chain for the given key. */ static inline struct g_hash * g_sched_hash(struct g_sched_softc *sc, u_long key) { return (&sc->sc_hash[key & sc->sc_mask]); } /* * Helper function for the children classes, which takes * a geom and a bio and returns the private descriptor * associated to the request. This involves fetching * the classification field and [al]locating the * corresponding entry in the hash table. 
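 * A freshly created entry starts with two references (one for the
 * hash table, one for the caller), so every successful call here is
 * expected to be balanced by a later g_sched_put_class().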
*/ void * g_sched_get_class(struct g_geom *gp, struct bio *bp) { struct g_sched_softc *sc; struct g_sched_class *gsc; struct g_gsched *gsp; struct g_hash *bucket; u_long key; sc = gp->softc; key = g_sched_classify(bp); bucket = g_sched_hash(sc, key); LIST_FOREACH(gsc, bucket, gsc_clist) { if (key == gsc->gsc_key) { gsc->gsc_refs++; return (gsc->gsc_priv); } } gsp = sc->sc_gsched; gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size, M_GEOM_SCHED, M_NOWAIT | M_ZERO); if (!gsc) return (NULL); if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) { free(gsc, M_GEOM_SCHED); return (NULL); } gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. */ gsc->gsc_key = key; LIST_INSERT_HEAD(bucket, gsc, gsc_clist); gsc->gsc_expire = ticks + me.gs_expire_secs * hz; return (gsc->gsc_priv); } /* * Release a reference to the per-client descriptor. */ void g_sched_put_class(struct g_geom *gp, void *priv) { struct g_sched_class *gsc; struct g_sched_softc *sc; gsc = g_sched_priv2class(priv); gsc->gsc_expire = ticks + me.gs_expire_secs * hz; if (--gsc->gsc_refs > 0) return; sc = gp->softc; sc->sc_gsched->gs_fini_class(sc->sc_data, priv); LIST_REMOVE(gsc, gsc_clist); free(gsc, M_GEOM_SCHED); } static void g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, struct g_gsched *gsp, void *data) { struct g_sched_class *cp, *cp2; int i; if (!hp) return; if (data && gsp->gs_hash_unref) gsp->gs_hash_unref(data); for (i = 0; i < G_SCHED_HASH_SIZE; i++) { LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) g_sched_put_class(gp, cp->gsc_priv); } hashdestroy(hp, M_GEOM_SCHED, mask); } static struct g_hash * g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) { struct g_hash *hash; if (gsp->gs_priv_size == 0) return (NULL); hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); return (hash); } static void g_sched_flush_classes(struct g_geom *gp) { struct g_sched_softc *sc; struct g_sched_class *cp, *cp2; int i; sc = gp->softc; if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) return; for (i = 0; i < G_SCHED_HASH_SIZE; i++) { LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) g_sched_put_class(gp, cp->gsc_priv); } } sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; } /* * Wait for the completion of any outstanding request. To ensure * that this does not take forever the caller has to make sure that * no new requests enter the scheduler before calling us. * * Must be called with the gp mutex held and topology locked. */ static int g_sched_wait_pending(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; int endticks = ticks + hz; g_topology_assert(); while (sc->sc_pending && endticks - ticks >= 0) msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); return (sc->sc_pending ? ETIMEDOUT : 0); } static int g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp) { struct g_sched_softc *sc = gp->softc; int error; /* Set the flushing flag: new bios will not enter the scheduler. */ sc->sc_flags |= G_SCHED_FLUSHING; g_sched_forced_dispatch(gp); error = g_sched_wait_pending(gp); if (error) goto failed; /* No more requests pending or in flight from the old gsp. */ g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); sc->sc_hash = NULL; /* * Avoid deadlock here by releasing the gp mutex and reacquiring * it once done.
It should be safe, since no reconfiguration or * destruction can take place due to the geom topology lock; no * new request can use the current sc_data since we flagged the * geom as being flushed. */ g_sched_unlock(gp); gsp->gs_fini(sc->sc_data); g_sched_lock(gp); sc->sc_gsched = NULL; sc->sc_data = NULL; g_gsched_unref(gsp); failed: sc->sc_flags &= ~G_SCHED_FLUSHING; return (error); } static int g_sched_remove(struct g_geom *gp, struct g_gsched *gsp) { int error; g_sched_lock(gp); error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */ g_sched_unlock(gp); return (error); } /* * Support function for create/taste -- locate the desired * algorithm and grab a reference to it. */ static struct g_gsched * g_gsched_find(const char *name) { struct g_gsched *gsp = NULL; mtx_lock(&me.gs_mtx); LIST_FOREACH(gsp, &me.gs_scheds, glist) { if (strcmp(name, gsp->gs_name) == 0) { g_gsched_ref(gsp); break; } } mtx_unlock(&me.gs_mtx); return (gsp); } /* * Rebuild the list of scheduler names. * To be called with me.gs_mtx lock held. */ static void g_gsched_build_names(struct g_gsched *gsp) { int pos, l; struct g_gsched *cur; pos = 0; LIST_FOREACH(cur, &me.gs_scheds, glist) { l = strlen(cur->gs_name); if (l + pos + 1 + 1 < sizeof(me.gs_names)) { if (pos != 0) me.gs_names[pos++] = ' '; strcpy(me.gs_names + pos, cur->gs_name); pos += l; } } me.gs_names[pos] = '\0'; } /* * Register or unregister individual scheduling algorithms. */ static int g_gsched_register(struct g_gsched *gsp) { struct g_gsched *cur; int error = 0; mtx_lock(&me.gs_mtx); LIST_FOREACH(cur, &me.gs_scheds, glist) { if (strcmp(gsp->gs_name, cur->gs_name) == 0) break; } if (cur != NULL) { G_SCHED_DEBUG(0, "A scheduler named %s already " "exists.", gsp->gs_name); error = EEXIST; } else { LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist); gsp->gs_refs = 1; me.gs_sched_count++; g_gsched_build_names(gsp); } mtx_unlock(&me.gs_mtx); return (error); } struct g_gsched_unregparm { struct g_gsched *gup_gsp; int gup_error; }; static void g_gsched_unregister(void *arg, int flag) { struct g_gsched_unregparm *parm = arg; struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp; struct g_sched_softc *sc; struct g_geom *gp, *gp_tmp; int error; parm->gup_error = 0; g_topology_assert(); if (flag == EV_CANCEL) return; mtx_lock(&me.gs_mtx); LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) { if (gp->class != &g_sched_class) continue; /* Should not happen. */ sc = gp->softc; if (sc->sc_gsched == gsp) { error = g_sched_remove(gp, gsp); if (error) goto failed; } } LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) { if (cur != gsp) continue; if (gsp->gs_refs != 1) { G_SCHED_DEBUG(0, "%s still in use.", gsp->gs_name); parm->gup_error = EBUSY; } else { LIST_REMOVE(gsp, glist); me.gs_sched_count--; g_gsched_build_names(gsp); } break; } if (cur == NULL) { G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name); parm->gup_error = ENOENT; } failed: mtx_unlock(&me.gs_mtx); } static inline void g_gsched_global_init(void) { if (!me.gs_initialized) { G_SCHED_DEBUG(0, "Initializing global data."); mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF); LIST_INIT(&me.gs_scheds); bioq_init(&me.gs_pending); me.gs_initialized = 1; } } /* * Module event called when a scheduling algorithm module is loaded or * unloaded.
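 * A sketch of the glue an algorithm module might use (names are
 * illustrative; the module argument must point to its struct g_gsched):
 *	static moduledata_t gs_myalgo_mod = {
 *		"gs_myalgo", g_gsched_modevent, &g_myalgo_gsched
 *	};
 *	DECLARE_MODULE(gs_myalgo, gs_myalgo_mod, SI_SUB_DRIVERS,
 *	    SI_ORDER_MIDDLE);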
*/ int g_gsched_modevent(module_t mod, int cmd, void *arg) { struct g_gsched *gsp = arg; struct g_gsched_unregparm parm; int error; G_SCHED_DEBUG(0, "Modevent %d.", cmd); /* * If the module is loaded at boot, the geom thread that calls * g_sched_init() might actually run after g_gsched_modevent(), * so make sure that the module is properly initialized. */ g_gsched_global_init(); error = EOPNOTSUPP; switch (cmd) { case MOD_LOAD: error = g_gsched_register(gsp); G_SCHED_DEBUG(0, "Loaded module %s error %d.", gsp->gs_name, error); if (error == 0) g_retaste(&g_sched_class); break; case MOD_UNLOAD: parm.gup_gsp = gsp; parm.gup_error = 0; error = g_waitfor_event(g_gsched_unregister, &parm, M_WAITOK, NULL); if (error == 0) error = parm.gup_error; G_SCHED_DEBUG(0, "Unloaded module %s error %d.", gsp->gs_name, error); break; } return (error); } #ifdef KTR #define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp) static inline char g_sched_type(struct bio *bp) { if (bp->bio_cmd == BIO_READ) return ('R'); else if (bp->bio_cmd == BIO_WRITE) return ('W'); return ('U'); } static inline void g_sched_trace_bio_START(struct bio *bp) { CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp), g_sched_type(bp), bp->bio_offset / ULONG_MAX, bp->bio_offset, bp->bio_length); } static inline void g_sched_trace_bio_DONE(struct bio *bp) { CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp), g_sched_type(bp), bp->bio_offset / ULONG_MAX, bp->bio_offset, bp->bio_length); } #else /* !KTR */ #define TRC_BIO_EVENT(e, bp) #endif /* !KTR */ /* * g_sched_done() and g_sched_start() dispatch the geom requests to * the scheduling algorithm in use. */ static void g_sched_done(struct bio *bio) { struct g_geom *gp = bio->bio_caller2; struct g_sched_softc *sc = gp->softc; TRC_BIO_EVENT(DONE, bio); KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done")); g_sched_lock(gp); g_sched_update_stats(bio); sc->sc_gsched->gs_done(sc->sc_data, bio); if (!--sc->sc_pending) wakeup(gp); g_sched_flush_classes(gp); g_sched_unlock(gp); g_std_done(bio); } static void g_sched_start(struct bio *bp) { struct g_geom *gp = bp->bio_to->geom; struct g_sched_softc *sc = gp->softc; struct bio *cbp; TRC_BIO_EVENT(START, bp); G_SCHED_LOGREQ(bp, "Request received."); cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_sched_done; cbp->bio_to = LIST_FIRST(&gp->provider); KASSERT(cbp->bio_to != NULL, ("NULL provider")); /* We only schedule reads and writes. */ if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE) goto bypass; G_SCHED_LOGREQ(cbp, "Sending request."); g_sched_lock(gp); /* * Call the algorithm's gs_start to queue the request in the * scheduler. If gs_start fails then pass the request down, * otherwise call g_sched_dispatch() which tries to push * one or more requests down. */ if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) || sc->sc_gsched->gs_start(sc->sc_data, cbp)) { g_sched_unlock(gp); goto bypass; } /* * We use bio_caller1 to mark requests that are scheduled * so make sure it is not NULL. */ if (cbp->bio_caller1 == NULL) cbp->bio_caller1 = &me; /* anything not NULL */ cbp->bio_caller2 = gp; sc->sc_pending++; /* Update general stats. 
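 * (the me.gs_* counters below back the kern.geom.sched sysctl
 * nodes declared at the top of this file)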
*/ me.gs_in_flight++; me.gs_requests++; me.gs_bytes_in_flight += bp->bio_length; if (bp->bio_cmd == BIO_WRITE) { me.gs_writes_in_flight++; me.gs_write_bytes_in_flight += bp->bio_length; } g_sched_dispatch(gp); g_sched_unlock(gp); return; bypass: cbp->bio_done = g_std_done; cbp->bio_caller1 = NULL; /* not scheduled */ g_io_request(cbp, LIST_FIRST(&gp->consumer)); } /* * The next few functions are the geom glue. */ static void g_sched_orphan(struct g_consumer *cp) { g_topology_assert(); g_sched_destroy(cp->geom, 1); } static int g_sched_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; int error; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); error = g_access(cp, dr, dw, de); return (error); } static void g_sched_temporary_start(struct bio *bio) { mtx_lock(&me.gs_mtx); me.gs_npending++; bioq_disksort(&me.gs_pending, bio); mtx_unlock(&me.gs_mtx); } static void g_sched_flush_pending(g_start_t *start) { struct bio *bp; while ((bp = bioq_takefirst(&me.gs_pending))) start(bp); } static int g_insert_proxy(struct g_geom *gp, struct g_provider *newpp, struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp) { struct g_sched_softc *sc = gp->softc; g_start_t *saved_start, *flush = g_sched_start; int error = 0, endticks = ticks + hz; g_cancel_event(newpp); /* prevent taste() */ /* copy private fields */ newpp->private = pp->private; newpp->index = pp->index; /* Queue all the early requests coming for us. */ me.gs_npending = 0; saved_start = pp->geom->start; dstgp->start = g_sched_temporary_start; while (pp->nstart - pp->nend != me.gs_npending && endticks - ticks >= 0) tsleep(pp, PRIBIO, "-", hz/10); if (pp->nstart - pp->nend != me.gs_npending) { flush = saved_start; error = ETIMEDOUT; goto fail; } /* link pp to this geom */ LIST_REMOVE(pp, provider); pp->geom = gp; LIST_INSERT_HEAD(&gp->provider, pp, provider); /* * replicate the counts from the parent in the * new provider and consumer nodes */ cp->acr = newpp->acr = pp->acr; cp->acw = newpp->acw = pp->acw; cp->ace = newpp->ace = pp->ace; sc->sc_flags |= G_SCHED_PROXYING; fail: dstgp->start = saved_start; g_sched_flush_pending(flush); return (error); } /* * Create a geom node for the device passed as *pp. * If successful, add a reference to this gsp. */ static int g_sched_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, struct g_gsched *gsp, int proxy) { struct g_sched_softc *sc = NULL; struct g_geom *gp, *dstgp; struct g_provider *newpp = NULL; struct g_consumer *cp = NULL; char name[64]; int error; g_topology_assert(); snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX); LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) { gctl_error(req, "Geom %s already exists.", name); return (EEXIST); } } gp = g_new_geomf(mp, "%s", name); dstgp = proxy ? pp->geom : gp; /* where do we link the provider */ sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); sc->sc_gsched = gsp; sc->sc_data = gsp->gs_init(gp); if (sc->sc_data == NULL) { error = ENOMEM; goto fail; } sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK); /* * Do not initialize the flush mechanism; it will be initialized * on the first insertion into the hash table.
*/ mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF); gp->softc = sc; gp->start = g_sched_start; gp->orphan = g_sched_orphan; gp->access = g_sched_access; gp->dumpconf = g_sched_dumpconf; newpp = g_new_providerf(dstgp, "%s", gp->name); newpp->mediasize = pp->mediasize; newpp->sectorsize = pp->sectorsize; cp = g_new_consumer(gp); error = g_attach(cp, proxy ? newpp : pp); if (error != 0) { gctl_error(req, "Cannot attach to provider %s.", pp->name); goto fail; } g_error_provider(newpp, 0); if (proxy) { error = g_insert_proxy(gp, newpp, dstgp, pp, cp); if (error) goto fail; } G_SCHED_DEBUG(0, "Device %s created.", gp->name); g_gsched_ref(gsp); return (0); fail: if (cp != NULL) { if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); } if (newpp != NULL) g_destroy_provider(newpp); if (sc->sc_hash) g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); if (sc->sc_data) gsp->gs_fini(sc->sc_data); g_free(gp->softc); g_destroy_geom(gp); return (error); } /* * Support for dynamic switching of scheduling algorithms. * First initialize the data structures for the new algorithm, * then call g_sched_remove_locked() to flush all references * to the old one, finally link the new algorithm. */ static int g_sched_change_algo(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, struct g_gsched *gsp) { struct g_sched_softc *sc; struct g_geom *gp; struct g_hash *newh; void *data; u_long mask; int error = 0; gp = pp->geom; sc = gp->softc; data = gsp->gs_init(gp); if (data == NULL) return (ENOMEM); newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK); if (gsp->gs_priv_size && !newh) { error = ENOMEM; goto fail; } g_sched_lock(gp); if (sc->sc_gsched) { /* can be NULL in some cases */ error = g_sched_remove_locked(gp, sc->sc_gsched); if (error) goto fail; } g_gsched_ref(gsp); sc->sc_gsched = gsp; sc->sc_data = data; sc->sc_hash = newh; sc->sc_mask = mask; g_sched_unlock(gp); return (0); fail: if (newh) g_sched_hash_fini(gp, newh, mask, gsp, data); if (data) gsp->gs_fini(data); g_sched_unlock(gp); return (error); } /* * Stop the request flow directed to the proxy, redirecting the new * requests to the me.gs_pending queue. */ static struct g_provider * g_detach_proxy(struct g_geom *gp) { struct g_consumer *cp; struct g_provider *pp, *newpp; do { pp = LIST_FIRST(&gp->provider); if (pp == NULL) break; cp = LIST_FIRST(&gp->consumer); if (cp == NULL) break; newpp = cp->provider; if (newpp == NULL) break; me.gs_npending = 0; pp->geom->start = g_sched_temporary_start; return (pp); } while (0); printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name); return (NULL); } static void g_sched_blackhole(struct bio *bp) { g_io_deliver(bp, ENXIO); } static inline void g_reparent_provider(struct g_provider *pp, struct g_geom *gp, struct g_provider *newpp) { LIST_REMOVE(pp, provider); if (newpp) { pp->private = newpp->private; pp->index = newpp->index; } pp->geom = gp; LIST_INSERT_HEAD(&gp->provider, pp, provider); } static inline void g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp) { struct g_geom *gp = oldpp->geom; g_reparent_provider(oldpp, newpp->geom, newpp); /* * Hackish: let the system destroy the old provider for us, just * in case someone attached a consumer to it, in which case a * direct call to g_destroy_provider() would not work. */ g_reparent_provider(newpp, gp, NULL); } /* * Complete the proxy destruction, linking the old provider to its * original geom, and destroying the proxy provider. 
Also take care * of issuing the pending requests collected in me.gs_pending (if any). */ static int g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp) { struct g_consumer *cp; struct g_provider *newpp; do { cp = LIST_FIRST(&gp->consumer); if (cp == NULL) break; newpp = cp->provider; if (newpp == NULL) break; /* Relink the provider to its original geom. */ g_unproxy_provider(oldpp, newpp); /* Detach consumer from provider, and destroy provider. */ cp->acr = newpp->acr = 0; cp->acw = newpp->acw = 0; cp->ace = newpp->ace = 0; g_detach(cp); /* Send the pending bios through the right start function. */ g_sched_flush_pending(oldpp->geom->start); return (0); } while (0); printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name); /* We cannot send the pending bios anywhere... */ g_sched_flush_pending(g_sched_blackhole); return (EINVAL); } static int g_sched_destroy(struct g_geom *gp, boolean_t force) { struct g_provider *pp, *oldpp = NULL; struct g_sched_softc *sc; struct g_gsched *gsp; int error; g_topology_assert(); sc = gp->softc; if (sc == NULL) return (ENXIO); if (!(sc->sc_flags & G_SCHED_PROXYING)) { pp = LIST_FIRST(&gp->provider); if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { const char *msg = force ? "but we force removal" : "cannot remove"; G_SCHED_DEBUG(!force, "Device %s is still open (r%dw%de%d), %s.", pp->name, pp->acr, pp->acw, pp->ace, msg); if (!force) return (EBUSY); } else { G_SCHED_DEBUG(0, "Device %s removed.", gp->name); } } else oldpp = g_detach_proxy(gp); gsp = sc->sc_gsched; if (gsp) { /* * XXX bad hack here: force a dispatch to release * any reference to the hash table still held by * the scheduler. */ g_sched_lock(gp); /* * We are dying here, no new requests should enter * the scheduler. This is guaranteed by the topology, * either in case we were proxying (new bios are * being redirected) or not (see the access check * above). */ g_sched_forced_dispatch(gp); error = g_sched_wait_pending(gp); if (error) { /* * Not all the requests came home: this might happen * under heavy load, or if we were waiting for any * bio which is served in the event path (see * geom_slice.c for an example of how this can * happen). Try to restore a working configuration * if we can before failing. */ if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { g_sched_flush_pending(force ? g_sched_blackhole : g_sched_start); } /* * In the forced destroy case there is not so much * we can do, we have pending bios that will call * g_sched_done() somehow, and we don't want them * to crash the system using freed memory. We tell * the user that something went wrong, and leak some * memory here. * Note: the callers using force = 1 ignore the * return value. */ if (force) { G_SCHED_DEBUG(0, "Pending requests while " "destroying geom, some memory leaked."); } return (error); } g_sched_unlock(gp); g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); sc->sc_hash = NULL; gsp->gs_fini(sc->sc_data); g_gsched_unref(gsp); sc->sc_gsched = NULL; } else error = 0; if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { error = g_destroy_proxy(gp, oldpp); if (error) { if (force) { G_SCHED_DEBUG(0, "Unrecoverable error while " "destroying a proxy geom, leaking some " "memory."); } return (error); } } mtx_destroy(&sc->sc_mtx); g_free(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (error); } static int g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_sched_destroy(gp, 0)); } /* * Functions related to the classification of requests.
* * On recent FreeBSD versions (8.0 and above), we store a reference * to the issuer of a request in bp->bio_classifier1 as soon * as the bio is posted to the geom queue (and not later, because * requests are managed by the g_down thread afterwards). */ /* * Classifier support for recent FreeBSD versions: we use * a very simple classifier, only use curthread to tag a request. * The classifier is registered at module load, and unregistered * at module unload. */ static int g_sched_tag(void *arg, struct bio *bp) { bp->bio_classifier1 = curthread; return (1); } static struct g_classifier_hook g_sched_classifier = { .func = g_sched_tag, }; static inline void g_classifier_ini(void) { g_register_classifier(&g_sched_classifier); } static inline void g_classifier_fini(void) { g_unregister_classifier(&g_sched_classifier); } static void g_sched_init(struct g_class *mp) { g_gsched_global_init(); G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.", mp, &g_sched_class); /* Patch g_io_request to store classification info in the bio. */ g_classifier_ini(); } static void g_sched_fini(struct g_class *mp) { g_classifier_fini(); G_SCHED_DEBUG(0, "Unloading..."); KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers")); mtx_destroy(&me.gs_mtx); } static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td) { struct g_consumer *cp; struct g_geom *gp; cp = LIST_FIRST(&pp->geom->consumer); if (cp == NULL) return (ENOIOCTL); gp = cp->provider->geom; if (gp->ioctl == NULL) return (ENOIOCTL); return (gp->ioctl(cp->provider, cmd, data, fflag, td)); } /* * Read the i-th argument for a request, skipping the /dev/ * prefix if present. */ static const char * g_sched_argi(struct gctl_req *req, int i) { static const char *dev_prefix = "/dev/"; const char *name; char param[16]; int l = strlen(dev_prefix); snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) gctl_error(req, "No 'arg%d' argument", i); else if (strncmp(name, dev_prefix, l) == 0) name += l; return (name); } /* * Fetch nargs and do appropriate checks. */ static int g_sched_get_nargs(struct gctl_req *req) { int *nargs; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No 'nargs' argument"); return (0); } if (*nargs <= 0) gctl_error(req, "Missing device(s)."); return (*nargs); } /* * Check whether we should add the class on certain volumes when * this geom is created. Right now this is under control of a kenv * variable containing the names of all devices that we care about. * Probably we should only support transparent insertion as the * preferred mode of operation. */ static struct g_geom * g_sched_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_gsched *gsp = NULL; /* the . 
algorithm we want */ const char *s; /* generic string pointer */ const char *taste_names; /* devices we like */ int l; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); G_SCHED_DEBUG(2, "Tasting %s.", pp->name); do { /* do not taste on ourselves */ if (pp->geom->class == mp) break; taste_names = kern_getenv("geom.sched.taste"); if (taste_names == NULL) break; l = strlen(pp->name); for (s = taste_names; *s && (s = strstr(s, pp->name)); s++) { /* further checks for an exact match */ if ( (s == taste_names || s[-1] == ' ') && (s[l] == '\0' || s[l] == ' ') ) break; } if (s == NULL) break; G_SCHED_DEBUG(0, "Attach device %s match [%s]", pp->name, s); /* look up the scheduling algorithm to use */ s = kern_getenv("geom.sched.algo"); if (s == NULL) s = "rr"; gsp = g_gsched_find(s); /* also get a reference */ if (gsp == NULL) { G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s); break; } /* XXX create with 1 as last argument ? */ g_sched_create(NULL, mp, pp, gsp, 0); g_gsched_unref(gsp); } while (0); return NULL; } static void g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy) { struct g_provider *pp; struct g_gsched *gsp; const char *name; int i, nargs; g_topology_assert(); name = gctl_get_asciiparam(req, "algo"); if (name == NULL) { gctl_error(req, "No '%s' argument", "algo"); return; } gsp = g_gsched_find(name); /* also get a reference */ if (gsp == NULL) { gctl_error(req, "Bad algorithm '%s'", name); return; } nargs = g_sched_get_nargs(req); /* * Loop over the arguments, and break on any error. * We look for a device name, but skip the /dev/ prefix if any. */ for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; pp = g_provider_by_name(name); if (pp == NULL) { G_SCHED_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); break; } if (g_sched_create(req, mp, pp, gsp, proxy) != 0) break; } g_gsched_unref(gsp); } static void g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; struct g_gsched *gsp; const char *name; int i, nargs; g_topology_assert(); name = gctl_get_asciiparam(req, "algo"); if (name == NULL) { gctl_error(req, "No '%s' argument", "algo"); return; } gsp = g_gsched_find(name); /* also get a reference */ if (gsp == NULL) { gctl_error(req, "Bad algorithm '%s'", name); return; } nargs = g_sched_get_nargs(req); /* * Loop over the arguments, and break on any error. * We look for a device name, but skip the /dev/ prefix if any.
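 * (For example, "/dev/ada0" and a bare "ada0" both resolve to the
 * provider named "ada0", since g_sched_argi() strips the "/dev/"
 * prefix before the lookup; "ada0" here is just an illustrative name.)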
*/ for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; pp = g_provider_by_name(name); if (pp == NULL || pp->geom->class != mp) { G_SCHED_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); break; } if (g_sched_change_algo(req, mp, pp, gsp) != 0) break; } g_gsched_unref(gsp); } static struct g_geom * g_sched_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int nargs, *force, error, i; struct g_geom *gp; const char *name; g_topology_assert(); nargs = g_sched_get_nargs(req); force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; gp = g_sched_find_geom(mp, name); if (gp == NULL) { G_SCHED_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); break; } error = g_sched_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", gp->name, error); break; } } } static void g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_SCHED_VERSION) { gctl_error(req, "Userland and kernel parts are " "out of sync."); return; } if (strcmp(verb, "create") == 0) { g_sched_ctl_create(req, mp, 0); return; } else if (strcmp(verb, "insert") == 0) { g_sched_ctl_create(req, mp, 1); return; } else if (strcmp(verb, "configure") == 0) { g_sched_ctl_configure(req, mp); return; } else if (strcmp(verb, "destroy") == 0) { g_sched_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; if (indent == NULL) { /* plaintext */ sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--"); } if (gsp != NULL && gsp->gs_dumpconf) gsp->gs_dumpconf(sb, indent, gp, cp, pp); } DECLARE_GEOM_CLASS(g_sched_class, g_sched); MODULE_VERSION(geom_sched, 0); Index: head/sys/geom/sched/g_sched.h =================================================================== --- head/sys/geom/sched/g_sched.h (revision 326269) +++ head/sys/geom/sched/g_sched.h (revision 326270) @@ -1,125 +1,127 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009-2010 Fabio Checconi * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _G_SCHED_H_ #define _G_SCHED_H_ /* * $Id$ * $FreeBSD$ * * Header for the geom_sched class (userland library and kernel part). * See g_sched.c for documentation. * The userland code only needs the three G_SCHED_* values below. */ #define G_SCHED_CLASS_NAME "SCHED" #define G_SCHED_VERSION 0 #define G_SCHED_SUFFIX ".sched." #ifdef _KERNEL #define G_SCHED_DEBUG(lvl, ...) do { \ if (me.gs_debug >= (lvl)) { \ printf("GEOM_SCHED"); \ if (me.gs_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_SCHED_LOGREQ(bp, ...) do { \ if (me.gs_debug >= 2) { \ printf("GEOM_SCHED[2]: "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) LIST_HEAD(g_hash, g_sched_class); /* * Descriptor of a scheduler. * In addition to the obvious fields, sc_flushing and sc_pending * support dynamic switching of scheduling algorithm. * Normally, sc_flushing is 0, and requests that are scheduled are * also added to the sc_pending queue, and removed when we receive * the 'done' event. * * When we are transparently inserted on an existing provider, * sc_proxying is set. The detach procedure is slightly different. * * When switching schedulers, sc_flushing is set so requests bypass us, * and at the same time we update the pointer in the pending bios * to ignore us when they return up. * XXX it would be more efficient to implement sc_pending with * a generation number: the softc generation is increased when * we change scheduling algorithm, we store the current generation * number in the pending bios, and when they come back we ignore * the done() call if the generation number does not match. */ struct g_sched_softc { /* * Generic fields used by any scheduling algorithm: * a mutex, the class descriptor, flags, list of pending * requests (used when flushing the module) and support * for hash tables where we store per-flow queues. */ struct mtx sc_mtx; struct g_gsched *sc_gsched; /* Scheduler descriptor. */ int sc_pending; /* Pending requests. */ int sc_flags; /* Various flags. */ /* * Hash tables to store per-flow queues are generally useful * so we handle them in the common code. * sc_hash and sc_mask are parameters of the hash table, * the last two fields are used to periodically remove * expired items from the hash table. */ struct g_hash *sc_hash; u_long sc_mask; int sc_flush_ticks; /* Next tick for a flush. */ int sc_flush_bucket; /* Next bucket to flush. */ /* * Pointer to the algorithm's private data, which is the value * returned by sc_gsched->gs_init(). A NULL here means failure. * XXX intptr_t might be more appropriate.
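 *
 * (The generation-number scheme suggested above would look roughly
 * like this, purely illustrative: bump sc_gen on every algorithm
 * switch, stamp it into each pending bio at dispatch time, and have
 * the done() path begin with
 *
 *	if ((u_long)bp->bio_caller2 != sc->sc_gen)
 *		return;		// stale scheduler, ignore
 *
 * where bio_caller2 and sc_gen are hypothetical names, not fields
 * this module actually uses for that purpose.)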
*/ void *sc_data; }; #define G_SCHED_PROXYING 1 #define G_SCHED_FLUSHING 2 #endif /* _KERNEL */ #endif /* _G_SCHED_H_ */ Index: head/sys/geom/sched/gs_rr.c =================================================================== --- head/sys/geom/sched/gs_rr.c (revision 326269) +++ head/sys/geom/sched/gs_rr.c (revision 326270) @@ -1,699 +1,701 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009-2010 Fabio Checconi * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * A round-robin (RR) anticipatory scheduler, with per-client queues. * * The goal of this implementation is to improve throughput compared * to the pure elevator algorithm, and ensure some fairness among * clients. * * Requests coming from the same client are put in the same queue. * We use anticipation to help reduce seeks, and each queue * is never served continuously for more than a given amount of * time or data. Queues are then served in a round-robin fashion. * * Each queue can be in any of the following states: * READY immediately serve the first pending request; * BUSY one request is under service, wait for completion; * IDLING do not serve incoming requests immediately, unless * they are "eligible" as defined later. * * Scheduling decisions are made by looking at the status of all queues, * and the first one in round-robin order is privileged. */ #include #include #include #include #include #include #include #include #include #include #include #include "gs_scheduler.h" /* possible states of a queue */ enum g_rr_state { G_QUEUE_READY = 0, /* Ready to dispatch. */ G_QUEUE_BUSY, /* Waiting for a completion. */ G_QUEUE_IDLING /* Waiting for a new request. */ }; /* possible queue flags */ enum g_rr_flags { /* G_FLAG_COMPLETED means that the field q_slice_end is valid. */ G_FLAG_COMPLETED = 1, /* Completed a req. in the current budget. */ }; struct g_rr_softc; /* * Queue descriptor, containing reference count, scheduling * state, a queue of pending requests, and configuration parameters. * Queues with pending request(s) and not under service are also * stored in a Round Robin (RR) list.
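 *
 * (The rotation itself, condensed and illustrative only, simplified
 * from gs_rr_next() below:
 *
 *	qp = TAILQ_FIRST(&sc->sc_rr_tailq);	// pick the head queue
 *	TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
 *	bp = bioq_takefirst(&qp->q_bioq);	// serve one request
 *	if (bioq_first(&qp->q_bioq) != NULL)	// more work? rotate
 *		TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
 *
 * with the budget and anticipation checks layered on top.)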
*/ struct g_rr_queue { struct g_rr_softc *q_sc; /* link to the parent */ enum g_rr_state q_status; unsigned int q_service; /* service received so far */ int q_slice_end; /* actual slice end time, in ticks */ enum g_rr_flags q_flags; /* queue flags */ struct bio_queue_head q_bioq; /* Scheduling parameters */ unsigned int q_budget; /* slice size in bytes */ unsigned int q_slice_duration; /* slice size in ticks */ unsigned int q_wait_ticks; /* wait time for anticipation */ /* Stats to drive the various heuristics. */ struct g_savg q_thinktime; /* Thinktime average. */ struct g_savg q_seekdist; /* Seek distance average. */ int q_bionum; /* Number of requests. */ off_t q_lastoff; /* Last submitted req. offset. */ int q_lastsub; /* Last submitted req. time. */ /* Expiration deadline for an empty queue. */ int q_expire; TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */ }; /* List types. */ TAILQ_HEAD(g_rr_tailq, g_rr_queue); /* list of scheduler instances */ LIST_HEAD(g_scheds, g_rr_softc); /* Default quantum for RR between queues. */ #define G_RR_DEFAULT_BUDGET 0x00800000 /* * Per device descriptor, holding the Round Robin list of queues * accessing the disk, a reference to the geom, and the timer. */ struct g_rr_softc { struct g_geom *sc_geom; /* * sc_active is the queue we are anticipating for. * It is set only in gs_rr_next(), and possibly cleared * only in gs_rr_next() or on a timeout. * The active queue is never in the Round Robin list * even if it has requests queued. */ struct g_rr_queue *sc_active; struct callout sc_wait; /* timer for sc_active */ struct g_rr_tailq sc_rr_tailq; /* the round-robin list */ int sc_nqueues; /* number of queues */ /* Statistics */ int sc_in_flight; /* requests in the driver */ LIST_ENTRY(g_rr_softc) sc_next; }; /* Descriptor for bounded values, min and max are constant. */ struct x_bound { const int x_min; int x_cur; const int x_max; }; /* * parameters, config and stats */ struct g_rr_params { int queues; /* total number of queues */ int w_anticipate; /* anticipate writes */ int bypass; /* bypass the scheduler */ int units; /* how many instances */ /* sc_head is used for debugging */ struct g_scheds sc_head; /* first scheduler instance */ struct x_bound queue_depth; /* max parallel requests */ struct x_bound wait_ms; /* wait time, milliseconds */ struct x_bound quantum_ms; /* quantum size, milliseconds */ struct x_bound quantum_kb; /* quantum size, Kb (1024 bytes) */ /* statistics */ int wait_hit; /* success in anticipation */ int wait_miss; /* failure in anticipation */ }; /* * Default parameters for the scheduler. The quantum sizes target * an 80 MB/s disk; if the hardware is faster or slower, the smaller * of the two limits takes effect: the clients will still be isolated, * but fairness may be limited. A complete solution would involve * the on-line measurement of the actual disk throughput to derive * these parameters. Or we may just choose to ignore service domain * fairness and accept what can be achieved with time-only budgets.
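 *
 * (Worked example: at 80 MB/s, the default 100 ms time quantum covers
 * about 80 MB/s * 0.1 s = 8 MB, which is why the default byte quantum
 * below is 8192 KB; a slower disk exhausts the time quantum first,
 * a faster one the byte quantum.)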
*/ static struct g_rr_params me = { .sc_head = LIST_HEAD_INITIALIZER(&me.sc_head), .w_anticipate = 1, .queue_depth = { 1, 1, 50 }, .wait_ms = { 1, 10, 30 }, .quantum_ms = { 1, 100, 500 }, .quantum_kb = { 16, 8192, 65536 }, }; struct g_rr_params *gs_rr_me = &me; SYSCTL_DECL(_kern_geom_sched); static SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0, "GEOM_SCHED ROUND ROBIN stuff"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD, &me.units, 0, "Scheduler instances"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD, &me.queues, 0, "Total rr queues"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW, &me.wait_ms.x_cur, 0, "Wait time milliseconds"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW, &me.quantum_ms.x_cur, 0, "Quantum size milliseconds"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW, &me.bypass, 0, "Bypass scheduler"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW, &me.w_anticipate, 0, "Do anticipation on writes"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW, &me.quantum_kb.x_cur, 0, "Quantum size Kbytes"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW, &me.queue_depth.x_cur, 0, "Maximum simultaneous requests"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW, &me.wait_hit, 0, "Hits in anticipation"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW, &me.wait_miss, 0, "Misses in anticipation"); #ifdef DEBUG_QUEUES /* print the status of a queue */ static void gs_rr_dump_q(struct g_rr_queue *qp, int index) { int l = 0; struct bio *bp; TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) { l++; } printf("--- rr queue %d %p status %d len %d ---\n", index, qp, qp->q_status, l); } /* * Dump the scheduler status when writing to this sysctl variable. * XXX right now we only dump the status of the last instance created. * not a severe issue because this is only for debugging */ static int gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS) { int error, val = 0; struct g_rr_softc *sc; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); printf("called %s\n", __FUNCTION__); LIST_FOREACH(sc, &me.sc_head, sc_next) { int i, tot = 0; printf("--- sc %p active %p nqueues %d " "callout %d in_flight %d ---\n", sc, sc->sc_active, sc->sc_nqueues, callout_active(&sc->sc_wait), sc->sc_in_flight); for (i = 0; i < G_RR_HASH_SIZE; i++) { struct g_rr_queue *qp; LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) { gs_rr_dump_q(qp, tot); tot++; } } } return (0); } SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(int), gs_rr_sysctl_status, "I", "status"); #endif /* DEBUG_QUEUES */ /* * Get a bounded value, optionally convert to a min of t_min ticks. */ static int get_bounded(struct x_bound *v, int t_min) { int x; x = v->x_cur; if (x < v->x_min) x = v->x_min; else if (x > v->x_max) x = v->x_max; if (t_min) { x = x * hz / 1000; /* convert to ticks */ if (x < t_min) x = t_min; } return x; } /* * Get a reference to the queue for bp, using the generic * classification mechanism. */ static struct g_rr_queue * g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp) { return (g_sched_get_class(sc->sc_geom, bp)); } static int g_rr_init_class(void *data, void *priv) { struct g_rr_softc *sc = data; struct g_rr_queue *qp = priv; bioq_init(&qp->q_bioq); /* * Set the initial parameters for the client: * slice size in bytes and ticks, and wait ticks. 
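 * (With the defaults and hz = 1000 this yields q_budget =
 * 1024 * 8192 = 8 MB, q_slice_duration = 100 ticks and
 * q_wait_ticks = 10 ticks; get_bounded() clips each sysctl value
 * to its [min, max] range and converts ms to ticks as x * hz / 1000.)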
* Right now these are constant, but we could have * autoconfiguration code to adjust the values based on * the actual workload. */ qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0); qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); qp->q_sc = sc; /* link to the parent */ qp->q_sc->sc_nqueues++; me.queues++; return (0); } /* * Release a reference to the queue. */ static void g_rr_queue_put(struct g_rr_queue *qp) { g_sched_put_class(qp->q_sc->sc_geom, qp); } static void g_rr_fini_class(void *data, void *priv) { struct g_rr_queue *qp = priv; KASSERT(bioq_first(&qp->q_bioq) == NULL, ("released nonempty queue")); qp->q_sc->sc_nqueues--; me.queues--; } static inline int g_rr_queue_expired(struct g_rr_queue *qp) { if (qp->q_service >= qp->q_budget) return (1); if ((qp->q_flags & G_FLAG_COMPLETED) && ticks - qp->q_slice_end >= 0) return (1); return (0); } static inline int g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp) { int wait = get_bounded(&me.wait_ms, 2); if (!me.w_anticipate && (bp->bio_cmd == BIO_WRITE)) return (0); if (g_savg_valid(&qp->q_thinktime) && g_savg_read(&qp->q_thinktime) > wait) return (0); if (g_savg_valid(&qp->q_seekdist) && g_savg_read(&qp->q_seekdist) > 8192) return (0); return (1); } /* * Called on a request arrival, timeout or completion. * Try to serve a request among those queued. */ static struct bio * g_rr_next(void *data, int force) { struct g_rr_softc *sc = data; struct g_rr_queue *qp; struct bio *bp, *next; int expired; qp = sc->sc_active; if (me.bypass == 0 && !force) { if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0)) return (NULL); /* Try with the queue under service first. */ if (qp != NULL && qp->q_status != G_QUEUE_READY) { /* * Queue is anticipating, ignore request. * We should check that we are not past * the timeout, but in that case the timeout * will fire immediately afterwards so we * don't bother. */ return (NULL); } } else if (qp != NULL && qp->q_status != G_QUEUE_READY) { g_rr_queue_put(qp); sc->sc_active = qp = NULL; } /* * No queue under service, look for the first in RR order. * If we find it, select it as sc_active, clear service * and record the end time of the slice. */ if (qp == NULL) { qp = TAILQ_FIRST(&sc->sc_rr_tailq); if (qp == NULL) return (NULL); /* no queues at all, return */ /* otherwise select the new queue for service. */ TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq); sc->sc_active = qp; qp->q_service = 0; qp->q_flags &= ~G_FLAG_COMPLETED; } bp = bioq_takefirst(&qp->q_bioq); /* surely not NULL */ qp->q_service += bp->bio_length; /* charge the service */ /* * The request at the head of the active queue is always * dispatched, and gs_rr_next() will be called again * immediately. * We need to prepare for what to do next: * * 1. have we reached the end of the (time or service) slice ? * If so, clear sc_active and possibly requeue the previous * active queue if it has more requests pending; * 2. do we have more requests in sc_active ? * If yes, do not anticipate, as gs_rr_next() will run again; * if no, decide whether or not to anticipate depending * on reads or writes (e.g., anticipate only on reads). */ expired = g_rr_queue_expired(qp); /* are we expired ? */ next = bioq_first(&qp->q_bioq); /* do we have one more ? */ if (expired) { sc->sc_active = NULL; /* Either requeue or release reference.
*/ if (next != NULL) TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq); else g_rr_queue_put(qp); } else if (next != NULL) { qp->q_status = G_QUEUE_READY; } else { if (!force && g_rr_should_anticipate(qp, bp)) { /* anticipate */ qp->q_status = G_QUEUE_BUSY; } else { /* do not anticipate, release reference */ g_rr_queue_put(qp); sc->sc_active = NULL; } } /* If sc_active != NULL, its q_status is always correct. */ sc->sc_in_flight++; return (bp); } static inline void g_rr_update_thinktime(struct g_rr_queue *qp) { int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2); if (qp->q_sc->sc_active != qp) return; qp->q_lastsub = ticks; delta = (delta > 2 * wait) ? 2 * wait : delta; if (qp->q_bionum > 7) g_savg_add_sample(&qp->q_thinktime, delta); } static inline void g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp) { off_t dist; if (qp->q_lastoff > bp->bio_offset) dist = qp->q_lastoff - bp->bio_offset; else dist = bp->bio_offset - qp->q_lastoff; if (dist > (8192 * 8)) dist = 8192 * 8; qp->q_lastoff = bp->bio_offset + bp->bio_length; if (qp->q_bionum > 7) g_savg_add_sample(&qp->q_seekdist, dist); } /* * Called when a real request for disk I/O arrives. * Locate the queue associated with the client. * If the queue is the one we are anticipating for, reset its timeout; * if the queue is not in the round robin list, insert it in the list. * On any error, do not queue the request and return -1, the caller * will take care of this request. */ static int g_rr_start(void *data, struct bio *bp) { struct g_rr_softc *sc = data; struct g_rr_queue *qp; if (me.bypass) return (-1); /* bypass the scheduler */ /* Get the queue for the request. */ qp = g_rr_queue_get(sc, bp); if (qp == NULL) return (-1); /* allocation failed, tell upstream */ if (bioq_first(&qp->q_bioq) == NULL) { /* * We are inserting into an empty queue. * Reset its state if it is sc_active, * otherwise insert it in the RR list. */ if (qp == sc->sc_active) { qp->q_status = G_QUEUE_READY; callout_stop(&sc->sc_wait); } else { g_sched_priv_ref(qp); TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq); } } qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3); g_rr_update_thinktime(qp); g_rr_update_seekdist(qp, bp); /* Inherit the reference returned by g_rr_queue_get(). */ bp->bio_caller1 = qp; bioq_disksort(&qp->q_bioq, bp); return (0); } /* * Callout executed when a queue times out anticipating a new request. */ static void g_rr_wait_timeout(void *data) { struct g_rr_softc *sc = data; struct g_geom *geom = sc->sc_geom; g_sched_lock(geom); /* * We can race with other events, so check if * sc_active is still valid. */ if (sc->sc_active != NULL) { /* Release the reference to the queue. */ g_rr_queue_put(sc->sc_active); sc->sc_active = NULL; me.wait_hit--; me.wait_miss++; /* record the miss */ } g_sched_dispatch(geom); g_sched_unlock(geom); } /* * Module glue: allocate descriptor, initialize its fields. */ static void * g_rr_init(struct g_geom *geom) { struct g_rr_softc *sc; /* XXX check whether we can sleep */ sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO); sc->sc_geom = geom; TAILQ_INIT(&sc->sc_rr_tailq); callout_init(&sc->sc_wait, 1); LIST_INSERT_HEAD(&me.sc_head, sc, sc_next); me.units++; return (sc); } /* * Module glue -- drain the callout structure, destroy the * hash table and its element, and free the descriptor. 
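 * The ordering below matters: callout_drain() waits for a handler
 * that is already running to finish, so the softc that the handler
 * dereferences may only be freed after it returns.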
*/ static void g_rr_fini(void *data) { struct g_rr_softc *sc = data; callout_drain(&sc->sc_wait); KASSERT(sc->sc_active == NULL, ("still a queue under service")); KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues")); LIST_REMOVE(sc, sc_next); me.units--; free(sc, M_GEOM_SCHED); } /* * Called when the request under service terminates. * Start the anticipation timer if needed. */ static void g_rr_done(void *data, struct bio *bp) { struct g_rr_softc *sc = data; struct g_rr_queue *qp; sc->sc_in_flight--; qp = bp->bio_caller1; /* * When the first request for this queue completes, update the * duration and end of the slice. We do not do it when the * slice starts to avoid charging to the queue the time for * the first seek. */ if (!(qp->q_flags & G_FLAG_COMPLETED)) { qp->q_flags |= G_FLAG_COMPLETED; /* * recompute the slice duration, in case we want * to make it adaptive. This is not used right now. * XXX should we do the same for q_quantum and q_wait_ticks ? */ qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); qp->q_slice_end = ticks + qp->q_slice_duration; } if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) { /* The queue is trying anticipation, start the timer. */ qp->q_status = G_QUEUE_IDLING; /* may make this adaptive */ qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); me.wait_hit++; callout_reset(&sc->sc_wait, qp->q_wait_ticks, g_rr_wait_timeout, sc); } else g_sched_dispatch(sc->sc_geom); /* Release a reference to the queue. */ g_rr_queue_put(qp); } static void g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { if (indent == NULL) { /* plaintext */ sbuf_printf(sb, " units %d queues %d", me.units, me.queues); } } static struct g_gsched g_rr = { .gs_name = "rr", .gs_priv_size = sizeof(struct g_rr_queue), .gs_init = g_rr_init, .gs_fini = g_rr_fini, .gs_start = g_rr_start, .gs_done = g_rr_done, .gs_next = g_rr_next, .gs_dumpconf = g_rr_dumpconf, .gs_init_class = g_rr_init_class, .gs_fini_class = g_rr_fini_class, }; DECLARE_GSCHED_MODULE(rr, &g_rr); Index: head/sys/geom/sched/gs_scheduler.h =================================================================== --- head/sys/geom/sched/gs_scheduler.h (revision 326269) +++ head/sys/geom/sched/gs_scheduler.h (revision 326270) @@ -1,237 +1,239 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009-2010 Fabio Checconi * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * Prototypes for GEOM-based disk scheduling algorithms. * See g_sched.c for generic documentation. * * This file is used by the kernel modules implementing the various * scheduling algorithms. They should provide all the methods * defined in struct g_gsched, and also invoke the macro * DECLARE_GSCHED_MODULE * which registers the scheduling algorithm with the geom_sched module. * * The various scheduling algorithms do not need to know anything * about geom, they only need to handle the 'bio' requests they * receive, pass them down when needed, and use the locking interface * defined below. */ #ifndef _G_GSCHED_H_ #define _G_GSCHED_H_ #ifdef _KERNEL #include #include #include #include #include #include #include "g_sched.h" /* * This is the interface exported to scheduling modules. * * gs_init() is called when our scheduling algorithm * starts being used by a geom 'sched' * * gs_fini() is called when the algorithm is released. * * gs_start() is called when a new request comes in. It should * enqueue the request and return 0 if success, or return non-zero * in case of failure (meaning the request is passed down). * The scheduler can use bio->bio_caller1 to store a non-null * pointer meaning the request is under its control. * * gs_next() is called in a loop by g_sched_dispatch(), right after * gs_start(), or on timeouts or 'done' events. It should return * immediately, either a pointer to the bio to be served or NULL * if no bio should be served now. If force is specified, a * work-conserving behavior is expected. * * gs_done() is called when a request under service completes. * In turn the scheduler may decide to call the dispatch loop * to serve other pending requests (or make sure there is a pending * timeout to avoid stalls). * * gs_init_class() is called when a new client (as determined by * the classifier) starts being used. * * gs_hash_unref() is called right before the class hashtable is * destroyed; after this call, the scheduler is supposed to hold no * more references to the elements in the table. */ /* Forward declarations for prototypes. */ struct g_geom; struct g_sched_class; typedef void *gs_init_t (struct g_geom *geom); typedef void gs_fini_t (void *data); typedef int gs_start_t (void *data, struct bio *bio); typedef void gs_done_t (void *data, struct bio *bio); typedef struct bio *gs_next_t (void *data, int force); typedef int gs_init_class_t (void *data, void *priv); typedef void gs_fini_class_t (void *data, void *priv); typedef void gs_hash_unref_t (void *data); struct g_gsched { const char *gs_name; int gs_refs; int gs_priv_size; gs_init_t *gs_init; gs_fini_t *gs_fini; gs_start_t *gs_start; gs_done_t *gs_done; gs_next_t *gs_next; g_dumpconf_t *gs_dumpconf; gs_init_class_t *gs_init_class; gs_fini_class_t *gs_fini_class; gs_hash_unref_t *gs_hash_unref; LIST_ENTRY(g_gsched) glist; }; #define KTR_GSCHED KTR_SPARE4 MALLOC_DECLARE(M_GEOM_SCHED); /* * Basic classification mechanism. 
Each request is associated to * a g_sched_class, and each scheduler has the opportunity to set * its own private data for the given (class, geom) pair. The * private data have a base type of g_sched_private, and are * extended at the end with the actual private fields of each * scheduler. */ struct g_sched_class { int gsc_refs; int gsc_expire; u_long gsc_key; LIST_ENTRY(g_sched_class) gsc_clist; void *gsc_priv[0]; }; /* * Manipulate the classifier's data. g_sched_get_class() gets a reference * to the class corresponding to bp in gp, allocating and initializing * it if necessary. g_sched_put_class() releases the reference. * The returned value points to the private data for the class. */ void *g_sched_get_class(struct g_geom *gp, struct bio *bp); void g_sched_put_class(struct g_geom *gp, void *priv); static inline struct g_sched_class * g_sched_priv2class(void *priv) { return ((struct g_sched_class *)((u_long)priv - offsetof(struct g_sched_class, gsc_priv))); } static inline void g_sched_priv_ref(void *priv) { struct g_sched_class *gsc; gsc = g_sched_priv2class(priv); gsc->gsc_refs++; } /* * Locking interface. When each operation registered with the * scheduler is invoked, a per-instance lock is taken to protect * the data associated with it. If the scheduler needs something * else to access the same data (e.g., a callout) it must use * these functions. */ void g_sched_lock(struct g_geom *gp); void g_sched_unlock(struct g_geom *gp); /* * Restart request dispatching. Must be called with the per-instance * mutex held. */ void g_sched_dispatch(struct g_geom *geom); /* * Simple gathering of statistical data, used by schedulers to collect * info on process history. Just keep an exponential average of the * samples, with some extra bits of precision. */ struct g_savg { uint64_t gs_avg; unsigned int gs_smpl; }; static inline void g_savg_add_sample(struct g_savg *ss, uint64_t sample) { /* EMA with alpha = 0.125, fixed point, 3 bits of precision. */ ss->gs_avg = sample + ss->gs_avg - (ss->gs_avg >> 3); ss->gs_smpl = 1 + ss->gs_smpl - (ss->gs_smpl >> 3); } static inline int g_savg_valid(struct g_savg *ss) { /* We want at least 8 samples to deem an average as valid. */ return (ss->gs_smpl > 7); } static inline uint64_t g_savg_read(struct g_savg *ss) { return (ss->gs_avg / ss->gs_smpl); } /* * Declaration of a scheduler module. */ int g_gsched_modevent(module_t mod, int cmd, void *arg); #define DECLARE_GSCHED_MODULE(name, gsched) \ static moduledata_t name##_mod = { \ #name, \ g_gsched_modevent, \ gsched, \ }; \ DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); \ MODULE_DEPEND(name, geom_sched, 0, 0, 0); #endif /* _KERNEL */ #endif /* _G_GSCHED_H_ */ Index: head/sys/geom/shsec/g_shsec.c =================================================================== --- head/sys/geom/shsec/g_shsec.c (revision 326269) +++ head/sys/geom/shsec/g_shsec.c (revision 326270) @@ -1,836 +1,838 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_shsec, "GEOM shared secret device support"); static MALLOC_DEFINE(M_SHSEC, "shsec_data", "GEOM_SHSEC Data"); static uma_zone_t g_shsec_zone; static int g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force); static int g_shsec_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_shsec_taste; static g_ctl_req_t g_shsec_config; static g_dumpconf_t g_shsec_dumpconf; static g_init_t g_shsec_init; static g_fini_t g_shsec_fini; struct g_class g_shsec_class = { .name = G_SHSEC_CLASS_NAME, .version = G_VERSION, .ctlreq = g_shsec_config, .taste = g_shsec_taste, .destroy_geom = g_shsec_destroy_geom, .init = g_shsec_init, .fini = g_shsec_fini }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, shsec, CTLFLAG_RW, 0, "GEOM_SHSEC stuff"); static u_int g_shsec_debug = 0; SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, debug, CTLFLAG_RWTUN, &g_shsec_debug, 0, "Debug level"); static u_int g_shsec_maxmem = MAXPHYS * 100; SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, maxmem, CTLFLAG_RDTUN, &g_shsec_maxmem, 0, "Maximum memory that can be allocated for I/O (in bytes)"); static u_int g_shsec_alloc_failed = 0; SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, alloc_failed, CTLFLAG_RD, &g_shsec_alloc_failed, 0, "How many times I/O allocation failed"); /* * Greatest Common Divisor. */ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } static void g_shsec_init(struct g_class *mp __unused) { g_shsec_zone = uma_zcreate("g_shsec_zone", MAXPHYS, NULL, NULL, NULL, NULL, 0, 0); g_shsec_maxmem -= g_shsec_maxmem % MAXPHYS; uma_zone_set_max(g_shsec_zone, g_shsec_maxmem / MAXPHYS); } static void g_shsec_fini(struct g_class *mp __unused) { uma_zdestroy(g_shsec_zone); } /* * Return the number of valid disks. 
*/ static u_int g_shsec_nvalid(struct g_shsec_softc *sc) { u_int i, no; no = 0; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i] != NULL) no++; } return (no); } static void g_shsec_remove_disk(struct g_consumer *cp) { struct g_shsec_softc *sc; u_int no; KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__)); sc = (struct g_shsec_softc *)cp->private; KASSERT(sc != NULL, ("NULL sc in %s.", __func__)); no = cp->index; G_SHSEC_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, sc->sc_name); sc->sc_disks[no] = NULL; if (sc->sc_provider != NULL) { g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; G_SHSEC_DEBUG(0, "Device %s removed.", sc->sc_name); } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); } static void g_shsec_orphan(struct g_consumer *cp) { struct g_shsec_softc *sc; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; g_shsec_remove_disk(cp); /* If there are no valid disks anymore, remove device. */ if (g_shsec_nvalid(sc) == 0) g_shsec_destroy(sc, 1); } static int g_shsec_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *cp1, *cp2; struct g_shsec_softc *sc; struct g_geom *gp; int error; gp = pp->geom; sc = gp->softc; if (sc == NULL) { /* * It looks like geom is being withered. * In that case we allow only negative requests. */ KASSERT(dr <= 0 && dw <= 0 && de <= 0, ("Positive access request (device=%s).", pp->name)); if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0) { G_SHSEC_DEBUG(0, "Device %s definitely destroyed.", gp->name); } return (0); } /* On first open, grab an extra "exclusive" bit */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... and let go of it on last close */ if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0) de--; error = ENXIO; LIST_FOREACH(cp1, &gp->consumer, consumer) { error = g_access(cp1, dr, dw, de); if (error == 0) continue; /* * If we fail here, backout all previous changes. 
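 * The inner loop walks the consumer list again from the head,
 * revoking the counts already granted, and stops (by returning)
 * when it reaches the consumer that failed.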
*/ LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) return (error); g_access(cp2, -dr, -dw, -de); } /* NOTREACHED */ } return (error); } static void g_shsec_xor1(uint32_t *src, uint32_t *dst, ssize_t len) { for (; len > 0; len -= sizeof(uint32_t), dst++) *dst = *dst ^ *src++; KASSERT(len == 0, ("len != 0 (len=%zd)", len)); } static void g_shsec_done(struct bio *bp) { struct g_shsec_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; if (bp->bio_error == 0) G_SHSEC_LOGREQ(2, bp, "Request done."); else { G_SHSEC_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; } if (pbp->bio_cmd == BIO_READ) { if ((pbp->bio_pflags & G_SHSEC_BFLAG_FIRST) != 0) { bcopy(bp->bio_data, pbp->bio_data, pbp->bio_length); pbp->bio_pflags = 0; } else { g_shsec_xor1((uint32_t *)bp->bio_data, (uint32_t *)pbp->bio_data, (ssize_t)pbp->bio_length); } } bzero(bp->bio_data, bp->bio_length); uma_zfree(g_shsec_zone, bp->bio_data); g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_io_deliver(pbp, pbp->bio_error); } } static void g_shsec_xor2(uint32_t *rand, uint32_t *dst, ssize_t len) { for (; len > 0; len -= sizeof(uint32_t), dst++) { *rand = arc4random(); *dst = *dst ^ *rand++; } KASSERT(len == 0, ("len != 0 (len=%zd)", len)); } static void g_shsec_start(struct bio *bp) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); struct g_shsec_softc *sc; struct bio *cbp; uint32_t *dst; ssize_t len; u_int no; int error; sc = bp->bio_to->geom->softc; /* * If sc == NULL, provider's error should be set and g_shsec_start() * should not be called at all. */ KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_SHSEC_LOGREQ(2, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_FLUSH: /* * Only those requests are supported. */ break; case BIO_DELETE: case BIO_GETATTR: /* To which provider it should be delivered? */ default: g_io_deliver(bp, EOPNOTSUPP); return; } /* * Allocate all bios first and calculate XOR. */ dst = NULL; len = bp->bio_length; if (bp->bio_cmd == BIO_READ) bp->bio_pflags = G_SHSEC_BFLAG_FIRST; for (no = 0; no < sc->sc_ndisks; no++) { cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); /* * Fill in the component buf structure. */ cbp->bio_done = g_shsec_done; cbp->bio_data = uma_zalloc(g_shsec_zone, M_NOWAIT); if (cbp->bio_data == NULL) { g_shsec_alloc_failed++; error = ENOMEM; goto failure; } cbp->bio_caller2 = sc->sc_disks[no]; if (bp->bio_cmd == BIO_WRITE) { if (no == 0) { dst = (uint32_t *)cbp->bio_data; bcopy(bp->bio_data, dst, len); } else { g_shsec_xor2((uint32_t *)cbp->bio_data, dst, len); } } } /* * Fire off all allocated requests! 
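 *
 * (At this point, for a write of data D over n disks, disk 0's buffer
 * holds D ^ R1 ^ ... ^ R(n-1) and disks 1..n-1 hold the random shares
 * R1..R(n-1); XOR-ing all n shares together recovers D, which is
 * exactly what the read path reconstructs in g_shsec_done() via
 * g_shsec_xor1(). Any n-1 of the shares reveal nothing about D,
 * which is what makes the secret sharing work.)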
*/ while ((cbp = TAILQ_FIRST(&queue)) != NULL) { struct g_consumer *cp; TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; cbp->bio_to = cp->provider; G_SHSEC_LOGREQ(2, cbp, "Sending request."); g_io_request(cbp, cp); } return; failure: while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); bp->bio_children--; if (cbp->bio_data != NULL) { bzero(cbp->bio_data, cbp->bio_length); uma_zfree(g_shsec_zone, cbp->bio_data); } g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = error; g_io_deliver(bp, bp->bio_error); } static void g_shsec_check_and_run(struct g_shsec_softc *sc) { off_t mediasize, ms; u_int no, sectorsize = 0; if (g_shsec_nvalid(sc) != sc->sc_ndisks) return; sc->sc_provider = g_new_providerf(sc->sc_geom, "shsec/%s", sc->sc_name); /* * Find the smallest disk. */ mediasize = sc->sc_disks[0]->provider->mediasize; mediasize -= sc->sc_disks[0]->provider->sectorsize; sectorsize = sc->sc_disks[0]->provider->sectorsize; for (no = 1; no < sc->sc_ndisks; no++) { ms = sc->sc_disks[no]->provider->mediasize; ms -= sc->sc_disks[no]->provider->sectorsize; if (ms < mediasize) mediasize = ms; sectorsize = lcm(sectorsize, sc->sc_disks[no]->provider->sectorsize); } sc->sc_provider->sectorsize = sectorsize; sc->sc_provider->mediasize = mediasize; g_error_provider(sc->sc_provider, 0); G_SHSEC_DEBUG(0, "Device %s activated.", sc->sc_name); } static int g_shsec_read_metadata(struct g_consumer *cp, struct g_shsec_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ shsec_metadata_decode(buf, md); g_free(buf); return (0); } /* * Add disk to given device. */ static int g_shsec_add_disk(struct g_shsec_softc *sc, struct g_provider *pp, u_int no) { struct g_consumer *cp, *fcp; struct g_geom *gp; struct g_shsec_metadata md; int error; /* Metadata corrupted? */ if (no >= sc->sc_ndisks) return (EINVAL); /* Check if disk is not already attached. */ if (sc->sc_disks[no] != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } /* Reread metadata. */ error = g_shsec_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_SHSEC_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } cp->private = sc; cp->index = no; sc->sc_disks[no] = cp; G_SHSEC_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_shsec_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_shsec_create(struct g_class *mp, const struct g_shsec_metadata *md) { struct g_shsec_softc *sc; struct g_geom *gp; u_int no; G_SHSEC_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* Two disks is minimum. 
*/ if (md->md_all < 2) { G_SHSEC_DEBUG(0, "Too few disks defined for %s.", md->md_name); return (NULL); } /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) { G_SHSEC_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_SHSEC, M_WAITOK | M_ZERO); gp->start = g_shsec_start; gp->spoiled = g_shsec_orphan; gp->orphan = g_shsec_orphan; gp->access = g_shsec_access; gp->dumpconf = g_shsec_dumpconf; sc->sc_id = md->md_id; sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks, M_SHSEC, M_WAITOK | M_ZERO); for (no = 0; no < sc->sc_ndisks; no++) sc->sc_disks[no] = NULL; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; G_SHSEC_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); return (gp); } static int g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force) { struct g_provider *pp; struct g_geom *gp; u_int no; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_SHSEC_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_SHSEC_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } for (no = 0; no < sc->sc_ndisks; no++) { if (sc->sc_disks[no] != NULL) g_shsec_remove_disk(sc->sc_disks[no]); } gp = sc->sc_geom; gp->softc = NULL; KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)", gp->name)); free(sc->sc_disks, M_SHSEC); free(sc, M_SHSEC); pp = LIST_FIRST(&gp->provider); if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)) G_SHSEC_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); return (0); } static int g_shsec_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_shsec_softc *sc; sc = gp->softc; return (g_shsec_destroy(sc, 0)); } static struct g_geom * g_shsec_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_shsec_metadata md; struct g_shsec_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); G_SHSEC_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "shsec:taste"); gp->start = g_shsec_start; gp->access = g_shsec_access; gp->orphan = g_shsec_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_shsec_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0) return (NULL); if (md.md_version > G_SHSEC_VERSION) { G_SHSEC_DEBUG(0, "Kernel module is too old to handle %s.\n", pp->name); return (NULL); } /* * Backward compatibility: */ /* There was no md_provsize field in earlier versions of metadata. */ if (md.md_version < 1) md.md_provsize = pp->mediasize; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != pp->mediasize) return (NULL); /* * Let's check if device already exists. 
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) continue; break; } if (gp != NULL) { G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_shsec_add_disk(sc, pp, md.md_no); if (error != 0) { G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); return (NULL); } } else { gp = g_shsec_create(mp, &md); if (gp == NULL) { G_SHSEC_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_shsec_add_disk(sc, pp, md.md_no); if (error != 0) { G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); g_shsec_destroy(sc, 1); return (NULL); } } return (gp); } static struct g_shsec_softc * g_shsec_find_device(struct g_class *mp, const char *name) { struct g_shsec_softc *sc; struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(sc->sc_name, name) == 0) return (sc); } return (NULL); } static void g_shsec_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_shsec_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_shsec_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_shsec_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } static void g_shsec_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_SHSEC_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "stop") == 0) { g_shsec_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_shsec_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_shsec_softc *sc; sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { sbuf_printf(sb, "%s%u\n", indent, (u_int)cp->index); } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%sTotal=%u, Online=%u\n", indent, sc->sc_ndisks, g_shsec_nvalid(sc)); sbuf_printf(sb, "%s", indent); if (sc->sc_provider != NULL && sc->sc_provider->error == 0) sbuf_printf(sb, "UP"); else sbuf_printf(sb, "DOWN"); sbuf_printf(sb, "\n"); } } DECLARE_GEOM_CLASS(g_shsec_class, g_shsec); Index: head/sys/geom/shsec/g_shsec.h =================================================================== --- head/sys/geom/shsec/g_shsec.h (revision 326269) +++ head/sys/geom/shsec/g_shsec.h (revision 326270) @@ -1,117 +1,119 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_SHSEC_H_ #define _G_SHSEC_H_ #include #define G_SHSEC_CLASS_NAME "SHSEC" #define G_SHSEC_MAGIC "GEOM::SHSEC" /* * Version history: * 0 - Initial version number. * 1 - Added md_provsize field to metadata. */ #define G_SHSEC_VERSION 1 #ifdef _KERNEL #define G_SHSEC_BFLAG_FIRST 0x1 #define G_SHSEC_DEBUG(lvl, ...) do { \ if (g_shsec_debug >= (lvl)) { \ printf("GEOM_SHSEC"); \ if (g_shsec_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_SHSEC_LOGREQ(lvl, bp, ...) do { \ if (g_shsec_debug >= (lvl)) { \ printf("GEOM_SHSEC"); \ if (g_shsec_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) struct g_shsec_softc { u_int sc_type; /* provider type */ struct g_geom *sc_geom; struct g_provider *sc_provider; uint32_t sc_id; /* device unique ID */ struct g_consumer **sc_disks; uint16_t sc_ndisks; }; #define sc_name sc_geom->name #endif /* _KERNEL */ struct g_shsec_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Stripe name. */ uint32_t md_id; /* Unique ID. */ uint16_t md_no; /* Disk number. */ uint16_t md_all; /* Number of all disks. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. 
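 * (On disk, the encoded metadata occupies the last sector of each
 * provider, little-endian, at fixed byte offsets: magic 0, version 16,
 * name 20, id 36, no 40, all 42, provider 44, provsize 60; 68 bytes
 * in total. See shsec_metadata_encode()/shsec_metadata_decode()
 * below.)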
*/ }; static __inline void shsec_metadata_encode(const struct g_shsec_metadata *md, u_char *data) { bcopy(md->md_magic, data, sizeof(md->md_magic)); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, sizeof(md->md_name)); le32enc(data + 36, md->md_id); le16enc(data + 40, md->md_no); le16enc(data + 42, md->md_all); bcopy(md->md_provider, data + 44, sizeof(md->md_provider)); le64enc(data + 60, md->md_provsize); } static __inline void shsec_metadata_decode(const u_char *data, struct g_shsec_metadata *md) { bcopy(data, md->md_magic, sizeof(md->md_magic)); md->md_version = le32dec(data + 16); bcopy(data + 20, md->md_name, sizeof(md->md_name)); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); bcopy(data + 44, md->md_provider, sizeof(md->md_provider)); md->md_provsize = le64dec(data + 60); } #endif /* _G_SHSEC_H_ */ Index: head/sys/geom/stripe/g_stripe.c =================================================================== --- head/sys/geom/stripe/g_stripe.c (revision 326269) +++ head/sys/geom/stripe/g_stripe.c (revision 326270) @@ -1,1270 +1,1272 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/module.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/bio.h> #include <sys/sbuf.h> #include <sys/sysctl.h> #include <sys/malloc.h> #include <vm/uma.h> #include <geom/geom.h> #include <geom/stripe/g_stripe.h> FEATURE(geom_stripe, "GEOM striping support"); static MALLOC_DEFINE(M_STRIPE, "stripe_data", "GEOM_STRIPE Data"); static uma_zone_t g_stripe_zone; static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force); static int g_stripe_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_stripe_taste; static g_ctl_req_t g_stripe_config; static g_dumpconf_t g_stripe_dumpconf; static g_init_t g_stripe_init; static g_fini_t g_stripe_fini; struct g_class g_stripe_class = { .name = G_STRIPE_CLASS_NAME, .version = G_VERSION, .ctlreq = g_stripe_config, .taste = g_stripe_taste, .destroy_geom = g_stripe_destroy_geom, .init = g_stripe_init, .fini = g_stripe_fini }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, stripe, CTLFLAG_RW, 0, "GEOM_STRIPE stuff"); static u_int g_stripe_debug = 0; SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, debug, CTLFLAG_RWTUN, &g_stripe_debug, 0, "Debug level"); static int g_stripe_fast = 0; static int g_sysctl_stripe_fast(SYSCTL_HANDLER_ARGS) { int error, fast; fast = g_stripe_fast; error = sysctl_handle_int(oidp, &fast, 0, req); if (error == 0 && req->newptr != NULL) g_stripe_fast = fast; return (error); } SYSCTL_PROC(_kern_geom_stripe, OID_AUTO, fast, CTLTYPE_INT | CTLFLAG_RWTUN, NULL, 0, g_sysctl_stripe_fast, "I", "Fast, but memory-consuming, mode"); static u_int g_stripe_maxmem = MAXPHYS * 100; SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, maxmem, CTLFLAG_RDTUN, &g_stripe_maxmem, 0, "Maximum memory that can be allocated in \"fast\" mode (in bytes)"); static u_int g_stripe_fast_failed = 0; SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, fast_failed, CTLFLAG_RD, &g_stripe_fast_failed, 0, "How many times \"fast\" mode failed"); /* * Greatest Common Divisor. */ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } static void g_stripe_init(struct g_class *mp __unused) { g_stripe_zone = uma_zcreate("g_stripe_zone", MAXPHYS, NULL, NULL, NULL, NULL, 0, 0); g_stripe_maxmem -= g_stripe_maxmem % MAXPHYS; uma_zone_set_max(g_stripe_zone, g_stripe_maxmem / MAXPHYS); } static void g_stripe_fini(struct g_class *mp __unused) { uma_zdestroy(g_stripe_zone); } /* * Return the number of valid disks. */ static u_int g_stripe_nvalid(struct g_stripe_softc *sc) { u_int i, no; no = 0; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i] != NULL) no++; } return (no); } static void g_stripe_remove_disk(struct g_consumer *cp) { struct g_stripe_softc *sc; g_topology_assert(); KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__)); sc = (struct g_stripe_softc *)cp->geom->softc; KASSERT(sc != NULL, ("NULL sc in %s.", __func__)); if (cp->private == NULL) { G_STRIPE_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, sc->sc_name); cp->private = (void *)(uintptr_t)-1; } if (sc->sc_provider != NULL) { G_STRIPE_DEBUG(0, "Device %s deactivated.", sc->sc_provider->name); g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) return; sc->sc_disks[cp->index] = NULL; cp->index = 0; g_detach(cp); g_destroy_consumer(cp); /* If there are no valid disks anymore, remove device. 
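The destroy triggered from here may re-enter this function for the remaining consumers; g_stripe_destroy() detects that re-entry (its saved next pointer goes NULL) and returns instead of freeing the softc twice.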
*/ if (LIST_EMPTY(&sc->sc_geom->consumer)) g_stripe_destroy(sc, 1); } static void g_stripe_orphan(struct g_consumer *cp) { struct g_stripe_softc *sc; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; g_stripe_remove_disk(cp); } static int g_stripe_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *cp1, *cp2, *tmp; struct g_stripe_softc *sc; struct g_geom *gp; int error; g_topology_assert(); gp = pp->geom; sc = gp->softc; KASSERT(sc != NULL, ("NULL sc in %s.", __func__)); /* On first open, grab an extra "exclusive" bit */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... and let go of it on last close */ if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0) de--; LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) { error = g_access(cp1, dr, dw, de); if (error != 0) goto fail; if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 && cp1->private != NULL) { g_stripe_remove_disk(cp1); /* May destroy geom. */ } } return (0); fail: LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) break; g_access(cp2, -dr, -dw, -de); } return (error); } static void g_stripe_copy(struct g_stripe_softc *sc, char *src, char *dst, off_t offset, off_t length, int mode) { u_int stripesize; size_t len; stripesize = sc->sc_stripesize; len = (size_t)(stripesize - (offset & (stripesize - 1))); do { bcopy(src, dst, len); if (mode) { dst += len + stripesize * (sc->sc_ndisks - 1); src += len; } else { dst += len; src += len + stripesize * (sc->sc_ndisks - 1); } length -= len; KASSERT(length >= 0, ("Length < 0 (stripesize=%zu, offset=%jd, length=%jd).", (size_t)stripesize, (intmax_t)offset, (intmax_t)length)); if (length > stripesize) len = stripesize; else len = length; } while (length > 0); } static void g_stripe_done(struct bio *bp) { struct g_stripe_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; if (bp->bio_cmd == BIO_READ && bp->bio_caller1 != NULL) { g_stripe_copy(sc, bp->bio_data, bp->bio_caller1, bp->bio_offset, bp->bio_length, 1); bp->bio_data = bp->bio_caller1; bp->bio_caller1 = NULL; } mtx_lock(&sc->sc_lock); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += bp->bio_completed; pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { mtx_unlock(&sc->sc_lock); if (pbp->bio_driver1 != NULL) uma_zfree(g_stripe_zone, pbp->bio_driver1); g_io_deliver(pbp, pbp->bio_error); } else mtx_unlock(&sc->sc_lock); g_destroy_bio(bp); } static int g_stripe_start_fast(struct bio *bp, u_int no, off_t offset, off_t length) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); u_int nparts = 0, stripesize; struct g_stripe_softc *sc; char *addr, *data = NULL; struct bio *cbp; int error; sc = bp->bio_to->geom->softc; addr = bp->bio_data; stripesize = sc->sc_stripesize; cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); nparts++; /* * Fill in the component buf structure. 
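* bio_caller2 temporarily carries the consumer this clone is destined
* for and is cleared again just before g_io_request(). bio_caller1
* stays NULL unless the clone is later coalesced and must stage its
* data through the shared "fast" bounce buffer.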
*/ cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; cbp->bio_data = addr; cbp->bio_caller1 = NULL; cbp->bio_length = length; cbp->bio_caller2 = sc->sc_disks[no]; /* offset -= offset % stripesize; */ offset -= offset & (stripesize - 1); addr += length; length = bp->bio_length - length; for (no++; length > 0; no++, length -= stripesize, addr += stripesize) { if (no > sc->sc_ndisks - 1) { no = 0; offset += stripesize; } if (nparts >= sc->sc_ndisks) { cbp = TAILQ_NEXT(cbp, bio_queue); if (cbp == NULL) cbp = TAILQ_FIRST(&queue); nparts++; /* * Update bio structure. */ /* * MIN() is in case when * (bp->bio_length % sc->sc_stripesize) != 0. */ cbp->bio_length += MIN(stripesize, length); if (cbp->bio_caller1 == NULL) { cbp->bio_caller1 = cbp->bio_data; cbp->bio_data = NULL; if (data == NULL) { data = uma_zalloc(g_stripe_zone, M_NOWAIT); if (data == NULL) { error = ENOMEM; goto failure; } } } } else { cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); nparts++; /* * Fill in the component buf structure. */ cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; cbp->bio_data = addr; cbp->bio_caller1 = NULL; /* * MIN() is in case when * (bp->bio_length % sc->sc_stripesize) != 0. */ cbp->bio_length = MIN(stripesize, length); cbp->bio_caller2 = sc->sc_disks[no]; } } if (data != NULL) bp->bio_driver1 = data; /* * Fire off all allocated requests! */ while ((cbp = TAILQ_FIRST(&queue)) != NULL) { struct g_consumer *cp; TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; cbp->bio_to = cp->provider; if (cbp->bio_caller1 != NULL) { cbp->bio_data = data; if (bp->bio_cmd == BIO_WRITE) { g_stripe_copy(sc, cbp->bio_caller1, data, cbp->bio_offset, cbp->bio_length, 0); } data += cbp->bio_length; } G_STRIPE_LOGREQ(cbp, "Sending request."); g_io_request(cbp, cp); } return (0); failure: if (data != NULL) uma_zfree(g_stripe_zone, data); while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); if (cbp->bio_caller1 != NULL) { cbp->bio_data = cbp->bio_caller1; cbp->bio_caller1 = NULL; } bp->bio_children--; g_destroy_bio(cbp); } return (error); } static int g_stripe_start_economic(struct bio *bp, u_int no, off_t offset, off_t length) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); struct g_stripe_softc *sc; uint32_t stripesize; struct bio *cbp; char *addr; int error; sc = bp->bio_to->geom->softc; stripesize = sc->sc_stripesize; cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); /* * Fill in the component buf structure. */ if (bp->bio_length == length) cbp->bio_done = g_std_done; /* Optimized lockless case. */ else cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE; addr = NULL; } else addr = bp->bio_data; cbp->bio_caller2 = sc->sc_disks[no]; /* offset -= offset % stripesize; */ offset -= offset & (stripesize - 1); if (bp->bio_cmd != BIO_DELETE) addr += length; length = bp->bio_length - length; for (no++; length > 0; no++, length -= stripesize) { if (no > sc->sc_ndisks - 1) { no = 0; offset += stripesize; } cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); /* * Fill in the component buf structure. 
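* For BIO_UNMAPPED requests the clone shares the parent's page list,
* so only bio_ma, bio_ma_offset and bio_ma_n are advanced below (addr
* is reused as a plain byte counter); mapped requests advance the
* kernel virtual address instead.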
*/ cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; /* * MIN() is in case when * (bp->bio_length % sc->sc_stripesize) != 0. */ cbp->bio_length = MIN(stripesize, length); if ((bp->bio_flags & BIO_UNMAPPED) != 0) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller2 = sc->sc_disks[no]; if (bp->bio_cmd != BIO_DELETE) addr += stripesize; } /* * Fire off all allocated requests! */ while ((cbp = TAILQ_FIRST(&queue)) != NULL) { struct g_consumer *cp; TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; cbp->bio_to = cp->provider; G_STRIPE_LOGREQ(cbp, "Sending request."); g_io_request(cbp, cp); } return (0); failure: while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); bp->bio_children--; g_destroy_bio(cbp); } return (error); } static void g_stripe_flush(struct g_stripe_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; u_int no; bioq_init(&queue); for (no = 0; no < sc->sc_ndisks; no++) { cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_stripe_done; cbp->bio_caller2 = sc->sc_disks[no]; cbp->bio_to = sc->sc_disks[no]->provider; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); G_STRIPE_LOGREQ(cbp, "Sending request."); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; g_io_request(cbp, cp); } } static void g_stripe_start(struct bio *bp) { off_t offset, start, length, nstripe; struct g_stripe_softc *sc; u_int no, stripesize; int error, fast = 0; sc = bp->bio_to->geom->softc; /* * If sc == NULL, provider's error should be set and g_stripe_start() * should not be called at all. */ KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_STRIPE_LOGREQ(bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_stripe_flush(sc, bp); return; case BIO_GETATTR: /* To which provider it should be delivered? */ default: g_io_deliver(bp, EOPNOTSUPP); return; } stripesize = sc->sc_stripesize; /* * Calculations are quite messy, but fast I hope. */ /* Stripe number. */ /* nstripe = bp->bio_offset / stripesize; */ nstripe = bp->bio_offset >> (off_t)sc->sc_stripebits; /* Disk number. */ no = nstripe % sc->sc_ndisks; /* Start position in stripe. */ /* start = bp->bio_offset % stripesize; */ start = bp->bio_offset & (stripesize - 1); /* Start position in disk. */ /* offset = (nstripe / sc->sc_ndisks) * stripesize + start; */ offset = ((nstripe / sc->sc_ndisks) << sc->sc_stripebits) + start; /* Length of data to operate. */ length = MIN(bp->bio_length, stripesize - start); /* * Do use "fast" mode when: * 1. "Fast" mode is ON. * and * 2. Request size is less than or equal to MAXPHYS, * which should always be true. * and * 3. Request size is at least stripesize * ndisks. If it isn't, * there will be no need to send more than one I/O request to * a provider, so there is nothing to optimize. * and * 4. Request is not unmapped. * and * 5. It is not a BIO_DELETE. 
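* As an illustration (numbers invented for the example): with 4 disks
* and a 16 kB stripesize, a mapped 128 kB read spans 8 stripes, so
* "economic" mode clones 8 bios (two per disk) while "fast" mode
* coalesces them into 4 bios of 32 kB each, at the price of one copy
* through the bounce buffer.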
*/ if (g_stripe_fast && bp->bio_length <= MAXPHYS && bp->bio_length >= stripesize * sc->sc_ndisks && (bp->bio_flags & BIO_UNMAPPED) == 0 && bp->bio_cmd != BIO_DELETE) { fast = 1; } error = 0; if (fast) { error = g_stripe_start_fast(bp, no, offset, length); if (error != 0) g_stripe_fast_failed++; } /* * Do use "economic" when: * 1. "Economic" mode is ON. * or * 2. "Fast" mode failed. It can only fail if there is no memory. */ if (!fast || error != 0) error = g_stripe_start_economic(bp, no, offset, length); if (error != 0) { if (bp->bio_error == 0) bp->bio_error = error; g_io_deliver(bp, bp->bio_error); } } static void g_stripe_check_and_run(struct g_stripe_softc *sc) { struct g_provider *dp; off_t mediasize, ms; u_int no, sectorsize = 0; g_topology_assert(); if (g_stripe_nvalid(sc) != sc->sc_ndisks) return; sc->sc_provider = g_new_providerf(sc->sc_geom, "stripe/%s", sc->sc_name); sc->sc_provider->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; if (g_stripe_fast == 0) sc->sc_provider->flags |= G_PF_ACCEPT_UNMAPPED; /* * Find the smallest disk. */ mediasize = sc->sc_disks[0]->provider->mediasize; if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) mediasize -= sc->sc_disks[0]->provider->sectorsize; mediasize -= mediasize % sc->sc_stripesize; sectorsize = sc->sc_disks[0]->provider->sectorsize; for (no = 1; no < sc->sc_ndisks; no++) { dp = sc->sc_disks[no]->provider; ms = dp->mediasize; if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) ms -= dp->sectorsize; ms -= ms % sc->sc_stripesize; if (ms < mediasize) mediasize = ms; sectorsize = lcm(sectorsize, dp->sectorsize); /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_STRIPE_DEBUG(1, "Cancelling unmapped " "because of %s.", dp->name); sc->sc_provider->flags &= ~G_PF_ACCEPT_UNMAPPED; } } sc->sc_provider->sectorsize = sectorsize; sc->sc_provider->mediasize = mediasize * sc->sc_ndisks; sc->sc_provider->stripesize = sc->sc_stripesize; sc->sc_provider->stripeoffset = 0; g_error_provider(sc->sc_provider, 0); G_STRIPE_DEBUG(0, "Device %s activated.", sc->sc_provider->name); } static int g_stripe_read_metadata(struct g_consumer *cp, struct g_stripe_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ stripe_metadata_decode(buf, md); g_free(buf); return (0); } /* * Add disk to given device. */ static int g_stripe_add_disk(struct g_stripe_softc *sc, struct g_provider *pp, u_int no) { struct g_consumer *cp, *fcp; struct g_geom *gp; int error; g_topology_assert(); /* Metadata corrupted? */ if (no >= sc->sc_ndisks) return (EINVAL); /* Check if disk is not already attached. */ if (sc->sc_disks[no] != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; cp->private = NULL; cp->index = no; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) { struct g_stripe_metadata md; /* Reread metadata. 
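The label could have been rewritten between the taste event and this attach (e.g. the disk was re-labeled for a different array), so the magic, name and id are verified once more below.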
*/ error = g_stripe_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_STRIPE_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } sc->sc_disks[no] = cp; G_STRIPE_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_stripe_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_stripe_create(struct g_class *mp, const struct g_stripe_metadata *md, u_int type) { struct g_stripe_softc *sc; struct g_geom *gp; u_int no; g_topology_assert(); G_STRIPE_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* Two disks are the minimum. */ if (md->md_all < 2) { G_STRIPE_DEBUG(0, "Too few disks defined for %s.", md->md_name); return (NULL); } #if 0 /* Stripe size has to be greater than or equal to the sector size. */ if (md->md_stripesize < sectorsize) { G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name); return (NULL); } #endif /* Stripe size has to be a power of 2. */ if (!powerof2(md->md_stripesize)) { G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name); return (NULL); } /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) { G_STRIPE_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_STRIPE, M_WAITOK | M_ZERO); gp->start = g_stripe_start; gp->spoiled = g_stripe_orphan; gp->orphan = g_stripe_orphan; gp->access = g_stripe_access; gp->dumpconf = g_stripe_dumpconf; sc->sc_id = md->md_id; sc->sc_stripesize = md->md_stripesize; sc->sc_stripebits = bitcount32(sc->sc_stripesize - 1); sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks, M_STRIPE, M_WAITOK | M_ZERO); for (no = 0; no < sc->sc_ndisks; no++) sc->sc_disks[no] = NULL; sc->sc_type = type; mtx_init(&sc->sc_lock, "gstripe lock", NULL, MTX_DEF); gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; G_STRIPE_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); return (gp); } static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force) { struct g_provider *pp; struct g_consumer *cp, *cp1; struct g_geom *gp; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_STRIPE_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_STRIPE_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } gp = sc->sc_geom; LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { g_stripe_remove_disk(cp); if (cp1 == NULL) return (0); /* Recursion happened. */ } if (!LIST_EMPTY(&gp->consumer)) return (EINPROGRESS); gp->softc = NULL; KASSERT(sc->sc_provider == NULL, ("Provider still exists? 
(device=%s)", gp->name)); free(sc->sc_disks, M_STRIPE); mtx_destroy(&sc->sc_lock); free(sc, M_STRIPE); G_STRIPE_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); return (0); } static int g_stripe_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_stripe_softc *sc; sc = gp->softc; return (g_stripe_destroy(sc, 0)); } static struct g_geom * g_stripe_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_stripe_metadata md; struct g_stripe_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); G_STRIPE_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "stripe:taste"); gp->start = g_stripe_start; gp->access = g_stripe_access; gp->orphan = g_stripe_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_stripe_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0) return (NULL); if (md.md_version > G_STRIPE_VERSION) { printf("geom_stripe.ko module is too old to handle %s.\n", pp->name); return (NULL); } /* * Backward compatibility: */ /* There was no md_provider field in earlier versions of metadata. */ if (md.md_version < 2) bzero(md.md_provider, sizeof(md.md_provider)); /* There was no md_provsize field in earlier versions of metadata. */ if (md.md_version < 3) md.md_provsize = pp->mediasize; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != pp->mediasize) return (NULL); /* * Let's check if device already exists. 
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_STRIPE_TYPE_AUTOMATIC) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) continue; break; } if (gp != NULL) { G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_stripe_add_disk(sc, pp, md.md_no); if (error != 0) { G_STRIPE_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); return (NULL); } } else { gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_AUTOMATIC); if (gp == NULL) { G_STRIPE_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_stripe_add_disk(sc, pp, md.md_no); if (error != 0) { G_STRIPE_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); g_stripe_destroy(sc, 1); return (NULL); } } return (gp); } static void g_stripe_ctl_create(struct gctl_req *req, struct g_class *mp) { u_int attached, no; struct g_stripe_metadata md; struct g_provider *pp; struct g_stripe_softc *sc; struct g_geom *gp; struct sbuf *sb; intmax_t *stripesize; const char *name; char param[16]; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 2) { gctl_error(req, "Too few arguments."); return; } strlcpy(md.md_magic, G_STRIPE_MAGIC, sizeof(md.md_magic)); md.md_version = G_STRIPE_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_id = arc4random(); md.md_no = 0; md.md_all = *nargs - 1; stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize)); if (stripesize == NULL) { gctl_error(req, "No '%s' argument.", "stripesize"); return; } md.md_stripesize = *stripesize; bzero(md.md_provider, sizeof(md.md_provider)); /* This field is not important here. 
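Hardcoding a provider only matters for on-disk labels; a manual array is assembled from the command line each time, so this name (and the provider size just below) can stay zeroed.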
*/ md.md_provsize = 0; /* Check all providers are valid */ for (no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", no); return; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); if (pp == NULL) { G_STRIPE_DEBUG(1, "Disk %s is invalid.", name); gctl_error(req, "Disk %s is invalid.", name); return; } } gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't configure %s.", md.md_name); return; } sc = gp->softc; sb = sbuf_new_auto(); sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name); for (attached = 0, no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", no); continue; } if (strncmp(name, "/dev/", strlen("/dev/")) == 0) name += strlen("/dev/"); pp = g_provider_by_name(name); KASSERT(pp != NULL, ("Provider %s disappear?!", name)); if (g_stripe_add_disk(sc, pp, no - 1) != 0) { G_STRIPE_DEBUG(1, "Disk %u (%s) not attached to %s.", no, pp->name, gp->name); sbuf_printf(sb, " %s", pp->name); continue; } attached++; } sbuf_finish(sb); if (md.md_all != attached) { g_stripe_destroy(gp->softc, 1); gctl_error(req, "%s", sbuf_data(sb)); } sbuf_delete(sb); } static struct g_stripe_softc * g_stripe_find_device(struct g_class *mp, const char *name) { struct g_stripe_softc *sc; struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(sc->sc_name, name) == 0) return (sc); } return (NULL); } static void g_stripe_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_stripe_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_stripe_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_stripe_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } static void g_stripe_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_STRIPE_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_stripe_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_stripe_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_stripe_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_stripe_softc *sc; sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. 
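The generic GEOM code already reports the provider's name, size and geometry, so the class adds no per-provider fields of its own.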
*/ } else if (cp != NULL) { sbuf_printf(sb, "%s<Number>%u</Number>\n", indent, (u_int)cp->index); } else { sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s<Stripesize>%u</Stripesize>\n", indent, (u_int)sc->sc_stripesize); sbuf_printf(sb, "%s<Type>", indent); switch (sc->sc_type) { case G_STRIPE_TYPE_AUTOMATIC: sbuf_printf(sb, "AUTOMATIC"); break; case G_STRIPE_TYPE_MANUAL: sbuf_printf(sb, "MANUAL"); break; default: sbuf_printf(sb, "UNKNOWN"); break; } sbuf_printf(sb, "</Type>\n"); sbuf_printf(sb, "%s<Status>Total=%u, Online=%u</Status>\n", indent, sc->sc_ndisks, g_stripe_nvalid(sc)); sbuf_printf(sb, "%s<State>", indent); if (sc->sc_provider != NULL && sc->sc_provider->error == 0) sbuf_printf(sb, "UP"); else sbuf_printf(sb, "DOWN"); sbuf_printf(sb, "</State>\n"); } } DECLARE_GEOM_CLASS(g_stripe_class, g_stripe); Index: head/sys/geom/stripe/g_stripe.h =================================================================== --- head/sys/geom/stripe/g_stripe.h (revision 326269) +++ head/sys/geom/stripe/g_stripe.h (revision 326270) @@ -1,124 +1,126 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_STRIPE_H_ #define _G_STRIPE_H_ #include <sys/endian.h> #define G_STRIPE_CLASS_NAME "STRIPE" #define G_STRIPE_MAGIC "GEOM::STRIPE" /* * Version history: * 0 - Initial version number. * 1 - Added 'stop' command for gstripe(8). * 2 - Added md_provider field to metadata and '-h' option for gstripe(8). * 3 - Added md_provsize field to metadata. */ #define G_STRIPE_VERSION 3 #ifdef _KERNEL #define G_STRIPE_TYPE_MANUAL 0 #define G_STRIPE_TYPE_AUTOMATIC 1 #define G_STRIPE_DEBUG(lvl, ...) do { \ if (g_stripe_debug >= (lvl)) { \ printf("GEOM_STRIPE"); \ if (g_stripe_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_STRIPE_LOGREQ(bp, ...) 
do { \ if (g_stripe_debug >= 2) { \ printf("GEOM_STRIPE[2]: "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) struct g_stripe_softc { u_int sc_type; /* provider type */ struct g_geom *sc_geom; struct g_provider *sc_provider; uint32_t sc_id; /* stripe unique ID */ struct g_consumer **sc_disks; uint16_t sc_ndisks; uint32_t sc_stripesize; uint32_t sc_stripebits; struct mtx sc_lock; }; #define sc_name sc_geom->name #endif /* _KERNEL */ struct g_stripe_metadata { char md_magic[16]; /* Magic value. */ uint32_t md_version; /* Version number. */ char md_name[16]; /* Stripe name. */ uint32_t md_id; /* Unique ID. */ uint16_t md_no; /* Disk number. */ uint16_t md_all; /* Number of all disks. */ uint32_t md_stripesize; /* Stripe size. */ char md_provider[16]; /* Hardcoded provider. */ uint64_t md_provsize; /* Provider's size. */ }; static __inline void stripe_metadata_encode(const struct g_stripe_metadata *md, u_char *data) { bcopy(md->md_magic, data, sizeof(md->md_magic)); le32enc(data + 16, md->md_version); bcopy(md->md_name, data + 20, sizeof(md->md_name)); le32enc(data + 36, md->md_id); le16enc(data + 40, md->md_no); le16enc(data + 42, md->md_all); le32enc(data + 44, md->md_stripesize); bcopy(md->md_provider, data + 48, sizeof(md->md_provider)); le64enc(data + 64, md->md_provsize); } static __inline void stripe_metadata_decode(const u_char *data, struct g_stripe_metadata *md) { bcopy(data, md->md_magic, sizeof(md->md_magic)); md->md_version = le32dec(data + 16); bcopy(data + 20, md->md_name, sizeof(md->md_name)); md->md_id = le32dec(data + 36); md->md_no = le16dec(data + 40); md->md_all = le16dec(data + 42); md->md_stripesize = le32dec(data + 44); bcopy(data + 48, md->md_provider, sizeof(md->md_provider)); md->md_provsize = le64dec(data + 64); } #endif /* _G_STRIPE_H_ */ Index: head/sys/geom/uzip/g_uzip.c =================================================================== --- head/sys/geom/uzip/g_uzip.c (revision 326269) +++ head/sys/geom/uzip/g_uzip.c (revision 326270) @@ -1,922 +1,924 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 Max Khon * Copyright (c) 2014 Juniper Networks, Inc. * Copyright (c) 2006-2016 Maxim Sobolev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_geom.h" MALLOC_DEFINE(M_GEOM_UZIP, "geom_uzip", "GEOM UZIP data structures"); FEATURE(geom_uzip, "GEOM read-only compressed disks support"); struct g_uzip_blk { uint64_t offset; uint32_t blen; unsigned char last:1; unsigned char padded:1; #define BLEN_UNDEF UINT32_MAX }; #ifndef ABS #define ABS(a) ((a) < 0 ? -(a) : (a)) #endif #define BLK_IN_RANGE(mcn, bcn, ilen) \ (((bcn) != BLEN_UNDEF) && ( \ ((ilen) >= 0 && (mcn >= bcn) && (mcn <= ((intmax_t)(bcn) + (ilen)))) || \ ((ilen) < 0 && (mcn <= bcn) && (mcn >= ((intmax_t)(bcn) + (ilen)))) \ )) #ifdef GEOM_UZIP_DEBUG # define GEOM_UZIP_DBG_DEFAULT 3 #else # define GEOM_UZIP_DBG_DEFAULT 0 #endif #define GUZ_DBG_ERR 1 #define GUZ_DBG_INFO 2 #define GUZ_DBG_IO 3 #define GUZ_DBG_TOC 4 #define GUZ_DEV_SUFX ".uzip" #define GUZ_DEV_NAME(p) (p GUZ_DEV_SUFX) static char g_uzip_attach_to[MAXPATHLEN] = {"*"}; static char g_uzip_noattach_to[MAXPATHLEN] = {GUZ_DEV_NAME("*")}; TUNABLE_STR("kern.geom.uzip.attach_to", g_uzip_attach_to, sizeof(g_uzip_attach_to)); TUNABLE_STR("kern.geom.uzip.noattach_to", g_uzip_noattach_to, sizeof(g_uzip_noattach_to)); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, uzip, CTLFLAG_RW, 0, "GEOM_UZIP stuff"); static u_int g_uzip_debug = GEOM_UZIP_DBG_DEFAULT; SYSCTL_UINT(_kern_geom_uzip, OID_AUTO, debug, CTLFLAG_RWTUN, &g_uzip_debug, 0, "Debug level (0-4)"); static u_int g_uzip_debug_block = BLEN_UNDEF; SYSCTL_UINT(_kern_geom_uzip, OID_AUTO, debug_block, CTLFLAG_RWTUN, &g_uzip_debug_block, 0, "Debug operations around specific cluster#"); #define DPRINTF(lvl, a) \ if ((lvl) <= g_uzip_debug) { \ printf a; \ } #define DPRINTF_BLK(lvl, cn, a) \ if ((lvl) <= g_uzip_debug || \ BLK_IN_RANGE(cn, g_uzip_debug_block, 8) || \ BLK_IN_RANGE(cn, g_uzip_debug_block, -8)) { \ printf a; \ } #define DPRINTF_BRNG(lvl, bcn, ecn, a) \ KASSERT(bcn < ecn, ("DPRINTF_BRNG: invalid range (%ju, %ju)", \ (uintmax_t)bcn, (uintmax_t)ecn)); \ if (((lvl) <= g_uzip_debug) || \ BLK_IN_RANGE(g_uzip_debug_block, bcn, \ (intmax_t)ecn - (intmax_t)bcn)) { \ printf a; \ } #define UZIP_CLASS_NAME "UZIP" /* * Maximum allowed valid block size (to prevent foot-shooting) */ #define MAX_BLKSZ (MAXPHYS) static char CLOOP_MAGIC_START[] = "#!/bin/sh\n"; static void g_uzip_read_done(struct bio *bp); static void g_uzip_do(struct g_uzip_softc *, struct bio *bp); static void g_uzip_softc_free(struct g_uzip_softc *sc, struct g_geom *gp) { if (gp != NULL) { DPRINTF(GUZ_DBG_INFO, ("%s: %d requests, %d cached\n", gp->name, sc->req_total, sc->req_cached)); } mtx_lock(&sc->queue_mtx); sc->wrkthr_flags |= GUZ_SHUTDOWN; wakeup(sc); while (!(sc->wrkthr_flags & GUZ_EXITING)) { msleep(sc->procp, &sc->queue_mtx, PRIBIO, "guzfree", hz / 10); } mtx_unlock(&sc->queue_mtx); sc->dcp->free(sc->dcp); free(sc->toc, M_GEOM_UZIP); mtx_destroy(&sc->queue_mtx); mtx_destroy(&sc->last_mtx); free(sc->last_buf, M_GEOM_UZIP); free(sc, M_GEOM_UZIP); } static int g_uzip_cached(struct g_geom *gp, struct bio *bp) { struct g_uzip_softc *sc; off_t ofs; size_t blk, blkofs, usz; sc = gp->softc; ofs = bp->bio_offset + bp->bio_completed; blk = ofs / sc->blksz; mtx_lock(&sc->last_mtx); if (blk == sc->last_blk) { blkofs = ofs % sc->blksz; usz = sc->blksz - blkofs; if (bp->bio_resid < usz) usz = bp->bio_resid; memcpy(bp->bio_data + bp->bio_completed, sc->last_buf + blkofs, usz); sc->req_cached++; 
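/* Note: the memcpy() above was performed with last_mtx still held, so the cached block could not have been replaced in the middle of the copy. */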
mtx_unlock(&sc->last_mtx); DPRINTF(GUZ_DBG_IO, ("%s/%s: %p: offset=%jd: got %jd bytes " "from cache\n", __func__, gp->name, bp, (intmax_t)ofs, (intmax_t)usz)); bp->bio_completed += usz; bp->bio_resid -= usz; if (bp->bio_resid == 0) { g_io_deliver(bp, 0); return (1); } } else mtx_unlock(&sc->last_mtx); return (0); } #define BLK_ENDS(sc, bi) ((sc)->toc[(bi)].offset + \ (sc)->toc[(bi)].blen) #define BLK_IS_CONT(sc, bi) (BLK_ENDS((sc), (bi) - 1) == \ (sc)->toc[(bi)].offset) #define BLK_IS_NIL(sc, bi) ((sc)->toc[(bi)].blen == 0) #define TOFF_2_BOFF(sc, pp, bi) ((sc)->toc[(bi)].offset - \ (sc)->toc[(bi)].offset % (pp)->sectorsize) #define TLEN_2_BLEN(sc, pp, bp, ei) roundup(BLK_ENDS((sc), (ei)) - \ (bp)->bio_offset, (pp)->sectorsize) static int g_uzip_request(struct g_geom *gp, struct bio *bp) { struct g_uzip_softc *sc; struct bio *bp2; struct g_consumer *cp; struct g_provider *pp; off_t ofs, start_blk_ofs; size_t i, start_blk, end_blk, zsize; if (g_uzip_cached(gp, bp) != 0) return (1); sc = gp->softc; cp = LIST_FIRST(&gp->consumer); pp = cp->provider; ofs = bp->bio_offset + bp->bio_completed; start_blk = ofs / sc->blksz; KASSERT(start_blk < sc->nblocks, ("start_blk out of range")); end_blk = howmany(ofs + bp->bio_resid, sc->blksz); KASSERT(end_blk <= sc->nblocks, ("end_blk out of range")); for (; BLK_IS_NIL(sc, start_blk) && start_blk < end_blk; start_blk++) { /* Fill in any leading Nil blocks */ start_blk_ofs = ofs % sc->blksz; zsize = MIN(sc->blksz - start_blk_ofs, bp->bio_resid); DPRINTF_BLK(GUZ_DBG_IO, start_blk, ("%s/%s: %p/%ju: " "filling %ju zero bytes\n", __func__, gp->name, gp, (uintmax_t)bp->bio_completed, (uintmax_t)zsize)); bzero(bp->bio_data + bp->bio_completed, zsize); bp->bio_completed += zsize; bp->bio_resid -= zsize; ofs += zsize; } if (start_blk == end_blk) { KASSERT(bp->bio_resid == 0, ("bp->bio_resid is invalid")); /* * No non-Nil data is left, complete request immediately. 
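* A cluster whose TOC length is zero never touches the media at all:
* it is synthesized as zeros by the loop above, which is how sparse
* ("Nil") blocks are represented.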
*/ DPRINTF(GUZ_DBG_IO, ("%s/%s: %p: all done returning %ju " "bytes\n", __func__, gp->name, gp, (uintmax_t)bp->bio_completed)); g_io_deliver(bp, 0); return (1); } for (i = start_blk + 1; i < end_blk; i++) { /* Trim discontinuous areas if any */ if (!BLK_IS_CONT(sc, i)) { end_blk = i; break; } } DPRINTF_BRNG(GUZ_DBG_IO, start_blk, end_blk, ("%s/%s: %p: " "start=%u (%ju[%jd]), end=%u (%ju)\n", __func__, gp->name, bp, (u_int)start_blk, (uintmax_t)sc->toc[start_blk].offset, (intmax_t)sc->toc[start_blk].blen, (u_int)end_blk, (uintmax_t)BLK_ENDS(sc, end_blk - 1))); bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return (1); } bp2->bio_done = g_uzip_read_done; bp2->bio_offset = TOFF_2_BOFF(sc, pp, start_blk); while (1) { bp2->bio_length = TLEN_2_BLEN(sc, pp, bp2, end_blk - 1); if (bp2->bio_length <= MAXPHYS) { break; } if (end_blk == (start_blk + 1)) { break; } end_blk--; } DPRINTF(GUZ_DBG_IO, ("%s/%s: bp2->bio_length = %jd, " "bp2->bio_offset = %jd\n", __func__, gp->name, (intmax_t)bp2->bio_length, (intmax_t)bp2->bio_offset)); bp2->bio_data = malloc(bp2->bio_length, M_GEOM_UZIP, M_NOWAIT); if (bp2->bio_data == NULL) { g_destroy_bio(bp2); g_io_deliver(bp, ENOMEM); return (1); } DPRINTF_BRNG(GUZ_DBG_IO, start_blk, end_blk, ("%s/%s: %p: " "reading %jd bytes from offset %jd\n", __func__, gp->name, bp, (intmax_t)bp2->bio_length, (intmax_t)bp2->bio_offset)); g_io_request(bp2, cp); return (0); } static void g_uzip_read_done(struct bio *bp) { struct bio *bp2; struct g_geom *gp; struct g_uzip_softc *sc; bp2 = bp->bio_parent; gp = bp2->bio_to->geom; sc = gp->softc; mtx_lock(&sc->queue_mtx); bioq_disksort(&sc->bio_queue, bp); mtx_unlock(&sc->queue_mtx); wakeup(sc); } static int g_uzip_memvcmp(const void *memory, unsigned char val, size_t size) { const u_char *mm; mm = (const u_char *)memory; return (*mm == val) && memcmp(mm, mm + 1, size - 1) == 0; } static void g_uzip_do(struct g_uzip_softc *sc, struct bio *bp) { struct bio *bp2; struct g_provider *pp; struct g_consumer *cp; struct g_geom *gp; char *data, *data2; off_t ofs; size_t blk, blkofs, len, ulen, firstblk; int err; bp2 = bp->bio_parent; gp = bp2->bio_to->geom; cp = LIST_FIRST(&gp->consumer); pp = cp->provider; bp2->bio_error = bp->bio_error; if (bp2->bio_error != 0) goto done; /* Make sure there's forward progress. */ if (bp->bio_completed == 0) { bp2->bio_error = ECANCELED; goto done; } ofs = bp2->bio_offset + bp2->bio_completed; firstblk = blk = ofs / sc->blksz; blkofs = ofs % sc->blksz; data = bp->bio_data + sc->toc[blk].offset % pp->sectorsize; data2 = bp2->bio_data + bp2->bio_completed; while (bp->bio_completed && bp2->bio_resid) { if (blk > firstblk && !BLK_IS_CONT(sc, blk)) { DPRINTF_BLK(GUZ_DBG_IO, blk, ("%s/%s: %p: backref'ed " "cluster #%u requested, looping around\n", __func__, gp->name, bp2, (u_int)blk)); goto done; } ulen = MIN(sc->blksz - blkofs, bp2->bio_resid); len = sc->toc[blk].blen; DPRINTF(GUZ_DBG_IO, ("%s/%s: %p/%ju: data2=%p, ulen=%u, " "data=%p, len=%u\n", __func__, gp->name, gp, bp->bio_completed, data2, (u_int)ulen, data, (u_int)len)); if (len == 0) { /* All zero block: no cache update */ zero_block: bzero(data2, ulen); } else if (len <= bp->bio_completed) { mtx_lock(&sc->last_mtx); err = sc->dcp->decompress(sc->dcp, gp->name, data, len, sc->last_buf); if (err != 0 && sc->toc[blk].last != 0) { /* * Last block decompression has failed, check * if it's just zero padding. 
*/ if (g_uzip_memvcmp(data, '\0', len) == 0) { sc->toc[blk].blen = 0; sc->last_blk = -1; mtx_unlock(&sc->last_mtx); len = 0; goto zero_block; } } if (err != 0) { sc->last_blk = -1; mtx_unlock(&sc->last_mtx); bp2->bio_error = EILSEQ; DPRINTF(GUZ_DBG_ERR, ("%s/%s: decompress" "(%p, %ju, %ju) failed\n", __func__, gp->name, sc->dcp, (uintmax_t)blk, (uintmax_t)len)); goto done; } sc->last_blk = blk; memcpy(data2, sc->last_buf + blkofs, ulen); mtx_unlock(&sc->last_mtx); err = sc->dcp->rewind(sc->dcp, gp->name); if (err != 0) { bp2->bio_error = EILSEQ; DPRINTF(GUZ_DBG_ERR, ("%s/%s: rewind(%p) " "failed\n", __func__, gp->name, sc->dcp)); goto done; } data += len; } else break; data2 += ulen; bp2->bio_completed += ulen; bp2->bio_resid -= ulen; bp->bio_completed -= len; blkofs = 0; blk++; } done: /* Finish processing the request. */ free(bp->bio_data, M_GEOM_UZIP); g_destroy_bio(bp); if (bp2->bio_error != 0 || bp2->bio_resid == 0) g_io_deliver(bp2, bp2->bio_error); else g_uzip_request(gp, bp2); } static void g_uzip_start(struct bio *bp) { struct g_provider *pp; struct g_geom *gp; struct g_uzip_softc *sc; pp = bp->bio_to; gp = pp->geom; DPRINTF(GUZ_DBG_IO, ("%s/%s: %p: cmd=%d, offset=%jd, length=%jd, " "buffer=%p\n", __func__, gp->name, bp, bp->bio_cmd, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length, bp->bio_data)); sc = gp->softc; sc->req_total++; if (bp->bio_cmd == BIO_GETATTR) { struct bio *bp2; struct g_consumer *cp; struct g_geom *gp; struct g_provider *pp; /* pass on MNT:* requests and ignore others */ if (strncmp(bp->bio_attribute, "MNT:", 4) == 0) { bp2 = g_clone_bio(bp); if (bp2 == NULL) { g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_std_done; pp = bp->bio_to; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); g_io_request(bp2, cp); return; } } if (bp->bio_cmd != BIO_READ) { g_io_deliver(bp, EOPNOTSUPP); return; } bp->bio_resid = bp->bio_length; bp->bio_completed = 0; g_uzip_request(gp, bp); } static void g_uzip_orphan(struct g_consumer *cp) { struct g_geom *gp; g_trace(G_T_TOPOLOGY, "%s(%p/%s)", __func__, cp, cp->provider->name); g_topology_assert(); gp = cp->geom; g_uzip_softc_free(gp->softc, gp); gp->softc = NULL; g_wither_geom(gp, ENXIO); } static int g_uzip_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); KASSERT (cp != NULL, ("g_uzip_access but no consumer")); if (cp->acw + dw > 0) return (EROFS); return (g_access(cp, dr, dw, de)); } static void g_uzip_spoiled(struct g_consumer *cp) { struct g_geom *gp; G_VALID_CONSUMER(cp); gp = cp->geom; g_trace(G_T_TOPOLOGY, "%s(%p/%s)", __func__, cp, gp->name); g_topology_assert(); g_uzip_softc_free(gp->softc, gp); gp->softc = NULL; g_wither_geom(gp, ENXIO); } static int g_uzip_parse_toc(struct g_uzip_softc *sc, struct g_provider *pp, struct g_geom *gp) { uint32_t i, j, backref_to; uint64_t max_offset, min_offset; struct g_uzip_blk *last_blk; min_offset = sizeof(struct cloop_header) + (sc->nblocks + 1) * sizeof(uint64_t); max_offset = sc->toc[0].offset - 1; last_blk = &sc->toc[0]; for (i = 0; i < sc->nblocks; i++) { /* First do some bounds checking */ if ((sc->toc[i].offset < min_offset) || (sc->toc[i].offset > pp->mediasize)) { goto error_offset; } DPRINTF_BLK(GUZ_DBG_IO, i, ("%s: cluster #%u " "offset=%ju max_offset=%ju\n", gp->name, (u_int)i, (uintmax_t)sc->toc[i].offset, (uintmax_t)max_offset)); backref_to = BLEN_UNDEF; if (sc->toc[i].offset < max_offset) { /* * For the backref'ed blocks search already parsed * TOC entries for the matching 
offset and copy the * size from matched entry. */ for (j = 0; j <= i; j++) { if (sc->toc[j].offset == sc->toc[i].offset && !BLK_IS_NIL(sc, j)) { break; } if (j != i) { continue; } DPRINTF(GUZ_DBG_ERR, ("%s: cannot match " "backref'ed offset at cluster #%u\n", gp->name, i)); return (-1); } sc->toc[i].blen = sc->toc[j].blen; backref_to = j; } else { last_blk = &sc->toc[i]; /* * For the "normal blocks" seek forward until we hit * block whose offset is larger than ours and assume * it's going to be the next one. */ for (j = i + 1; j < sc->nblocks; j++) { if (sc->toc[j].offset > max_offset) { break; } } sc->toc[i].blen = sc->toc[j].offset - sc->toc[i].offset; if (BLK_ENDS(sc, i) > pp->mediasize) { DPRINTF(GUZ_DBG_ERR, ("%s: cluster #%u " "extends past media boundary (%ju > %ju)\n", gp->name, (u_int)i, (uintmax_t)BLK_ENDS(sc, i), (intmax_t)pp->mediasize)); return (-1); } KASSERT(max_offset <= sc->toc[i].offset, ( "%s: max_offset is incorrect: %ju", gp->name, (uintmax_t)max_offset)); max_offset = BLK_ENDS(sc, i) - 1; } DPRINTF_BLK(GUZ_DBG_TOC, i, ("%s: cluster #%u, original %u " "bytes, in %u bytes", gp->name, i, sc->blksz, sc->toc[i].blen)); if (backref_to != BLEN_UNDEF) { DPRINTF_BLK(GUZ_DBG_TOC, i, (" (->#%u)", (u_int)backref_to)); } DPRINTF_BLK(GUZ_DBG_TOC, i, ("\n")); } last_blk->last = 1; /* Do a second pass to validate block lengths */ for (i = 0; i < sc->nblocks; i++) { if (sc->toc[i].blen > sc->dcp->max_blen) { if (sc->toc[i].last == 0) { DPRINTF(GUZ_DBG_ERR, ("%s: cluster #%u " "length (%ju) exceeds " "max_blen (%ju)\n", gp->name, i, (uintmax_t)sc->toc[i].blen, (uintmax_t)sc->dcp->max_blen)); return (-1); } DPRINTF(GUZ_DBG_INFO, ("%s: cluster #%u extra " "padding is detected, trimmed to %ju\n", gp->name, i, (uintmax_t)sc->dcp->max_blen)); sc->toc[i].blen = sc->dcp->max_blen; sc->toc[i].padded = 1; } } return (0); error_offset: DPRINTF(GUZ_DBG_ERR, ("%s: cluster #%u: invalid offset %ju, " "min_offset=%ju mediasize=%jd\n", gp->name, (u_int)i, sc->toc[i].offset, min_offset, pp->mediasize)); return (-1); } static struct g_geom * g_uzip_taste(struct g_class *mp, struct g_provider *pp, int flags) { int error; uint32_t i, total_offsets, offsets_read, blk; void *buf; struct cloop_header *header; struct g_consumer *cp; struct g_geom *gp; struct g_provider *pp2; struct g_uzip_softc *sc; enum { G_UZIP = 1, G_ULZMA } type; g_trace(G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, pp->name); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); if ((fnmatch(g_uzip_attach_to, pp->name, 0) != 0) || (fnmatch(g_uzip_noattach_to, pp->name, 0) == 0)) { DPRINTF(GUZ_DBG_INFO, ("%s(%s,%s), ignoring\n", __func__, mp->name, pp->name)); return (NULL); } buf = NULL; /* * Create geom instance. */ gp = g_new_geomf(mp, GUZ_DEV_NAME("%s"), pp->name); cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error == 0) error = g_access(cp, 1, 0, 0); if (error) { goto e1; } g_topology_unlock(); /* * Read cloop header, look for CLOOP magic, perform * other validity checks. 
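* "Other validity checks" here means: the #!/bin/sh magic prefix, the
* compression-format byte (zlib or LZMA) and that format's minimum
* on-disk version, all before any TOC offset is trusted.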
*/ DPRINTF(GUZ_DBG_INFO, ("%s: media sectorsize %u, mediasize %jd\n", gp->name, pp->sectorsize, (intmax_t)pp->mediasize)); buf = g_read_data(cp, 0, pp->sectorsize, NULL); if (buf == NULL) goto e2; header = (struct cloop_header *) buf; if (strncmp(header->magic, CLOOP_MAGIC_START, sizeof(CLOOP_MAGIC_START) - 1) != 0) { DPRINTF(GUZ_DBG_ERR, ("%s: no CLOOP magic\n", gp->name)); goto e3; } switch (header->magic[CLOOP_OFS_COMPR]) { case CLOOP_COMP_LZMA: case CLOOP_COMP_LZMA_DDP: type = G_ULZMA; if (header->magic[CLOOP_OFS_VERSN] < CLOOP_MINVER_LZMA) { DPRINTF(GUZ_DBG_ERR, ("%s: image version too old\n", gp->name)); goto e3; } DPRINTF(GUZ_DBG_INFO, ("%s: GEOM_UZIP_LZMA image found\n", gp->name)); break; case CLOOP_COMP_LIBZ: case CLOOP_COMP_LIBZ_DDP: type = G_UZIP; if (header->magic[CLOOP_OFS_VERSN] < CLOOP_MINVER_ZLIB) { DPRINTF(GUZ_DBG_ERR, ("%s: image version too old\n", gp->name)); goto e3; } DPRINTF(GUZ_DBG_INFO, ("%s: GEOM_UZIP_ZLIB image found\n", gp->name)); break; default: DPRINTF(GUZ_DBG_ERR, ("%s: unsupported image type\n", gp->name)); goto e3; } /* * Initialize softc and read offsets. */ sc = malloc(sizeof(*sc), M_GEOM_UZIP, M_WAITOK | M_ZERO); gp->softc = sc; sc->blksz = ntohl(header->blksz); sc->nblocks = ntohl(header->nblocks); if (sc->blksz % 512 != 0) { printf("%s: block size (%u) should be multiple of 512.\n", gp->name, sc->blksz); goto e4; } if (sc->blksz > MAX_BLKSZ) { printf("%s: block size (%u) should not be larger than %d.\n", gp->name, sc->blksz, MAX_BLKSZ); } total_offsets = sc->nblocks + 1; if (sizeof(struct cloop_header) + total_offsets * sizeof(uint64_t) > pp->mediasize) { printf("%s: media too small for %u blocks\n", gp->name, sc->nblocks); goto e4; } sc->toc = malloc(total_offsets * sizeof(struct g_uzip_blk), M_GEOM_UZIP, M_WAITOK | M_ZERO); offsets_read = MIN(total_offsets, (pp->sectorsize - sizeof(*header)) / sizeof(uint64_t)); for (i = 0; i < offsets_read; i++) { sc->toc[i].offset = be64toh(((uint64_t *) (header + 1))[i]); sc->toc[i].blen = BLEN_UNDEF; } DPRINTF(GUZ_DBG_INFO, ("%s: %u offsets in the first sector\n", gp->name, offsets_read)); for (blk = 1; offsets_read < total_offsets; blk++) { uint32_t nread; free(buf, M_GEOM); buf = g_read_data( cp, blk * pp->sectorsize, pp->sectorsize, NULL); if (buf == NULL) goto e5; nread = MIN(total_offsets - offsets_read, pp->sectorsize / sizeof(uint64_t)); DPRINTF(GUZ_DBG_TOC, ("%s: %u offsets read from sector %d\n", gp->name, nread, blk)); for (i = 0; i < nread; i++) { sc->toc[offsets_read + i].offset = be64toh(((uint64_t *) buf)[i]); sc->toc[offsets_read + i].blen = BLEN_UNDEF; } offsets_read += nread; } free(buf, M_GEOM); buf = NULL; offsets_read -= 1; DPRINTF(GUZ_DBG_INFO, ("%s: done reading %u block offsets from %u " "sectors\n", gp->name, offsets_read, blk)); if (sc->nblocks != offsets_read) { DPRINTF(GUZ_DBG_ERR, ("%s: read %s offsets than expected " "blocks\n", gp->name, sc->nblocks < offsets_read ? "more" : "less")); goto e5; } if (type == G_UZIP) { sc->dcp = g_uzip_zlib_ctor(sc->blksz); } else { sc->dcp = g_uzip_lzma_ctor(sc->blksz); } if (sc->dcp == NULL) { goto e5; } /* * "Fake" last+1 block, to make it easier for the TOC parser to * iterate without making the last element a special case. 
*/ sc->toc[sc->nblocks].offset = pp->mediasize; /* Massage TOC (table of contents), make sure it is sound */ if (g_uzip_parse_toc(sc, pp, gp) != 0) { DPRINTF(GUZ_DBG_ERR, ("%s: TOC error\n", gp->name)); goto e6; } mtx_init(&sc->last_mtx, "geom_uzip cache", NULL, MTX_DEF); mtx_init(&sc->queue_mtx, "geom_uzip wrkthread", NULL, MTX_DEF); bioq_init(&sc->bio_queue); sc->last_blk = -1; sc->last_buf = malloc(sc->blksz, M_GEOM_UZIP, M_WAITOK); sc->req_total = 0; sc->req_cached = 0; sc->uzip_do = &g_uzip_do; error = kproc_create(g_uzip_wrkthr, sc, &sc->procp, 0, 0, "%s", gp->name); if (error != 0) { goto e7; } g_topology_lock(); pp2 = g_new_providerf(gp, "%s", gp->name); pp2->sectorsize = 512; pp2->mediasize = (off_t)sc->nblocks * sc->blksz; pp2->stripesize = pp->stripesize; pp2->stripeoffset = pp->stripeoffset; g_error_provider(pp2, 0); g_access(cp, -1, 0, 0); DPRINTF(GUZ_DBG_INFO, ("%s: taste ok (%d, %jd), (%d, %d), %x\n", gp->name, pp2->sectorsize, (intmax_t)pp2->mediasize, pp2->stripeoffset, pp2->stripesize, pp2->flags)); DPRINTF(GUZ_DBG_INFO, ("%s: %u x %u blocks\n", gp->name, sc->nblocks, sc->blksz)); return (gp); e7: free(sc->last_buf, M_GEOM); mtx_destroy(&sc->queue_mtx); mtx_destroy(&sc->last_mtx); e6: sc->dcp->free(sc->dcp); e5: free(sc->toc, M_GEOM); e4: free(gp->softc, M_GEOM_UZIP); e3: if (buf != NULL) { free(buf, M_GEOM); } e2: g_topology_lock(); g_access(cp, -1, 0, 0); e1: g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } static int g_uzip_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct g_provider *pp; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, gp->name); g_topology_assert(); if (gp->softc == NULL) { DPRINTF(GUZ_DBG_ERR, ("%s(%s): gp->softc == NULL\n", __func__, gp->name)); return (ENXIO); } KASSERT(gp != NULL, ("NULL geom")); pp = LIST_FIRST(&gp->provider); KASSERT(pp != NULL, ("NULL provider")); if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) return (EBUSY); g_uzip_softc_free(gp->softc, gp); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (0); } static struct g_class g_uzip_class = { .name = UZIP_CLASS_NAME, .version = G_VERSION, .taste = g_uzip_taste, .destroy_geom = g_uzip_destroy_geom, .start = g_uzip_start, .orphan = g_uzip_orphan, .access = g_uzip_access, .spoiled = g_uzip_spoiled, }; DECLARE_GEOM_CLASS(g_uzip_class, g_uzip); MODULE_DEPEND(g_uzip, zlib, 1, 1, 1); Index: head/sys/geom/vinum/geom_vinum.c =================================================================== --- head/sys/geom/vinum/geom_vinum.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum.c (revision 326270) @@ -1,1048 +1,1050 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, vinum, CTLFLAG_RW, 0, "GEOM_VINUM stuff"); u_int g_vinum_debug = 0; SYSCTL_UINT(_kern_geom_vinum, OID_AUTO, debug, CTLFLAG_RWTUN, &g_vinum_debug, 0, "Debug level"); static int gv_create(struct g_geom *, struct gctl_req *); static void gv_attach(struct gv_softc *, struct gctl_req *); static void gv_detach(struct gv_softc *, struct gctl_req *); static void gv_parityop(struct gv_softc *, struct gctl_req *); static void gv_orphan(struct g_consumer *cp) { struct g_geom *gp; struct gv_softc *sc; struct gv_drive *d; g_topology_assert(); KASSERT(cp != NULL, ("gv_orphan: null cp")); gp = cp->geom; KASSERT(gp != NULL, ("gv_orphan: null gp")); sc = gp->softc; KASSERT(sc != NULL, ("gv_orphan: null sc")); d = cp->private; KASSERT(d != NULL, ("gv_orphan: null d")); g_trace(G_T_TOPOLOGY, "gv_orphan(%s)", gp->name); gv_post_event(sc, GV_EVENT_DRIVE_LOST, d, NULL, 0, 0); } void gv_start(struct bio *bp) { struct g_geom *gp; struct gv_softc *sc; gp = bp->bio_to->geom; sc = gp->softc; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_down, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } void gv_done(struct bio *bp) { struct g_geom *gp; struct gv_softc *sc; KASSERT(bp != NULL, ("NULL bp")); gp = bp->bio_from->geom; sc = gp->softc; mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_up, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } int gv_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct gv_softc *sc; struct gv_drive *d, *d2; int error; gp = pp->geom; sc = gp->softc; /* * We want to modify the read count with the write count in case we have * plexes in a RAID-5 organization. 
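 *
 * (A RAID-5 write is internally a read-modify-write cycle: the old data
 * and parity blocks must be read back to compute the new parity before
 * anything is written, so each write reference needs a matching read
 * reference.  E.g. an open with (dr=0, dw=1, de=0) reaches every drive
 * below as g_access(d->consumer, 1, 1, 0).)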
*/ dr += dw; LIST_FOREACH(d, &sc->drives, drive) { if (d->consumer == NULL) continue; error = g_access(d->consumer, dr, dw, de); if (error) { LIST_FOREACH(d2, &sc->drives, drive) { if (d == d2) break; g_access(d2->consumer, -dr, -dw, -de); } G_VINUM_DEBUG(0, "g_access '%s' failed: %d", d->name, error); return (error); } } return (0); } static void gv_init(struct g_class *mp) { struct g_geom *gp; struct gv_softc *sc; g_trace(G_T_TOPOLOGY, "gv_init(%p)", mp); gp = g_new_geomf(mp, "VINUM"); gp->spoiled = gv_orphan; gp->orphan = gv_orphan; gp->access = gv_access; gp->start = gv_start; gp->softc = g_malloc(sizeof(struct gv_softc), M_WAITOK | M_ZERO); sc = gp->softc; sc->geom = gp; sc->bqueue_down = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); sc->bqueue_up = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(sc->bqueue_down); bioq_init(sc->bqueue_up); LIST_INIT(&sc->drives); LIST_INIT(&sc->subdisks); LIST_INIT(&sc->plexes); LIST_INIT(&sc->volumes); TAILQ_INIT(&sc->equeue); mtx_init(&sc->config_mtx, "gv_config", NULL, MTX_DEF); mtx_init(&sc->equeue_mtx, "gv_equeue", NULL, MTX_DEF); mtx_init(&sc->bqueue_mtx, "gv_bqueue", NULL, MTX_DEF); kproc_create(gv_worker, sc, &sc->worker, 0, 0, "gv_worker"); } static int gv_unload(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { struct gv_softc *sc; g_trace(G_T_TOPOLOGY, "gv_unload(%p)", mp); g_topology_assert(); sc = gp->softc; if (sc != NULL) { gv_worker_exit(sc); gp->softc = NULL; g_wither_geom(gp, ENXIO); } return (0); } /* Handle userland request of attaching object. */ static void gv_attach(struct gv_softc *sc, struct gctl_req *req) { struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; off_t *offset; int *rename, type_child, type_parent; char *child, *parent; child = gctl_get_param(req, "child", NULL); if (child == NULL) { gctl_error(req, "no child given"); return; } parent = gctl_get_param(req, "parent", NULL); if (parent == NULL) { gctl_error(req, "no parent given"); return; } offset = gctl_get_paraml(req, "offset", sizeof(*offset)); if (offset == NULL) { gctl_error(req, "no offset given"); return; } rename = gctl_get_paraml(req, "rename", sizeof(*rename)); if (rename == NULL) { gctl_error(req, "no rename flag given"); return; } type_child = gv_object_type(sc, child); type_parent = gv_object_type(sc, parent); switch (type_child) { case GV_TYPE_PLEX: if (type_parent != GV_TYPE_VOL) { gctl_error(req, "no such volume to attach to"); return; } v = gv_find_vol(sc, parent); p = gv_find_plex(sc, child); gv_post_event(sc, GV_EVENT_ATTACH_PLEX, p, v, *offset, *rename); break; case GV_TYPE_SD: if (type_parent != GV_TYPE_PLEX) { gctl_error(req, "no such plex to attach to"); return; } p = gv_find_plex(sc, parent); s = gv_find_sd(sc, child); gv_post_event(sc, GV_EVENT_ATTACH_SD, s, p, *offset, *rename); break; default: gctl_error(req, "invalid child type"); break; } } /* Handle userland request of detaching object. 
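 * The request arrives through the class ctlreq path; gv_config() below
 * dispatches on the verb.  A hedged userland sketch using the stock
 * libgeom helpers, with the object name and flag value made up for
 * illustration:
 *
 *	struct gctl_req *req = gctl_get_handle();
 *	int flags = 0;
 *
 *	gctl_ro_param(req, "class", -1, "VINUM");
 *	gctl_ro_param(req, "verb", -1, "detach");
 *	gctl_ro_param(req, "object", -1, "myvol.p0");
 *	gctl_ro_param(req, "flags", sizeof(flags), &flags);
 *	gctl_issue(req);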
*/ static void gv_detach(struct gv_softc *sc, struct gctl_req *req) { struct gv_plex *p; struct gv_sd *s; int *flags, type; char *object; object = gctl_get_param(req, "object", NULL); if (object == NULL) { gctl_error(req, "no argument given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); type = gv_object_type(sc, object); switch (type) { case GV_TYPE_PLEX: p = gv_find_plex(sc, object); gv_post_event(sc, GV_EVENT_DETACH_PLEX, p, NULL, *flags, 0); break; case GV_TYPE_SD: s = gv_find_sd(sc, object); gv_post_event(sc, GV_EVENT_DETACH_SD, s, NULL, *flags, 0); break; default: gctl_error(req, "invalid object type"); break; } } /* Handle userland requests for creating new objects. */ static int gv_create(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_drive *d, *d2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; struct gv_volume *v, *v2; struct g_provider *pp; int error, i, *drives, *flags, *plexes, *subdisks, *volumes; char buf[20]; g_topology_assert(); sc = gp->softc; /* Find out how many of each object have been passed in. */ volumes = gctl_get_paraml(req, "volumes", sizeof(*volumes)); plexes = gctl_get_paraml(req, "plexes", sizeof(*plexes)); subdisks = gctl_get_paraml(req, "subdisks", sizeof(*subdisks)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (volumes == NULL || plexes == NULL || subdisks == NULL || drives == NULL) { gctl_error(req, "number of objects not given"); return (-1); } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "flags not given"); return (-1); } /* First, handle drive definitions ... */ for (i = 0; i < *drives; i++) { snprintf(buf, sizeof(buf), "drive%d", i); d2 = gctl_get_paraml(req, buf, sizeof(*d2)); if (d2 == NULL) { gctl_error(req, "no drive definition given"); return (-1); } /* * Make sure that the device specified in the drive config is * an active GEOM provider. */ pp = g_provider_by_name(d2->device); if (pp == NULL) { gctl_error(req, "%s: device not found", d2->device); goto error; } if (gv_find_drive(sc, d2->name) != NULL) { /* Ignore error. */ if (*flags & GV_FLAG_F) continue; gctl_error(req, "drive '%s' already exists", d2->name); goto error; } if (gv_find_drive_device(sc, d2->device) != NULL) { gctl_error(req, "device '%s' already configured in " "gvinum", d2->device); goto error; } d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); bcopy(d2, d, sizeof(*d)); gv_post_event(sc, GV_EVENT_CREATE_DRIVE, d, NULL, 0, 0); } /* ... then volume definitions ... */ for (i = 0; i < *volumes; i++) { error = 0; snprintf(buf, sizeof(buf), "volume%d", i); v2 = gctl_get_paraml(req, buf, sizeof(*v2)); if (v2 == NULL) { gctl_error(req, "no volume definition given"); return (-1); } if (gv_find_vol(sc, v2->name) != NULL) { /* Ignore error. */ if (*flags & GV_FLAG_F) continue; gctl_error(req, "volume '%s' already exists", v2->name); goto error; } v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); bcopy(v2, v, sizeof(*v)); gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); } /* ... then plex definitions ... */ for (i = 0; i < *plexes; i++) { error = 0; snprintf(buf, sizeof(buf), "plex%d", i); p2 = gctl_get_paraml(req, buf, sizeof(*p2)); if (p2 == NULL) { gctl_error(req, "no plex definition given"); return (-1); } if (gv_find_plex(sc, p2->name) != NULL) { /* Ignore error. 
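 * The force flag (GV_FLAG_F) demotes "already exists" from an error to
 * a silent skip here and in the other three loops, so re-submitting the
 * same create request is effectively idempotent.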
*/ if (*flags & GV_FLAG_F) continue; gctl_error(req, "plex '%s' already exists", p2->name); goto error; } p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); bcopy(p2, p, sizeof(*p)); gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); } /* ... and, finally, subdisk definitions. */ for (i = 0; i < *subdisks; i++) { error = 0; snprintf(buf, sizeof(buf), "sd%d", i); s2 = gctl_get_paraml(req, buf, sizeof(*s2)); if (s2 == NULL) { gctl_error(req, "no subdisk definition given"); return (-1); } if (gv_find_sd(sc, s2->name) != NULL) { /* Ignore error. */ if (*flags & GV_FLAG_F) continue; gctl_error(req, "sd '%s' already exists", s2->name); goto error; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); bcopy(s2, s, sizeof(*s)); gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } error: gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return (0); } static void gv_config(struct gctl_req *req, struct g_class *mp, char const *verb) { struct g_geom *gp; struct gv_softc *sc; struct sbuf *sb; char *comment; g_topology_assert(); gp = LIST_FIRST(&mp->geom); sc = gp->softc; if (!strcmp(verb, "attach")) { gv_attach(sc, req); } else if (!strcmp(verb, "concat")) { gv_concat(gp, req); } else if (!strcmp(verb, "detach")) { gv_detach(sc, req); } else if (!strcmp(verb, "list")) { gv_list(gp, req); /* Save our configuration back to disk. */ } else if (!strcmp(verb, "saveconfig")) { gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); /* Return configuration in string form. */ } else if (!strcmp(verb, "getconfig")) { comment = gctl_get_param(req, "comment", NULL); if (comment == NULL) { gctl_error(req, "no comment parameter given"); return; } sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); gv_format_config(sc, sb, 0, comment); sbuf_finish(sb); gctl_set_param(req, "config", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } else if (!strcmp(verb, "create")) { gv_create(gp, req); } else if (!strcmp(verb, "mirror")) { gv_mirror(gp, req); } else if (!strcmp(verb, "move")) { gv_move(gp, req); } else if (!strcmp(verb, "raid5")) { gv_raid5(gp, req); } else if (!strcmp(verb, "rebuildparity") || !strcmp(verb, "checkparity")) { gv_parityop(sc, req); } else if (!strcmp(verb, "remove")) { gv_remove(gp, req); } else if (!strcmp(verb, "rename")) { gv_rename(gp, req); } else if (!strcmp(verb, "resetconfig")) { gv_post_event(sc, GV_EVENT_RESET_CONFIG, sc, NULL, 0, 0); } else if (!strcmp(verb, "start")) { gv_start_obj(gp, req); } else if (!strcmp(verb, "stripe")) { gv_stripe(gp, req); } else if (!strcmp(verb, "setstate")) { gv_setstate(gp, req); } else gctl_error(req, "Unknown verb parameter"); } static void gv_parityop(struct gv_softc *sc, struct gctl_req *req) { struct gv_plex *p; int *flags, *rebuild, type; char *plex; plex = gctl_get_param(req, "plex", NULL); if (plex == NULL) { gctl_error(req, "no plex given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } rebuild = gctl_get_paraml(req, "rebuild", sizeof(*rebuild)); if (rebuild == NULL) { gctl_error(req, "no operation given"); return; } type = gv_object_type(sc, plex); if (type != GV_TYPE_PLEX) { gctl_error(req, "'%s' is not a plex", plex); return; } p = gv_find_plex(sc, plex); if (p->state != GV_PLEX_UP) { gctl_error(req, "plex %s is not completely accessible", p->name); return; } if (p->org != GV_PLEX_RAID5) { gctl_error(req, "plex %s is not a RAID5 plex", p->name); return; } /* Put it in the event queue. 
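 *
 * gv_post_event() is the producer half of this queue: it allocates a
 * gv_event, appends it to sc->equeue under equeue_mtx and wakeup()s the
 * worker, which runs the matching GV_EVENT_* case in gv_worker().  For
 * the rebuild below the handoff is roughly:
 *
 *	gv_post_event(sc, GV_EVENT_PARITY_REBUILD, p, NULL, 0, 0);
 *	(worker)  gv_parity_request(p, GV_BIO_CHECK | GV_BIO_PARITY, 0);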
*/ /* XXX: The state of the plex might have changed when this event is * picked up ... We should perhaps check this afterwards. */ if (*rebuild) gv_post_event(sc, GV_EVENT_PARITY_REBUILD, p, NULL, 0, 0); else gv_post_event(sc, GV_EVENT_PARITY_CHECK, p, NULL, 0, 0); } static struct g_geom * gv_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_geom *gp; struct g_consumer *cp; struct gv_softc *sc; struct gv_hdr vhdr; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "gv_taste(%s, %s)", mp->name, pp->name); gp = LIST_FIRST(&mp->geom); if (gp == NULL) { G_VINUM_DEBUG(0, "error: tasting, but not initialized?"); return (NULL); } sc = gp->softc; cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); return (NULL); } if (g_access(cp, 1, 0, 0) != 0) { g_detach(cp); g_destroy_consumer(cp); return (NULL); } g_topology_unlock(); error = gv_read_header(cp, &vhdr); g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); /* Check if what we've been given is a valid vinum drive. */ if (!error) gv_post_event(sc, GV_EVENT_DRIVE_TASTED, pp, NULL, 0, 0); return (NULL); } void gv_worker(void *arg) { struct g_provider *pp; struct gv_softc *sc; struct gv_event *ev; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; struct bio *bp; int newstate, flags, err, rename; char *newname; off_t offset; sc = arg; KASSERT(sc != NULL, ("NULL sc")); for (;;) { /* Look at the events first... */ ev = gv_get_event(sc); if (ev != NULL) { gv_remove_event(sc, ev); switch (ev->type) { case GV_EVENT_DRIVE_TASTED: G_VINUM_DEBUG(2, "event 'drive tasted'"); pp = ev->arg1; gv_drive_tasted(sc, pp); break; case GV_EVENT_DRIVE_LOST: G_VINUM_DEBUG(2, "event 'drive lost'"); d = ev->arg1; gv_drive_lost(sc, d); break; case GV_EVENT_CREATE_DRIVE: G_VINUM_DEBUG(2, "event 'create drive'"); d = ev->arg1; gv_create_drive(sc, d); break; case GV_EVENT_CREATE_VOLUME: G_VINUM_DEBUG(2, "event 'create volume'"); v = ev->arg1; gv_create_volume(sc, v); break; case GV_EVENT_CREATE_PLEX: G_VINUM_DEBUG(2, "event 'create plex'"); p = ev->arg1; gv_create_plex(sc, p); break; case GV_EVENT_CREATE_SD: G_VINUM_DEBUG(2, "event 'create sd'"); s = ev->arg1; gv_create_sd(sc, s); break; case GV_EVENT_RM_DRIVE: G_VINUM_DEBUG(2, "event 'remove drive'"); d = ev->arg1; flags = ev->arg3; gv_rm_drive(sc, d, flags); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_VOLUME: G_VINUM_DEBUG(2, "event 'remove volume'"); v = ev->arg1; gv_rm_vol(sc, v); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_PLEX: G_VINUM_DEBUG(2, "event 'remove plex'"); p = ev->arg1; gv_rm_plex(sc, p); /*gv_setup_objects(sc);*/ break; case GV_EVENT_RM_SD: G_VINUM_DEBUG(2, "event 'remove sd'"); s = ev->arg1; gv_rm_sd(sc, s); /*gv_setup_objects(sc);*/ break; case GV_EVENT_SAVE_CONFIG: G_VINUM_DEBUG(2, "event 'save config'"); gv_save_config(sc); break; case GV_EVENT_SET_SD_STATE: G_VINUM_DEBUG(2, "event 'setstate sd'"); s = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_sd_state(s, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting subdisk" " state: error code %d", err); break; case GV_EVENT_SET_DRIVE_STATE: G_VINUM_DEBUG(2, "event 'setstate drive'"); d = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_drive_state(d, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting drive " "state: error code %d", err); break; case GV_EVENT_SET_VOL_STATE: G_VINUM_DEBUG(2, "event 'setstate volume'"); v = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = 
gv_set_vol_state(v, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting volume " "state: error code %d", err); break; case GV_EVENT_SET_PLEX_STATE: G_VINUM_DEBUG(2, "event 'setstate plex'"); p = ev->arg1; newstate = ev->arg3; flags = ev->arg4; err = gv_set_plex_state(p, newstate, flags); if (err) G_VINUM_DEBUG(0, "error setting plex " "state: error code %d", err); break; case GV_EVENT_SETUP_OBJECTS: G_VINUM_DEBUG(2, "event 'setup objects'"); gv_setup_objects(sc); break; case GV_EVENT_RESET_CONFIG: G_VINUM_DEBUG(2, "event 'resetconfig'"); err = gv_resetconfig(sc); if (err) G_VINUM_DEBUG(0, "error resetting " "config: error code %d", err); break; case GV_EVENT_PARITY_REBUILD: /* * Start the rebuild. The gv_plex_done will * handle issuing of the remaining rebuild bio's * until it's finished. */ G_VINUM_DEBUG(2, "event 'rebuild'"); p = ev->arg1; if (p->state != GV_PLEX_UP) { G_VINUM_DEBUG(0, "plex %s is not " "completely accessible", p->name); break; } if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { G_VINUM_DEBUG(0, "plex %s is busy with " "syncing or parity build", p->name); break; } p->synced = 0; p->flags |= GV_PLEX_REBUILDING; g_topology_assert_not(); g_topology_lock(); err = gv_access(p->vol_sc->provider, 1, 1, 0); if (err) { G_VINUM_DEBUG(0, "unable to access " "provider"); break; } g_topology_unlock(); gv_parity_request(p, GV_BIO_CHECK | GV_BIO_PARITY, 0); break; case GV_EVENT_PARITY_CHECK: /* Start parity check. */ G_VINUM_DEBUG(2, "event 'check'"); p = ev->arg1; if (p->state != GV_PLEX_UP) { G_VINUM_DEBUG(0, "plex %s is not " "completely accessible", p->name); break; } if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { G_VINUM_DEBUG(0, "plex %s is busy with " "syncing or parity build", p->name); break; } p->synced = 0; g_topology_assert_not(); g_topology_lock(); err = gv_access(p->vol_sc->provider, 1, 1, 0); if (err) { G_VINUM_DEBUG(0, "unable to access " "provider"); break; } g_topology_unlock(); gv_parity_request(p, GV_BIO_CHECK, 0); break; case GV_EVENT_START_PLEX: G_VINUM_DEBUG(2, "event 'start' plex"); p = ev->arg1; gv_start_plex(p); break; case GV_EVENT_START_VOLUME: G_VINUM_DEBUG(2, "event 'start' volume"); v = ev->arg1; gv_start_vol(v); break; case GV_EVENT_ATTACH_PLEX: G_VINUM_DEBUG(2, "event 'attach' plex"); p = ev->arg1; v = ev->arg2; rename = ev->arg4; err = gv_attach_plex(p, v, rename); if (err) G_VINUM_DEBUG(0, "error attaching %s to" " %s: error code %d", p->name, v->name, err); break; case GV_EVENT_ATTACH_SD: G_VINUM_DEBUG(2, "event 'attach' sd"); s = ev->arg1; p = ev->arg2; offset = ev->arg3; rename = ev->arg4; err = gv_attach_sd(s, p, offset, rename); if (err) G_VINUM_DEBUG(0, "error attaching %s to" " %s: error code %d", s->name, p->name, err); break; case GV_EVENT_DETACH_PLEX: G_VINUM_DEBUG(2, "event 'detach' plex"); p = ev->arg1; flags = ev->arg3; err = gv_detach_plex(p, flags); if (err) G_VINUM_DEBUG(0, "error detaching %s: " "error code %d", p->name, err); break; case GV_EVENT_DETACH_SD: G_VINUM_DEBUG(2, "event 'detach' sd"); s = ev->arg1; flags = ev->arg3; err = gv_detach_sd(s, flags); if (err) G_VINUM_DEBUG(0, "error detaching %s: " "error code %d", s->name, err); break; case GV_EVENT_RENAME_VOL: G_VINUM_DEBUG(2, "event 'rename' volume"); v = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_vol(sc, v, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", v->name, newname, err); g_free(newname); /* Destroy and 
recreate the provider if we can. */ if (gv_provider_is_open(v->provider)) { G_VINUM_DEBUG(0, "unable to rename " "provider to %s: provider in use", v->name); break; } g_topology_lock(); g_wither_provider(v->provider, ENOENT); g_topology_unlock(); v->provider = NULL; gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); break; case GV_EVENT_RENAME_PLEX: G_VINUM_DEBUG(2, "event 'rename' plex"); p = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_plex(sc, p, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", p->name, newname, err); g_free(newname); break; case GV_EVENT_RENAME_SD: G_VINUM_DEBUG(2, "event 'rename' sd"); s = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_sd(sc, s, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", s->name, newname, err); g_free(newname); break; case GV_EVENT_RENAME_DRIVE: G_VINUM_DEBUG(2, "event 'rename' drive"); d = ev->arg1; newname = ev->arg2; flags = ev->arg3; err = gv_rename_drive(sc, d, newname, flags); if (err) G_VINUM_DEBUG(0, "error renaming %s to " "%s: error code %d", d->name, newname, err); g_free(newname); break; case GV_EVENT_MOVE_SD: G_VINUM_DEBUG(2, "event 'move' sd"); s = ev->arg1; d = ev->arg2; flags = ev->arg3; err = gv_move_sd(sc, s, d, flags); if (err) G_VINUM_DEBUG(0, "error moving %s to " "%s: error code %d", s->name, d->name, err); break; case GV_EVENT_THREAD_EXIT: G_VINUM_DEBUG(2, "event 'thread exit'"); g_free(ev); mtx_lock(&sc->equeue_mtx); mtx_lock(&sc->bqueue_mtx); gv_cleanup(sc); mtx_destroy(&sc->bqueue_mtx); mtx_destroy(&sc->equeue_mtx); g_free(sc->bqueue_down); g_free(sc->bqueue_up); g_free(sc); kproc_exit(0); /* NOTREACHED */ default: G_VINUM_DEBUG(1, "unknown event %d", ev->type); } g_free(ev); continue; } /* ... then do I/O processing. */ mtx_lock(&sc->bqueue_mtx); /* First do new requests. */ bp = bioq_takefirst(sc->bqueue_down); if (bp != NULL) { mtx_unlock(&sc->bqueue_mtx); /* A bio that interfered with another bio. */ if (bp->bio_pflags & GV_BIO_ONHOLD) { s = bp->bio_caller1; p = s->plex_sc; /* Is it still locked out? */ if (gv_stripe_active(p, bp)) { /* Park the bio on the waiting queue. */ bioq_disksort(p->wqueue, bp); } else { bp->bio_pflags &= ~GV_BIO_ONHOLD; g_io_request(bp, s->drive_sc->consumer); } /* A special request requiring special handling. */ } else if (bp->bio_pflags & GV_BIO_INTERNAL) { p = bp->bio_caller1; gv_plex_start(p, bp); } else { gv_volume_start(sc, bp); } mtx_lock(&sc->bqueue_mtx); } /* Then do completed requests. */ bp = bioq_takefirst(sc->bqueue_up); if (bp == NULL) { msleep(sc, &sc->bqueue_mtx, PRIBIO, "-", hz/10); mtx_unlock(&sc->bqueue_mtx); continue; } mtx_unlock(&sc->bqueue_mtx); gv_bio_done(sc, bp); } } #define VINUM_CLASS_NAME "VINUM" static struct g_class g_vinum_class = { .name = VINUM_CLASS_NAME, .version = G_VERSION, .init = gv_init, .taste = gv_taste, .ctlreq = gv_config, .destroy_geom = gv_unload, }; DECLARE_GEOM_CLASS(g_vinum_class, g_vinum); Index: head/sys/geom/vinum/geom_vinum.h =================================================================== --- head/sys/geom/vinum/geom_vinum.h (revision 326269) +++ head/sys/geom/vinum/geom_vinum.h (revision 326270) @@ -1,182 +1,184 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _GEOM_VINUM_H_ #define _GEOM_VINUM_H_ /* geom_vinum_create.c */ void gv_concat(struct g_geom *gp, struct gctl_req *); void gv_mirror(struct g_geom *gp, struct gctl_req *); void gv_stripe(struct g_geom *gp, struct gctl_req *); void gv_raid5(struct g_geom *gp, struct gctl_req *); int gv_create_drive(struct gv_softc *, struct gv_drive *); int gv_create_volume(struct gv_softc *, struct gv_volume *); int gv_create_plex(struct gv_softc *, struct gv_plex *); int gv_create_sd(struct gv_softc *, struct gv_sd *); /* geom_vinum_drive.c */ void gv_save_config(struct gv_softc *); int gv_read_header(struct g_consumer *, struct gv_hdr *); int gv_write_header(struct g_consumer *, struct gv_hdr *); /* geom_vinum_init.c */ void gv_start_obj(struct g_geom *, struct gctl_req *); int gv_start_plex(struct gv_plex *); int gv_start_vol(struct gv_volume *); /* geom_vinum_list.c */ void gv_ld(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_lp(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_ls(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_lv(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_list(struct g_geom *, struct gctl_req *); /* geom_vinum_move.c */ void gv_move(struct g_geom *, struct gctl_req *); int gv_move_sd(struct gv_softc *, struct gv_sd *, struct gv_drive *, int); /* geom_vinum_rename.c */ void gv_rename(struct g_geom *, struct gctl_req *); int gv_rename_drive(struct gv_softc *, struct gv_drive *, char *, int); int gv_rename_plex(struct gv_softc *, struct gv_plex *, char *, int); int gv_rename_sd(struct gv_softc *, struct gv_sd *, char *, int); int gv_rename_vol(struct gv_softc *, struct gv_volume *, char *, int); /* geom_vinum_rm.c */ void gv_remove(struct g_geom *, struct gctl_req *); int gv_resetconfig(struct gv_softc *); void gv_rm_sd(struct gv_softc *sc, struct gv_sd *s); void gv_rm_drive(struct gv_softc *, struct gv_drive *, int); void gv_rm_plex(struct gv_softc *, struct gv_plex *); void gv_rm_vol(struct gv_softc *, struct gv_volume *); /* geom_vinum_state.c */ int gv_sdstatemap(struct gv_plex *); void gv_setstate(struct g_geom *, struct gctl_req *); int gv_set_drive_state(struct gv_drive *, int, int); int gv_set_sd_state(struct gv_sd *, int, int); int gv_set_vol_state(struct gv_volume *, int, int); int gv_set_plex_state(struct gv_plex *, int, int); void gv_update_sd_state(struct gv_sd *); void gv_update_plex_state(struct gv_plex *); void 
gv_update_vol_state(struct gv_volume *); /* geom_vinum_subr.c */ void gv_adjust_freespace(struct gv_sd *, off_t); void gv_free_sd(struct gv_sd *); struct gv_drive *gv_find_drive(struct gv_softc *, char *); struct gv_drive *gv_find_drive_device(struct gv_softc *, char *); struct gv_plex *gv_find_plex(struct gv_softc *, char *); struct gv_sd *gv_find_sd(struct gv_softc *, char *); struct gv_volume *gv_find_vol(struct gv_softc *, char *); void gv_format_config(struct gv_softc *, struct sbuf *, int, char *); int gv_is_striped(struct gv_plex *); int gv_consumer_is_open(struct g_consumer *); int gv_provider_is_open(struct g_provider *); int gv_object_type(struct gv_softc *, char *); void gv_parse_config(struct gv_softc *, char *, struct gv_drive *); int gv_sd_to_drive(struct gv_sd *, struct gv_drive *); int gv_sd_to_plex(struct gv_sd *, struct gv_plex *); int gv_sdcount(struct gv_plex *, int); void gv_update_plex_config(struct gv_plex *); void gv_update_vol_size(struct gv_volume *, off_t); off_t gv_vol_size(struct gv_volume *); off_t gv_plex_size(struct gv_plex *); int gv_plexdown(struct gv_volume *); int gv_attach_plex(struct gv_plex *, struct gv_volume *, int); int gv_attach_sd(struct gv_sd *, struct gv_plex *, off_t, int); int gv_detach_plex(struct gv_plex *, int); int gv_detach_sd(struct gv_sd *, int); /* geom_vinum.c */ void gv_worker(void *); void gv_post_event(struct gv_softc *, int, void *, void *, intmax_t, intmax_t); void gv_worker_exit(struct gv_softc *); struct gv_event *gv_get_event(struct gv_softc *); void gv_remove_event(struct gv_softc *, struct gv_event *); void gv_drive_tasted(struct gv_softc *, struct g_provider *); void gv_drive_lost(struct gv_softc *, struct gv_drive *); void gv_setup_objects(struct gv_softc *); void gv_start(struct bio *); int gv_access(struct g_provider *, int, int, int); void gv_cleanup(struct gv_softc *); /* geom_vinum_volume.c */ void gv_done(struct bio *); void gv_volume_start(struct gv_softc *, struct bio *); void gv_volume_flush(struct gv_volume *); void gv_bio_done(struct gv_softc *, struct bio *); /* geom_vinum_plex.c */ void gv_plex_start(struct gv_plex *, struct bio *); void gv_plex_raid5_done(struct gv_plex *, struct bio *); void gv_plex_normal_done(struct gv_plex *, struct bio *); int gv_grow_request(struct gv_plex *, off_t, off_t, int, caddr_t); void gv_grow_complete(struct gv_plex *, struct bio *); void gv_init_request(struct gv_sd *, off_t, caddr_t, off_t); void gv_init_complete(struct gv_plex *, struct bio *); void gv_parity_request(struct gv_plex *, int, off_t); void gv_parity_complete(struct gv_plex *, struct bio *); void gv_rebuild_complete(struct gv_plex *, struct bio *); int gv_sync_request(struct gv_plex *, struct gv_plex *, off_t, off_t, int, caddr_t); int gv_sync_complete(struct gv_plex *, struct bio *); extern u_int g_vinum_debug; #define G_VINUM_DEBUG(lvl, ...) do { \ if (g_vinum_debug >= (lvl)) { \ printf("GEOM_VINUM"); \ if (g_vinum_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_VINUM_LOGREQ(lvl, bp, ...) 
do { \ if (g_vinum_debug >= (lvl)) { \ printf("GEOM_VINUM"); \ if (g_vinum_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) #endif /* !_GEOM_VINUM_H_ */ Index: head/sys/geom/vinum/geom_vinum_create.c =================================================================== --- head/sys/geom/vinum/geom_vinum_create.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_create.c (revision 326270) @@ -1,610 +1,612 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #define DEFAULT_STRIPESIZE 262144 /* * Create a new drive object, either by user request, during taste of the drive * itself, or because it was referenced by a subdisk during taste. */ int gv_create_drive(struct gv_softc *sc, struct gv_drive *d) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp, *cp2; struct gv_drive *d2; struct gv_hdr *hdr; struct gv_freelist *fl; KASSERT(d != NULL, ("gv_create_drive: NULL d")); gp = sc->geom; pp = NULL; cp = cp2 = NULL; /* The drive already has a consumer if it was tasted before. 
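 *
 * The three entry paths named in the function comment differ only in
 * where the consumer comes from: a tasted drive brings one along, a
 * user-created drive gets a fresh one attached below, and a merely
 * referenced drive gets none at all until the real disk is tasted.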
*/ if (d->consumer != NULL) { cp = d->consumer; cp->private = d; pp = cp->provider; } else if (!(d->flags & GV_DRIVE_REFERENCED)) { if (gv_find_drive(sc, d->name) != NULL) { G_VINUM_DEBUG(0, "drive '%s' already exists", d->name); g_free(d); return (GV_ERR_CREATE); } if (gv_find_drive_device(sc, d->device) != NULL) { G_VINUM_DEBUG(0, "provider '%s' already in use by " "gvinum", d->device); return (GV_ERR_CREATE); } pp = g_provider_by_name(d->device); if (pp == NULL) { G_VINUM_DEBUG(0, "create '%s': device '%s' disappeared", d->name, d->device); g_free(d); return (GV_ERR_CREATE); } g_topology_lock(); cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); g_topology_unlock(); G_VINUM_DEBUG(0, "create drive '%s': unable to attach", d->name); g_free(d); return (GV_ERR_CREATE); } g_topology_unlock(); d->consumer = cp; cp->private = d; } /* * If this was just a "referenced" drive, we're almost finished, but * insert this drive not on the head of the drives list, as * gv_drive_is_newer() expects a "real" drive from LIST_FIRST(). */ if (d->flags & GV_DRIVE_REFERENCED) { snprintf(d->device, sizeof(d->device), "???"); d2 = LIST_FIRST(&sc->drives); if (d2 == NULL) LIST_INSERT_HEAD(&sc->drives, d, drive); else LIST_INSERT_AFTER(d2, d, drive); return (0); } /* * Update access counts of the new drive to those of an already * existing drive. */ LIST_FOREACH(d2, &sc->drives, drive) { if ((d == d2) || (d2->consumer == NULL)) continue; cp2 = d2->consumer; g_topology_lock(); if ((cp2->acr || cp2->acw || cp2->ace) && (g_access(cp, cp2->acr, cp2->acw, cp2->ace) != 0)) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_VINUM_DEBUG(0, "create drive '%s': unable to update " "access counts", d->name); if (d->hdr != NULL) g_free(d->hdr); g_free(d); return (GV_ERR_CREATE); } g_topology_unlock(); break; } d->size = pp->mediasize - GV_DATA_START; d->avail = d->size; d->vinumconf = sc; LIST_INIT(&d->subdisks); LIST_INIT(&d->freelist); /* The header might have been set during taste. */ if (d->hdr == NULL) { hdr = g_malloc(sizeof(*hdr), M_WAITOK | M_ZERO); hdr->magic = GV_MAGIC; hdr->config_length = GV_CFG_LEN; getcredhostname(NULL, hdr->label.sysname, GV_HOSTNAME_LEN); strlcpy(hdr->label.name, d->name, sizeof(hdr->label.name)); microtime(&hdr->label.date_of_birth); d->hdr = hdr; } /* We also need a freelist entry. */ fl = g_malloc(sizeof(struct gv_freelist), M_WAITOK | M_ZERO); fl->offset = GV_DATA_START; fl->size = d->avail; LIST_INSERT_HEAD(&d->freelist, fl, freelist); d->freelist_entries = 1; if (gv_find_drive(sc, d->name) == NULL) LIST_INSERT_HEAD(&sc->drives, d, drive); gv_set_drive_state(d, GV_DRIVE_UP, 0); return (0); } int gv_create_volume(struct gv_softc *sc, struct gv_volume *v) { KASSERT(v != NULL, ("gv_create_volume: NULL v")); v->vinumconf = sc; v->flags |= GV_VOL_NEWBORN; LIST_INIT(&v->plexes); LIST_INSERT_HEAD(&sc->volumes, v, volume); v->wqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(v->wqueue); return (0); } int gv_create_plex(struct gv_softc *sc, struct gv_plex *p) { struct gv_volume *v; KASSERT(p != NULL, ("gv_create_plex: NULL p")); /* Find the volume this plex should be attached to. 
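 *
 * Note the flag distinction drawn below: every new plex is marked
 * GV_PLEX_NEWBORN, but a plex attached to a volume that already
 * existed is additionally marked GV_PLEX_ADDED.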
*/ v = gv_find_vol(sc, p->volume); if (v == NULL) { G_VINUM_DEBUG(0, "create plex '%s': volume '%s' not found", p->name, p->volume); g_free(p); return (GV_ERR_CREATE); } if (!(v->flags & GV_VOL_NEWBORN)) p->flags |= GV_PLEX_ADDED; p->vol_sc = v; v->plexcount++; p->vinumconf = sc; p->synced = 0; p->flags |= GV_PLEX_NEWBORN; LIST_INSERT_HEAD(&v->plexes, p, in_volume); LIST_INIT(&p->subdisks); TAILQ_INIT(&p->packets); LIST_INSERT_HEAD(&sc->plexes, p, plex); p->bqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(p->bqueue); p->wqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(p->wqueue); p->rqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(p->rqueue); return (0); } int gv_create_sd(struct gv_softc *sc, struct gv_sd *s) { struct gv_plex *p; struct gv_drive *d; KASSERT(s != NULL, ("gv_create_sd: NULL s")); /* Find the drive this subdisk should be put on. */ d = gv_find_drive(sc, s->drive); if (d == NULL) { /* * It's possible that the subdisk references a drive that * doesn't exist yet (during the taste process), so create a * practically empty "referenced" drive. */ if (s->flags & GV_SD_TASTED) { d = g_malloc(sizeof(struct gv_drive), M_WAITOK | M_ZERO); d->flags |= GV_DRIVE_REFERENCED; strlcpy(d->name, s->drive, sizeof(d->name)); gv_create_drive(sc, d); } else { G_VINUM_DEBUG(0, "create sd '%s': drive '%s' not found", s->name, s->drive); g_free(s); return (GV_ERR_CREATE); } } /* Find the plex this subdisk belongs to. */ p = gv_find_plex(sc, s->plex); if (p == NULL) { G_VINUM_DEBUG(0, "create sd '%s': plex '%s' not found", s->name, s->plex); g_free(s); return (GV_ERR_CREATE); } /* * First we give the subdisk to the drive, to handle autosized * values ... */ if (gv_sd_to_drive(s, d) != 0) { g_free(s); return (GV_ERR_CREATE); } /* * Then, we give the subdisk to the plex; we check if the * given values are correct and maybe adjust them. */ if (gv_sd_to_plex(s, p) != 0) { G_VINUM_DEBUG(0, "unable to give sd '%s' to plex '%s'", s->name, p->name); if (s->drive_sc && !(s->drive_sc->flags & GV_DRIVE_REFERENCED)) LIST_REMOVE(s, from_drive); gv_free_sd(s); g_free(s); /* * If this subdisk can't be created, we won't create * the attached plex either, if it is also a new one. */ if (!(p->flags & GV_PLEX_NEWBORN)) return (GV_ERR_CREATE); gv_rm_plex(sc, p); return (GV_ERR_CREATE); } s->flags |= GV_SD_NEWBORN; s->vinumconf = sc; LIST_INSERT_HEAD(&sc->subdisks, s, sd); return (0); } /* * Create a concatenated volume from specified drives or drivegroups. */ void gv_concat(struct g_geom *gp, struct gctl_req *req) { struct gv_drive *d; struct gv_sd *s; struct gv_volume *v; struct gv_plex *p; struct gv_softc *sc; char *drive, buf[30], *vol; int *drives, dcount; sc = gp->softc; dcount = 0; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* First we create the volume. */ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plex.
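 *
 * Objects are named after the volume, so a two-drive concat over a
 * hypothetical volume "myvol" builds the tree:
 *
 *	myvol			(volume)
 *	    myvol.p0		(concatenated plex)
 *		myvol.p0.s0	(subdisk on the first drive)
 *		myvol.p0.s1	(subdisk on the second drive)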
*/ p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); strlcpy(p->volume, v->name, sizeof(p->volume)); p->org = GV_PLEX_CONCAT; p->stripesize = 0; gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* Drives are first priority (right now). */ for (dcount = 0; dcount < *drives; dcount++) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s'", drive); continue; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, dcount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } /* * Create a mirrored volume from specified drives or drivegroups. */ void gv_mirror(struct g_geom *gp, struct gctl_req *req) { struct gv_drive *d; struct gv_sd *s; struct gv_volume *v; struct gv_plex *p; struct gv_softc *sc; char *drive, buf[30], *vol; int *drives, *flags, dcount, pcount, scount; sc = gp->softc; dcount = 0; scount = 0; pcount = 0; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* We must have an even number of drives. */ if (*drives % 2 != 0) { gctl_error(req, "mirror organization must have an even number " "of drives"); return; } if (*flags & GV_FLAG_S && *drives < 4) { gctl_error(req, "must have at least 4 drives for striped plex"); return; } /* First we create the volume. */ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plexes. */ for (pcount = 0; pcount < 2; pcount++) { p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, pcount); strlcpy(p->volume, v->name, sizeof(p->volume)); if (*flags & GV_FLAG_S) { p->org = GV_PLEX_STRIPED; p->stripesize = DEFAULT_STRIPESIZE; } else { p->org = GV_PLEX_CONCAT; p->stripesize = -1; } gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* * We just give each even drive to plex one, and each odd to * plex two.
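 *
 * E.g. with four drives d0..d3, plex p0 receives d0 and d2 and plex
 * p1 receives d1 and d3, since dcount starts at pcount and advances
 * in steps of two.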
*/ scount = 0; for (dcount = pcount; dcount < *drives; dcount += 2) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s', aborting", drive); scount++; break; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, scount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); scount++; } } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } void gv_raid5(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_drive *d; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; int *drives, *flags, dcount; char *vol, *drive, buf[30]; off_t *stripesize; sc = gp->softc; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize)); if (stripesize == NULL) { gctl_error(req, "no stripesize given"); return; } if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* We must have at least three drives. */ if (*drives < 3) { gctl_error(req, "must have at least three drives for this " "plex organisation"); return; } /* First we create the volume. */ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plex. */ p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); strlcpy(p->volume, v->name, sizeof(p->volume)); p->org = GV_PLEX_RAID5; p->stripesize = *stripesize; gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* Create subdisks on drives. */ for (dcount = 0; dcount < *drives; dcount++) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s'", drive); continue; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, dcount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } /* * Create a striped volume from specified drives or drivegroups. */ void gv_stripe(struct g_geom *gp, struct gctl_req *req) { struct gv_drive *d; struct gv_sd *s; struct gv_volume *v; struct gv_plex *p; struct gv_softc *sc; char *drive, buf[30], *vol; int *drives, *flags, dcount, pcount; sc = gp->softc; dcount = 0; pcount = 0; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* We must have at least two drives. */ if (*drives < 2) { gctl_error(req, "must have at least 2 drives"); return; } /* First we create the volume. 
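 *
 * The flow below mirrors gv_concat() and gv_raid5() above: one volume,
 * one striped plex with a fixed 256 KB stripe size, and one autosized
 * subdisk (size = -1) per drive.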
*/ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plex. */ p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); strlcpy(p->volume, v->name, sizeof(p->volume)); p->org = GV_PLEX_STRIPED; p->stripesize = 262144; gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* Create subdisks on drives. */ for (dcount = 0; dcount < *drives; dcount++) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s'", drive); continue; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, dcount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } Index: head/sys/geom/vinum/geom_vinum_drive.c =================================================================== --- head/sys/geom/vinum/geom_vinum_drive.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_drive.c (revision 326270) @@ -1,352 +1,354 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2005, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #define GV_LEGACY_I386 0 #define GV_LEGACY_AMD64 1 #define GV_LEGACY_SPARC64 2 #define GV_LEGACY_POWERPC 3 static int gv_legacy_header_type(uint8_t *, int); /* * Here are the "offset (size)" for the various struct gv_hdr fields, * for the legacy i386 (or 32-bit powerpc), legacy amd64 (or sparc64), and * current (cpu & endian agnostic) versions of the on-disk format of the vinum * header structure: * * i386 amd64 current field * -------- -------- -------- ----- * 0 ( 8) 0 ( 8) 0 ( 8) magic * 8 ( 4) 8 ( 8) 8 ( 8) config_length * 12 (32) 16 (32) 16 (32) label.sysname * 44 (32) 48 (32) 48 (32) label.name * 76 ( 4) 80 ( 8) 80 ( 8) label.date_of_birth.tv_sec * 80 ( 4) 88 ( 8) 88 ( 8) label.date_of_birth.tv_usec * 84 ( 4) 96 ( 8) 96 ( 8) label.last_update.tv_sec * 88 ( 4) 104 ( 8) 104 ( 8) label.last_update.tv_usec * 92 ( 8) 112 ( 8) 112 ( 8) label.drive_size * ======== ======== ======== * 100 120 120 total size * * NOTE: i386 and amd64 formats are stored as little-endian; the current * format uses big-endian (network order). */ /* Checks for legacy format depending on platform. */ static int gv_legacy_header_type(uint8_t *hdr, int bigendian) { uint32_t *i32; int arch_32, arch_64, i; /* Set arch according to endianness. */ if (bigendian) { arch_32 = GV_LEGACY_POWERPC; arch_64 = GV_LEGACY_SPARC64; } else { arch_32 = GV_LEGACY_I386; arch_64 = GV_LEGACY_AMD64; } /* if non-empty hostname overlaps 64-bit config_length */ i32 = (uint32_t *)(hdr + 12); if (*i32 != 0) return (arch_32); /* check for non-empty hostname */ if (hdr[16] != 0) return (arch_64); /* check bytes past 32-bit structure */ for (i = 100; i < 120; i++) if (hdr[i] != 0) return (arch_32); /* check for overlapping timestamp */ i32 = (uint32_t *)(hdr + 84); if (*i32 == 0) return (arch_64); return (arch_32); } /* * Read the header while taking magic number into account, and write it to * destination pointer. */ int gv_read_header(struct g_consumer *cp, struct gv_hdr *m_hdr) { struct g_provider *pp; uint64_t magic_machdep; uint8_t *d_hdr; int be, off; #define GV_GET32(endian) \ endian##32toh(*((uint32_t *)&d_hdr[off])); \ off += 4 #define GV_GET64(endian) \ endian##64toh(*((uint64_t *)&d_hdr[off])); \ off += 8 KASSERT(m_hdr != NULL, ("gv_read_header: null m_hdr")); KASSERT(cp != NULL, ("gv_read_header: null cp")); pp = cp->provider; KASSERT(pp != NULL, ("gv_read_header: null pp")); if ((GV_HDR_OFFSET % pp->sectorsize) != 0 || (GV_HDR_LEN % pp->sectorsize) != 0) return (ENODEV); d_hdr = g_read_data(cp, GV_HDR_OFFSET, pp->sectorsize, NULL); if (d_hdr == NULL) return (-1); off = 0; m_hdr->magic = GV_GET64(be); magic_machdep = *((uint64_t *)&d_hdr[0]); /* * The big endian machines will have a reverse of GV_OLD_MAGIC, so we * need to decide if we are running on a big endian machine as well as * checking the magic against the reverse of GV_OLD_MAGIC. 
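 *
 * Concretely: m_hdr->magic was decoded with be64toh() while
 * magic_machdep holds the raw in-memory load, so the two compare equal
 * exactly when the host itself is big endian; the legacy variants are
 * then told apart by the heuristics in gv_legacy_header_type() above.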
*/ be = (m_hdr->magic == magic_machdep); if (m_hdr->magic == GV_MAGIC) { m_hdr->config_length = GV_GET64(be); off = 16; bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(be); m_hdr->label.date_of_birth.tv_usec = GV_GET64(be); m_hdr->label.last_update.tv_sec = GV_GET64(be); m_hdr->label.last_update.tv_usec = GV_GET64(be); m_hdr->label.drive_size = GV_GET64(be); } else if (m_hdr->magic != GV_OLD_MAGIC && m_hdr->magic != le64toh(GV_OLD_MAGIC)) { /* Not a gvinum drive. */ g_free(d_hdr); return (-1); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_SPARC64) { G_VINUM_DEBUG(1, "detected legacy sparc64 header"); m_hdr->magic = GV_MAGIC; /* Legacy sparc64 on-disk header */ m_hdr->config_length = GV_GET64(be); bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(be); m_hdr->label.date_of_birth.tv_usec = GV_GET64(be); m_hdr->label.last_update.tv_sec = GV_GET64(be); m_hdr->label.last_update.tv_usec = GV_GET64(be); m_hdr->label.drive_size = GV_GET64(be); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_POWERPC) { G_VINUM_DEBUG(1, "detected legacy PowerPC header"); m_hdr->magic = GV_MAGIC; /* legacy 32-bit big endian on-disk header */ m_hdr->config_length = GV_GET32(be); bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET32(be); m_hdr->label.date_of_birth.tv_usec = GV_GET32(be); m_hdr->label.last_update.tv_sec = GV_GET32(be); m_hdr->label.last_update.tv_usec = GV_GET32(be); m_hdr->label.drive_size = GV_GET64(be); } else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_I386) { G_VINUM_DEBUG(1, "detected legacy i386 header"); m_hdr->magic = GV_MAGIC; /* legacy i386 on-disk header */ m_hdr->config_length = GV_GET32(le); bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET32(le); m_hdr->label.date_of_birth.tv_usec = GV_GET32(le); m_hdr->label.last_update.tv_sec = GV_GET32(le); m_hdr->label.last_update.tv_usec = GV_GET32(le); m_hdr->label.drive_size = GV_GET64(le); } else { G_VINUM_DEBUG(1, "detected legacy amd64 header"); m_hdr->magic = GV_MAGIC; /* legacy amd64 on-disk header */ m_hdr->config_length = GV_GET64(le); bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; m_hdr->label.date_of_birth.tv_sec = GV_GET64(le); m_hdr->label.date_of_birth.tv_usec = GV_GET64(le); m_hdr->label.last_update.tv_sec = GV_GET64(le); m_hdr->label.last_update.tv_usec = GV_GET64(le); m_hdr->label.drive_size = GV_GET64(le); } g_free(d_hdr); return (0); } /* Write out the gvinum header. 
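 * Unlike the legacy readers above, only the current big-endian,
 * CPU-agnostic layout is ever written back; the GV_SET64BE macro below
 * stores each 64-bit field with htobe64().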
*/ int gv_write_header(struct g_consumer *cp, struct gv_hdr *m_hdr) { uint8_t d_hdr[GV_HDR_LEN]; int off, ret; #define GV_SET64BE(field) \ do { \ *((uint64_t *)&d_hdr[off]) = htobe64(field); \ off += 8; \ } while (0) KASSERT(m_hdr != NULL, ("gv_write_header: null m_hdr")); off = 0; memset(d_hdr, 0, GV_HDR_LEN); GV_SET64BE(m_hdr->magic); GV_SET64BE(m_hdr->config_length); off = 16; bcopy(m_hdr->label.sysname, d_hdr + off, GV_HOSTNAME_LEN); off += GV_HOSTNAME_LEN; bcopy(m_hdr->label.name, d_hdr + off, GV_MAXDRIVENAME); off += GV_MAXDRIVENAME; GV_SET64BE(m_hdr->label.date_of_birth.tv_sec); GV_SET64BE(m_hdr->label.date_of_birth.tv_usec); GV_SET64BE(m_hdr->label.last_update.tv_sec); GV_SET64BE(m_hdr->label.last_update.tv_usec); GV_SET64BE(m_hdr->label.drive_size); ret = g_write_data(cp, GV_HDR_OFFSET, d_hdr, GV_HDR_LEN); return (ret); } /* Save the vinum configuration back to each involved disk. */ void gv_save_config(struct gv_softc *sc) { struct g_consumer *cp; struct gv_drive *d; struct gv_hdr *vhdr, *hdr; struct sbuf *sb; struct timeval last_update; int error; KASSERT(sc != NULL, ("gv_save_config: null sc")); vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO); vhdr->magic = GV_MAGIC; vhdr->config_length = GV_CFG_LEN; microtime(&last_update); sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); gv_format_config(sc, sb, 1, NULL); sbuf_finish(sb); LIST_FOREACH(d, &sc->drives, drive) { /* * We can't save the config on a drive that isn't up, but * drives that were just created aren't officially up yet, so * we check a special flag. */ if (d->state != GV_DRIVE_UP) continue; cp = d->consumer; if (cp == NULL) { G_VINUM_DEBUG(0, "drive '%s' has no consumer!", d->name); continue; } hdr = d->hdr; if (hdr == NULL) { G_VINUM_DEBUG(0, "drive '%s' has no header", d->name); g_free(vhdr); continue; } bcopy(&last_update, &hdr->label.last_update, sizeof(struct timeval)); bcopy(&hdr->label, &vhdr->label, sizeof(struct gv_label)); g_topology_lock(); error = g_access(cp, 0, 1, 0); if (error) { G_VINUM_DEBUG(0, "g_access failed on " "drive %s, errno %d", d->name, error); g_topology_unlock(); continue; } g_topology_unlock(); error = gv_write_header(cp, vhdr); if (error) { G_VINUM_DEBUG(0, "writing vhdr failed on drive %s, " "errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); continue; } /* First config copy. */ error = g_write_data(cp, GV_CFG_OFFSET, sbuf_data(sb), GV_CFG_LEN); if (error) { G_VINUM_DEBUG(0, "writing first config copy failed on " "drive %s, errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); continue; } /* Second config copy. */ error = g_write_data(cp, GV_CFG_OFFSET + GV_CFG_LEN, sbuf_data(sb), GV_CFG_LEN); if (error) G_VINUM_DEBUG(0, "writing second config copy failed on " "drive %s, errno %d", d->name, error); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); } sbuf_delete(sb); g_free(vhdr); } Index: head/sys/geom/vinum/geom_vinum_events.c =================================================================== --- head/sys/geom/vinum/geom_vinum_events.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_events.c (revision 326270) @@ -1,260 +1,262 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include void gv_post_event(struct gv_softc *sc, int event, void *arg1, void *arg2, intmax_t arg3, intmax_t arg4) { struct gv_event *ev; ev = g_malloc(sizeof(*ev), M_WAITOK | M_ZERO); ev->type = event; ev->arg1 = arg1; ev->arg2 = arg2; ev->arg3 = arg3; ev->arg4 = arg4; mtx_lock(&sc->equeue_mtx); TAILQ_INSERT_TAIL(&sc->equeue, ev, events); wakeup(sc); mtx_unlock(&sc->equeue_mtx); } void gv_worker_exit(struct gv_softc *sc) { struct gv_event *ev; ev = g_malloc(sizeof(*ev), M_WAITOK | M_ZERO); ev->type = GV_EVENT_THREAD_EXIT; mtx_lock(&sc->equeue_mtx); TAILQ_INSERT_TAIL(&sc->equeue, ev, events); wakeup(sc); msleep(sc->worker, &sc->equeue_mtx, PDROP, "gv_wor", 0); } struct gv_event * gv_get_event(struct gv_softc *sc) { struct gv_event *ev; KASSERT(sc != NULL, ("NULL sc")); mtx_lock(&sc->equeue_mtx); ev = TAILQ_FIRST(&sc->equeue); mtx_unlock(&sc->equeue_mtx); return (ev); } void gv_remove_event(struct gv_softc *sc, struct gv_event *ev) { KASSERT(sc != NULL, ("NULL sc")); KASSERT(ev != NULL, ("NULL ev")); mtx_lock(&sc->equeue_mtx); TAILQ_REMOVE(&sc->equeue, ev, events); mtx_unlock(&sc->equeue_mtx); } void gv_drive_tasted(struct gv_softc *sc, struct g_provider *pp) { struct g_geom *gp; struct g_consumer *cp; struct gv_hdr *hdr; struct gv_drive *d; char *buf; int error; hdr = NULL; buf = NULL; G_VINUM_DEBUG(2, "tasted drive on '%s'", pp->name); if ((GV_CFG_OFFSET % pp->sectorsize) != 0 || (GV_CFG_LEN % pp->sectorsize) != 0) { G_VINUM_DEBUG(0, "provider %s has unsupported sectorsize.", pp->name); return; } gp = sc->geom; g_topology_lock(); cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); g_topology_unlock(); G_VINUM_DEBUG(0, "failed to attach to provider on taste event"); return; } if (g_access(cp, 1, 0, 0) != 0) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_VINUM_DEBUG(0, "failed to access consumer on taste event"); return; } g_topology_unlock(); hdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO); /* Read header and on-disk configuration. */ error = gv_read_header(cp, hdr); if (error) { G_VINUM_DEBUG(0, "failed to read header during taste"); goto failed; } /* * Setup the drive before we parse the on-disk configuration, so that * we already know about the drive then. 
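 * A drive that so far was only "referenced" by the configuration found on
 * some other disk is completed here as well, once its provider shows up.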
*/ d = gv_find_drive(sc, hdr->label.name); if (d == NULL) { d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); strlcpy(d->name, hdr->label.name, sizeof(d->name)); strlcpy(d->device, pp->name, sizeof(d->device)); } else if (d->flags & GV_DRIVE_REFERENCED) { strlcpy(d->device, pp->name, sizeof(d->device)); d->flags &= ~GV_DRIVE_REFERENCED; } else { G_VINUM_DEBUG(2, "drive '%s' is already known", d->name); goto failed; } /* Add the consumer and header to the new drive. */ d->consumer = cp; d->hdr = hdr; gv_create_drive(sc, d); buf = g_read_data(cp, GV_CFG_OFFSET, GV_CFG_LEN, NULL); if (buf == NULL) { G_VINUM_DEBUG(0, "failed to read config during taste"); goto failed; } gv_parse_config(sc, buf, d); g_free(buf); g_topology_lock(); g_access(cp, -1, 0, 0); g_topology_unlock(); gv_setup_objects(sc); gv_set_drive_state(d, GV_DRIVE_UP, 0); return; failed: if (hdr != NULL) g_free(hdr); g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); } /* * When losing a drive (e.g. hardware failure), we cut down the consumer * attached to the underlying device and bring the drive itself to a * "referenced" state so that normal tasting could bring it up cleanly if it * possibly arrives again. */ void gv_drive_lost(struct gv_softc *sc, struct gv_drive *d) { struct g_consumer *cp; struct gv_drive *d2; struct gv_sd *s, *s2; struct gv_freelist *fl, *fl2; gv_set_drive_state(d, GV_DRIVE_DOWN, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); cp = d->consumer; if (cp != NULL) { if (cp->nstart != cp->nend) { G_VINUM_DEBUG(0, "dead drive '%s' has still active " "requests, unable to detach consumer", d->name); gv_post_event(sc, GV_EVENT_DRIVE_LOST, d, NULL, 0, 0); return; } g_topology_lock(); if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); } LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { LIST_REMOVE(fl, freelist); g_free(fl); } d->consumer = NULL; g_free(d->hdr); d->hdr = NULL; d->flags |= GV_DRIVE_REFERENCED; snprintf(d->device, sizeof(d->device), "???"); d->size = 0; d->avail = 0; d->freelist_entries = 0; d->sdcount = 0; /* Put the subdisk in tasted mode, and remove from drive list. */ LIST_FOREACH_SAFE(s, &d->subdisks, from_drive, s2) { LIST_REMOVE(s, from_drive); s->flags |= GV_SD_TASTED; } /* * Don't forget that gv_is_newer wants a "real" drive at the beginning * of the list, so, just to be safe, we shuffle around. */ LIST_REMOVE(d, drive); d2 = LIST_FIRST(&sc->drives); if (d2 == NULL) LIST_INSERT_HEAD(&sc->drives, d, drive); else LIST_INSERT_AFTER(d2, d, drive); gv_save_config(sc); } Index: head/sys/geom/vinum/geom_vinum_init.c =================================================================== --- head/sys/geom/vinum/geom_vinum_init.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_init.c (revision 326270) @@ -1,388 +1,390 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include static int gv_sync(struct gv_volume *); static int gv_rebuild_plex(struct gv_plex *); static int gv_init_plex(struct gv_plex *); static int gv_grow_plex(struct gv_plex *); static int gv_sync_plex(struct gv_plex *, struct gv_plex *); static struct gv_plex *gv_find_good_plex(struct gv_volume *); void gv_start_obj(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_volume *v; struct gv_plex *p; int *argc, *initsize; char *argv, buf[20]; int i, type; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); initsize = gctl_get_paraml(req, "initsize", sizeof(*initsize)); if (argc == NULL || *argc == 0) { gctl_error(req, "no arguments given"); return; } sc = gp->softc; for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); argv = gctl_get_param(req, buf, NULL); if (argv == NULL) continue; type = gv_object_type(sc, argv); switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, argv); if (v != NULL) gv_post_event(sc, GV_EVENT_START_VOLUME, v, NULL, *initsize, 0); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, argv); if (p != NULL) gv_post_event(sc, GV_EVENT_START_PLEX, p, NULL, *initsize, 0); break; case GV_TYPE_SD: case GV_TYPE_DRIVE: /* XXX Not implemented, but what is the use? */ gctl_error(req, "unable to start '%s' - not yet supported", argv); return; default: gctl_error(req, "unknown object '%s'", argv); return; } } } int gv_start_plex(struct gv_plex *p) { struct gv_volume *v; struct gv_plex *up; struct gv_sd *s; int error; KASSERT(p != NULL, ("gv_start_plex: NULL p")); error = 0; v = p->vol_sc; /* RAID5 plexes can either be init, rebuilt or grown. */ if (p->org == GV_PLEX_RAID5) { if (p->state > GV_PLEX_DEGRADED) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { error = gv_grow_plex(p); return (error); } } } else if (p->state == GV_PLEX_DEGRADED) { error = gv_rebuild_plex(p); } else error = gv_init_plex(p); } else { /* We want to sync from the other plex if we're down. */ if (p->state == GV_PLEX_DOWN && v->plexcount > 1) { up = gv_find_good_plex(v); if (up == NULL) { G_VINUM_DEBUG(1, "unable to find a good plex"); return (ENXIO); } g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); if (error) { g_topology_unlock(); G_VINUM_DEBUG(0, "sync from '%s' failed to " "access volume: %d", up->name, error); return (error); } g_topology_unlock(); error = gv_sync_plex(p, up); if (error) return (error); /* * In case we have a stripe that is up, check whether it can be * grown. 
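 * A growing striped plex is recognizable by subdisks carrying the
 * GV_SD_GROW flag; finding one is enough to kick off gv_grow_plex().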
*/ } else if (p->org == GV_PLEX_STRIPED && p->state != GV_PLEX_DOWN) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { error = gv_grow_plex(p); break; } } } } return (error); } int gv_start_vol(struct gv_volume *v) { struct gv_plex *p; int error; KASSERT(v != NULL, ("gv_start_vol: NULL v")); error = 0; if (v->plexcount == 0) return (ENXIO); else if (v->plexcount == 1) { p = LIST_FIRST(&v->plexes); KASSERT(p != NULL, ("gv_start_vol: NULL p on %s", v->name)); error = gv_start_plex(p); } else error = gv_sync(v); return (error); } /* Sync a plex p from the plex up. */ static int gv_sync_plex(struct gv_plex *p, struct gv_plex *up) { int error; KASSERT(p != NULL, ("%s: NULL p", __func__)); KASSERT(up != NULL, ("%s: NULL up", __func__)); if ((p == up) || (p->state == GV_PLEX_UP)) return (0); if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { return (EINPROGRESS); } p->synced = 0; p->flags |= GV_PLEX_SYNCING; G_VINUM_DEBUG(1, "starting sync of plex %s", p->name); error = gv_sync_request(up, p, p->synced, MIN(GV_DFLT_SYNCSIZE, up->size - p->synced), BIO_READ, NULL); if (error) { G_VINUM_DEBUG(0, "error syncing plex %s", p->name); return (error); } return (0); } /* Return a good plex from volume v. */ static struct gv_plex * gv_find_good_plex(struct gv_volume *v) { struct gv_plex *up; /* Find the plex that's up. */ up = NULL; LIST_FOREACH(up, &v->plexes, in_volume) { if (up->state == GV_PLEX_UP) break; } /* Didn't find a good plex. */ return (up); } static int gv_sync(struct gv_volume *v) { struct gv_softc *sc; struct gv_plex *p, *up; int error; KASSERT(v != NULL, ("gv_sync: NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("gv_sync: NULL sc on %s", v->name)); up = gv_find_good_plex(v); if (up == NULL) return (ENXIO); g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); if (error) { g_topology_unlock(); G_VINUM_DEBUG(0, "sync from '%s' failed to access volume: %d", up->name, error); return (error); } g_topology_unlock(); /* Go through the good plex, and issue BIO's to all other plexes. */ LIST_FOREACH(p, &v->plexes, in_volume) { error = gv_sync_plex(p, up); if (error) break; } return (0); } static int gv_rebuild_plex(struct gv_plex *p) { struct gv_drive *d; struct gv_sd *s; int error; if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) return (EINPROGRESS); /* * Make sure that all subdisks have consumers. We won't allow a rebuild * unless every subdisk have one. 
*/ LIST_FOREACH(s, &p->subdisks, in_plex) { d = s->drive_sc; if (d == NULL || (d->flags & GV_DRIVE_REFERENCED)) { G_VINUM_DEBUG(0, "unable to rebuild %s, subdisk(s) have" " no drives", p->name); return (ENXIO); } } p->flags |= GV_PLEX_REBUILDING; p->synced = 0; g_topology_assert_not(); g_topology_lock(); error = gv_access(p->vol_sc->provider, 1, 1, 0); if (error) { G_VINUM_DEBUG(0, "unable to access provider"); return (0); } g_topology_unlock(); gv_parity_request(p, GV_BIO_REBUILD, 0); return (0); } static int gv_grow_plex(struct gv_plex *p) { struct gv_volume *v; struct gv_sd *s; off_t origsize, origlength; int error, sdcount; KASSERT(p != NULL, ("gv_grow_plex: NULL p")); v = p->vol_sc; KASSERT(v != NULL, ("gv_grow_plex: NULL v")); if (p->flags & GV_PLEX_GROWING || p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING) return (EINPROGRESS); g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); g_topology_unlock(); if (error) { G_VINUM_DEBUG(0, "unable to access provider"); return (error); } /* XXX: This routine with finding origsize is used two other places as * well, so we should create a function for it. */ sdcount = p->sdcount; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } s = LIST_FIRST(&p->subdisks); if (s == NULL) { G_VINUM_DEBUG(0, "error growing plex without subdisks"); return (GV_ERR_NOTFOUND); } p->flags |= GV_PLEX_GROWING; origsize = (sdcount - 1) * s->size; origlength = (sdcount - 1) * p->stripesize; p->synced = 0; G_VINUM_DEBUG(1, "starting growing of plex %s", p->name); gv_grow_request(p, 0, MIN(origlength, origsize), BIO_READ, NULL); return (0); } static int gv_init_plex(struct gv_plex *p) { struct gv_drive *d; struct gv_sd *s; int error; off_t start; caddr_t data; KASSERT(p != NULL, ("gv_init_plex: NULL p")); LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state == GV_SD_INITIALIZING) return (EINPROGRESS); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); s->init_size = GV_DFLT_SYNCSIZE; start = s->drive_offset + s->initialized; d = s->drive_sc; if (d == NULL) { G_VINUM_DEBUG(0, "subdisk %s has no drive yet", s->name); break; } /* * Take the lock here since we need to avoid a race in * gv_init_request if the BIO is completed before the lock is * released. */ g_topology_lock(); error = g_access(d->consumer, 0, 1, 0); g_topology_unlock(); if (error) { G_VINUM_DEBUG(0, "error accessing consumer when " "initializing %s", s->name); break; } data = g_malloc(s->init_size, M_WAITOK | M_ZERO); gv_init_request(s, start, data, s->init_size); } return (0); } Index: head/sys/geom/vinum/geom_vinum_list.c =================================================================== --- head/sys/geom/vinum/geom_vinum_list.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_list.c (revision 326270) @@ -1,501 +1,503 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include void gv_lvi(struct gv_volume *, struct sbuf *, int); void gv_lpi(struct gv_plex *, struct sbuf *, int); void gv_lsi(struct gv_sd *, struct sbuf *, int); void gv_ldi(struct gv_drive *, struct sbuf *, int); void gv_list(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_drive *d; struct gv_plex *p; struct gv_sd *s; struct gv_volume *v; struct sbuf *sb; int *argc, i, *flags, type; char *arg, buf[20], *cmd; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); if (argc == NULL) { gctl_error(req, "no arguments given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } sc = gp->softc; sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); /* Figure out which command was given. */ cmd = gctl_get_param(req, "cmd", NULL); if (cmd == NULL) { gctl_error(req, "no command given"); return; } /* List specific objects or everything. */ if (!strcmp(cmd, "list") || !strcmp(cmd, "l")) { if (*argc) { for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); arg = gctl_get_param(req, buf, NULL); if (arg == NULL) continue; type = gv_object_type(sc, arg); switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, arg); gv_lvi(v, sb, *flags); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, arg); gv_lpi(p, sb, *flags); break; case GV_TYPE_SD: s = gv_find_sd(sc, arg); gv_lsi(s, sb, *flags); break; case GV_TYPE_DRIVE: d = gv_find_drive(sc, arg); gv_ldi(d, sb, *flags); break; default: gctl_error(req, "unknown object '%s'", arg); break; } } } else { gv_ld(gp, req, sb); sbuf_printf(sb, "\n"); gv_lv(gp, req, sb); sbuf_printf(sb, "\n"); gv_lp(gp, req, sb); sbuf_printf(sb, "\n"); gv_ls(gp, req, sb); } /* List drives. */ } else if (!strcmp(cmd, "ld")) { if (*argc) { for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); arg = gctl_get_param(req, buf, NULL); if (arg == NULL) continue; type = gv_object_type(sc, arg); if (type != GV_TYPE_DRIVE) { gctl_error(req, "'%s' is not a drive", arg); continue; } else { d = gv_find_drive(sc, arg); gv_ldi(d, sb, *flags); } } } else gv_ld(gp, req, sb); /* List volumes. */ } else if (!strcmp(cmd, "lv")) { if (*argc) { for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); arg = gctl_get_param(req, buf, NULL); if (arg == NULL) continue; type = gv_object_type(sc, arg); if (type != GV_TYPE_VOL) { gctl_error(req, "'%s' is not a volume", arg); continue; } else { v = gv_find_vol(sc, arg); gv_lvi(v, sb, *flags); } } } else gv_lv(gp, req, sb); /* List plexes. 
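 * As with the other subcommands, no arguments means "list all of them";
 * otherwise only the plexes named on the command line are shown.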
*/ } else if (!strcmp(cmd, "lp")) { if (*argc) { for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); arg = gctl_get_param(req, buf, NULL); if (arg == NULL) continue; type = gv_object_type(sc, arg); if (type != GV_TYPE_PLEX) { gctl_error(req, "'%s' is not a plex", arg); continue; } else { p = gv_find_plex(sc, arg); gv_lpi(p, sb, *flags); } } } else gv_lp(gp, req, sb); /* List subdisks. */ } else if (!strcmp(cmd, "ls")) { if (*argc) { for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); arg = gctl_get_param(req, buf, NULL); if (arg == NULL) continue; type = gv_object_type(sc, arg); if (type != GV_TYPE_SD) { gctl_error(req, "'%s' is not a subdisk", arg); continue; } else { s = gv_find_sd(sc, arg); gv_lsi(s, sb, *flags); } } } else gv_ls(gp, req, sb); } else gctl_error(req, "unknown command '%s'", cmd); sbuf_finish(sb); gctl_set_param(req, "config", sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); } /* List one or more volumes. */ void gv_lv(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) { struct gv_softc *sc; struct gv_volume *v; int i, *flags; sc = gp->softc; i = 0; LIST_FOREACH(v, &sc->volumes, volume) i++; sbuf_printf(sb, "%d volume%s:\n", i, i == 1 ? "" : "s"); if (i) { flags = gctl_get_paraml(req, "flags", sizeof(*flags)); LIST_FOREACH(v, &sc->volumes, volume) gv_lvi(v, sb, *flags); } } /* List a single volume. */ void gv_lvi(struct gv_volume *v, struct sbuf *sb, int flags) { struct gv_plex *p; int i; if (flags & GV_FLAG_V) { sbuf_printf(sb, "Volume %s:\tSize: %jd bytes (%jd MB)\n", v->name, (intmax_t)v->size, (intmax_t)v->size / MEGABYTE); sbuf_printf(sb, "\t\tState: %s\n", gv_volstate(v->state)); } else { sbuf_printf(sb, "V %-21s State: %s\tPlexes: %7d\tSize: %s\n", v->name, gv_volstate(v->state), v->plexcount, gv_roughlength(v->size, 0)); } if (flags & GV_FLAG_VV) { i = 0; LIST_FOREACH(p, &v->plexes, in_volume) { sbuf_printf(sb, "\t\tPlex %2d:\t%s\t(%s), %s\n", i, p->name, gv_plexstate(p->state), gv_roughlength(p->size, 0)); i++; } } if (flags & GV_FLAG_R) { LIST_FOREACH(p, &v->plexes, in_volume) gv_lpi(p, sb, flags); } } /* List one or more plexes. */ void gv_lp(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) { struct gv_softc *sc; struct gv_plex *p; int i, *flags; sc = gp->softc; i = 0; LIST_FOREACH(p, &sc->plexes, plex) i++; sbuf_printf(sb, "%d plex%s:\n", i, i == 1 ? "" : "es"); if (i) { flags = gctl_get_paraml(req, "flags", sizeof(*flags)); LIST_FOREACH(p, &sc->plexes, plex) gv_lpi(p, sb, *flags); } } /* List a single plex. */ void gv_lpi(struct gv_plex *p, struct sbuf *sb, int flags) { struct gv_sd *s; int i; if (flags & GV_FLAG_V) { sbuf_printf(sb, "Plex %s:\tSize:\t%9jd bytes (%jd MB)\n", p->name, (intmax_t)p->size, (intmax_t)p->size / MEGABYTE); sbuf_printf(sb, "\t\tSubdisks: %8d\n", p->sdcount); sbuf_printf(sb, "\t\tState: %s\n", gv_plexstate(p->state)); if ((p->flags & GV_PLEX_SYNCING) || (p->flags & GV_PLEX_GROWING) || (p->flags & GV_PLEX_REBUILDING)) { sbuf_printf(sb, "\t\tSynced: "); sbuf_printf(sb, "%16jd bytes (%d%%)\n", (intmax_t)p->synced, (p->size > 0) ? 
(int)((p->synced * 100) / p->size) : 0); } sbuf_printf(sb, "\t\tOrganization: %s", gv_plexorg(p->org)); if (gv_is_striped(p)) { sbuf_printf(sb, "\tStripe size: %s\n", gv_roughlength(p->stripesize, 1)); } sbuf_printf(sb, "\t\tFlags: %d\n", p->flags); if (p->vol_sc != NULL) { sbuf_printf(sb, "\t\tPart of volume %s\n", p->volume); } } else { sbuf_printf(sb, "P %-18s %2s State: ", p->name, gv_plexorg_short(p->org)); if ((p->flags & GV_PLEX_SYNCING) || (p->flags & GV_PLEX_GROWING) || (p->flags & GV_PLEX_REBUILDING)) { sbuf_printf(sb, "S %d%%\t", (int)((p->synced * 100) / p->size)); } else { sbuf_printf(sb, "%s\t", gv_plexstate(p->state)); } sbuf_printf(sb, "Subdisks: %5d\tSize: %s\n", p->sdcount, gv_roughlength(p->size, 0)); } if (flags & GV_FLAG_VV) { i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { sbuf_printf(sb, "\t\tSubdisk %d:\t%s\n", i, s->name); sbuf_printf(sb, "\t\t state: %s\tsize %11jd " "(%jd MB)\n", gv_sdstate(s->state), (intmax_t)s->size, (intmax_t)s->size / MEGABYTE); if (p->org == GV_PLEX_CONCAT) { sbuf_printf(sb, "\t\t\toffset %9jd (0x%jx)\n", (intmax_t)s->plex_offset, (intmax_t)s->plex_offset); } i++; } } if (flags & GV_FLAG_R) { LIST_FOREACH(s, &p->subdisks, in_plex) gv_lsi(s, sb, flags); } } /* List one or more subdisks. */ void gv_ls(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) { struct gv_softc *sc; struct gv_sd *s; int i, *flags; sc = gp->softc; i = 0; LIST_FOREACH(s, &sc->subdisks, sd) i++; sbuf_printf(sb, "%d subdisk%s:\n", i, i == 1 ? "" : "s"); if (i) { flags = gctl_get_paraml(req, "flags", sizeof(*flags)); LIST_FOREACH(s, &sc->subdisks, sd) gv_lsi(s, sb, *flags); } } /* List a single subdisk. */ void gv_lsi(struct gv_sd *s, struct sbuf *sb, int flags) { if (flags & GV_FLAG_V) { sbuf_printf(sb, "Subdisk %s:\n", s->name); sbuf_printf(sb, "\t\tSize: %16jd bytes (%jd MB)\n", (intmax_t)s->size, (intmax_t)s->size / MEGABYTE); sbuf_printf(sb, "\t\tState: %s\n", gv_sdstate(s->state)); if (s->state == GV_SD_INITIALIZING || s->state == GV_SD_REVIVING) { if (s->state == GV_SD_INITIALIZING) sbuf_printf(sb, "\t\tInitialized: "); else sbuf_printf(sb, "\t\tRevived: "); sbuf_printf(sb, "%16jd bytes (%d%%)\n", (intmax_t)s->initialized, (int)((s->initialized * 100) / s->size)); } if (s->plex_sc != NULL) { sbuf_printf(sb, "\t\tPlex %s at offset %jd (%s)\n", s->plex, (intmax_t)s->plex_offset, gv_roughlength(s->plex_offset, 1)); } sbuf_printf(sb, "\t\tDrive %s (%s) at offset %jd (%s)\n", s->drive, s->drive_sc == NULL ? "*missing*" : s->drive_sc->name, (intmax_t)s->drive_offset, gv_roughlength(s->drive_offset, 1)); sbuf_printf(sb, "\t\tFlags: %d\n", s->flags); } else { sbuf_printf(sb, "S %-21s State: ", s->name); if (s->state == GV_SD_INITIALIZING || s->state == GV_SD_REVIVING) { if (s->state == GV_SD_INITIALIZING) sbuf_printf(sb, "I "); else sbuf_printf(sb, "R "); sbuf_printf(sb, "%d%%\t", (int)((s->initialized * 100) / s->size)); } else { sbuf_printf(sb, "%s\t", gv_sdstate(s->state)); } sbuf_printf(sb, "D: %-12s Size: %s\n", s->drive, gv_roughlength(s->size, 0)); } } /* List one or more drives. */ void gv_ld(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) { struct gv_softc *sc; struct gv_drive *d; int i, *flags; sc = gp->softc; i = 0; LIST_FOREACH(d, &sc->drives, drive) i++; sbuf_printf(sb, "%d drive%s:\n", i, i == 1 ? "" : "s"); if (i) { flags = gctl_get_paraml(req, "flags", sizeof(*flags)); LIST_FOREACH(d, &sc->drives, drive) gv_ldi(d, sb, *flags); } } /* List a single drive. 
*/ void gv_ldi(struct gv_drive *d, struct sbuf *sb, int flags) { struct gv_freelist *fl; struct gv_sd *s; /* Verbose listing. */ if (flags & GV_FLAG_V) { sbuf_printf(sb, "Drive %s:\tDevice %s\n", d->name, d->device); sbuf_printf(sb, "\t\tSize: %16jd bytes (%jd MB)\n", (intmax_t)d->size, (intmax_t)d->size / MEGABYTE); sbuf_printf(sb, "\t\tUsed: %16jd bytes (%jd MB)\n", (intmax_t)d->size - d->avail, (intmax_t)(d->size - d->avail) / MEGABYTE); sbuf_printf(sb, "\t\tAvailable: %11jd bytes (%jd MB)\n", (intmax_t)d->avail, (intmax_t)d->avail / MEGABYTE); sbuf_printf(sb, "\t\tState: %s\n", gv_drivestate(d->state)); sbuf_printf(sb, "\t\tFlags: %d\n", d->flags); /* Be very verbose. */ if (flags & GV_FLAG_VV) { sbuf_printf(sb, "\t\tFree list contains %d entries:\n", d->freelist_entries); sbuf_printf(sb, "\t\t Offset\t Size\n"); LIST_FOREACH(fl, &d->freelist, freelist) sbuf_printf(sb, "\t\t%9jd\t%9jd\n", (intmax_t)fl->offset, (intmax_t)fl->size); } } else { sbuf_printf(sb, "D %-21s State: %s\t/dev/%s\tA: %jd/%jd MB " "(%d%%)\n", d->name, gv_drivestate(d->state), d->device, (intmax_t)d->avail / MEGABYTE, (intmax_t)d->size / MEGABYTE, d->size > 0 ? (int)((d->avail * 100) / d->size) : 0); } /* Recursive listing. */ if (flags & GV_FLAG_R) { LIST_FOREACH(s, &d->subdisks, from_drive) gv_lsi(s, sb, flags); } } Index: head/sys/geom/vinum/geom_vinum_move.c =================================================================== --- head/sys/geom/vinum/geom_vinum_move.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_move.c (revision 326270) @@ -1,188 +1,190 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Chris Jones * All rights reserved. * * This software was developed for the FreeBSD Project by Chris Jones * thanks to the support of Google's Summer of Code program and * mentoring by Lukas Ertl. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include void gv_move(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_sd *s; struct gv_drive *d; char buf[20], *destination, *object; int *argc, *flags, i, type; sc = gp->softc; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); if (argc == NULL) { gctl_error(req, "no arguments given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } destination = gctl_get_param(req, "destination", NULL); if (destination == NULL) { gctl_error(req, "no destination given"); return; } if (gv_object_type(sc, destination) != GV_TYPE_DRIVE) { gctl_error(req, "destination '%s' is not a drive", destination); return; } d = gv_find_drive(sc, destination); /* * We start with 1 here, because argv[0] on the command line is the * destination drive. */ for (i = 1; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); object = gctl_get_param(req, buf, NULL); if (object == NULL) continue; type = gv_object_type(sc, object); if (type != GV_TYPE_SD) { gctl_error(req, "you can only move subdisks; " "'%s' is not a subdisk", object); return; } s = gv_find_sd(sc, object); if (s == NULL) { gctl_error(req, "unknown subdisk '%s'", object); return; } gv_post_event(sc, GV_EVENT_MOVE_SD, s, d, *flags, 0); } } /* Move a subdisk. */ int gv_move_sd(struct gv_softc *sc, struct gv_sd *cursd, struct gv_drive *destination, int flags) { struct gv_drive *d; struct gv_sd *newsd, *s, *s2; struct gv_plex *p; int err; g_topology_assert(); KASSERT(cursd != NULL, ("gv_move_sd: NULL cursd")); KASSERT(destination != NULL, ("gv_move_sd: NULL destination")); d = cursd->drive_sc; if ((gv_consumer_is_open(d->consumer) || gv_consumer_is_open(destination->consumer)) && !(flags & GV_FLAG_F)) { G_VINUM_DEBUG(0, "consumers on current and destination drive " " still open"); return (GV_ERR_ISBUSY); } if (!(flags & GV_FLAG_F)) { G_VINUM_DEBUG(1, "-f flag not passed; move would be " "destructive"); return (GV_ERR_INVFLAG); } if (destination == cursd->drive_sc) { G_VINUM_DEBUG(1, "subdisk '%s' already on drive '%s'", cursd->name, destination->name); return (GV_ERR_ISATTACHED); } /* XXX: Does it have to be part of a plex? */ p = gv_find_plex(sc, cursd->plex); if (p == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' is not part of a plex", cursd->name); return (GV_ERR_NOTFOUND); } /* Stale the old subdisk. */ err = gv_set_sd_state(cursd, GV_SD_STALE, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); if (err) { G_VINUM_DEBUG(0, "unable to set the subdisk '%s' to state " "'stale'", cursd->name); return (err); } /* * Create new subdisk. Ideally, we'd use gv_new_sd, but that requires * us to create a string for it to parse, which is silly. * TODO: maybe refactor gv_new_sd such that this is no longer the case. */ newsd = g_malloc(sizeof(struct gv_sd), M_WAITOK | M_ZERO); newsd->plex_offset = cursd->plex_offset; newsd->size = cursd->size; newsd->drive_offset = -1; strlcpy(newsd->name, cursd->name, sizeof(newsd->name)); strlcpy(newsd->drive, destination->name, sizeof(newsd->drive)); strlcpy(newsd->plex, cursd->plex, sizeof(newsd->plex)); newsd->state = GV_SD_STALE; newsd->vinumconf = cursd->vinumconf; err = gv_sd_to_drive(newsd, destination); if (err) { /* XXX not enough free space? */ g_free(newsd); return (err); } /* Replace the old sd by the new one. 
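 * The stale copy is unlinked from the plex first, so that gv_sd_to_plex()
 * does not find two subdisks claiming the same plex offset.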
*/ LIST_FOREACH_SAFE(s, &p->subdisks, in_plex, s2) { if (s == cursd) { gv_rm_sd(sc, s); } } gv_sd_to_plex(newsd, p); LIST_INSERT_HEAD(&sc->subdisks, newsd, sd); /* Update volume size of plex. */ if (p->vol_sc != NULL) gv_update_vol_size(p->vol_sc, gv_vol_size(p->vol_sc)); gv_save_config(p->vinumconf); return (0); } Index: head/sys/geom/vinum/geom_vinum_plex.c =================================================================== --- head/sys/geom/vinum/geom_vinum_plex.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_plex.c (revision 326270) @@ -1,1048 +1,1050 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include static int gv_check_parity(struct gv_plex *, struct bio *, struct gv_raid5_packet *); static int gv_normal_parity(struct gv_plex *, struct bio *, struct gv_raid5_packet *); static void gv_plex_flush(struct gv_plex *); static int gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, int *, int); static int gv_plex_normal_request(struct gv_plex *, struct bio *, off_t, off_t, caddr_t); static void gv_post_bio(struct gv_softc *, struct bio *); void gv_plex_start(struct gv_plex *p, struct bio *bp) { struct bio *cbp; struct gv_sd *s; struct gv_raid5_packet *wp; caddr_t addr; off_t bcount, boff, len; bcount = bp->bio_length; addr = bp->bio_data; boff = bp->bio_offset; /* Walk over the whole length of the request, we might split it up. */ while (bcount > 0) { wp = NULL; /* * RAID5 plexes need special treatment, as a single request * might involve several read/write sub-requests. */ if (p->org == GV_PLEX_RAID5) { wp = gv_raid5_start(p, bp, addr, boff, bcount); if (wp == NULL) return; len = wp->length; if (TAILQ_EMPTY(&wp->bits)) g_free(wp); else if (wp->lockbase != -1) TAILQ_INSERT_TAIL(&p->packets, wp, list); /* * Requests to concatenated and striped plexes go straight * through. */ } else { len = gv_plex_normal_request(p, bp, boff, bcount, addr); } if (len < 0) return; bcount -= len; addr += len; boff += len; } /* * Fire off all sub-requests. 
We get the correct consumer (== drive) * to send each request to via the subdisk that was stored in * cbp->bio_caller1. */ cbp = bioq_takefirst(p->bqueue); while (cbp != NULL) { /* * RAID5 sub-requests need to come in correct order, otherwise * we trip over the parity, as it might be overwritten by * another sub-request. We abuse cbp->bio_caller2 to mark * potential overlap situations. */ if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) { /* Park the bio on the waiting queue. */ cbp->bio_pflags |= GV_BIO_ONHOLD; bioq_disksort(p->wqueue, cbp); } else { s = cbp->bio_caller1; g_io_request(cbp, s->drive_sc->consumer); } cbp = bioq_takefirst(p->bqueue); } } static int gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, off_t *real_len, int *sdno, int growing) { struct gv_sd *s; int i, sdcount; off_t len_left, stripeend, stripeno, stripestart; switch (p->org) { case GV_PLEX_CONCAT: /* * Find the subdisk where this request starts. The subdisks in * this list must be ordered by plex_offset. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->plex_offset <= boff && s->plex_offset + s->size > boff) { *sdno = i; break; } i++; } if (s == NULL || s->drive_sc == NULL) return (GV_ERR_NOTFOUND); /* Calculate corresponding offsets on disk. */ *real_off = boff - s->plex_offset; len_left = s->size - (*real_off); KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); *real_len = (bcount > len_left) ? len_left : bcount; break; case GV_PLEX_STRIPED: /* The number of the stripe where the request starts. */ stripeno = boff / p->stripesize; KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0")); /* Take growing subdisks into account when calculating. */ sdcount = gv_sdcount(p, (boff >= p->synced)); if (!(boff + bcount <= p->synced) && (p->flags & GV_PLEX_GROWING) && !growing) return (GV_ERR_ISBUSY); *sdno = stripeno % sdcount; KASSERT(sdno >= 0, ("gv_plex_offset: sdno < 0")); stripestart = (stripeno / sdcount) * p->stripesize; KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0")); stripeend = stripestart + p->stripesize; *real_off = boff - (stripeno * p->stripesize) + stripestart; len_left = stripeend - *real_off; KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); *real_len = (bcount <= len_left) ? bcount : len_left; break; default: return (GV_ERR_PLEXORG); } return (0); } /* * Prepare a normal plex request. */ static int gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff, off_t bcount, caddr_t addr) { struct gv_sd *s; struct bio *cbp; off_t real_len, real_off; int i, err, sdno; s = NULL; sdno = -1; real_len = real_off = 0; err = ENXIO; if (p == NULL || LIST_EMPTY(&p->subdisks)) goto bad; err = gv_plex_offset(p, boff, bcount, &real_off, &real_len, &sdno, (bp->bio_pflags & GV_BIO_GROW)); /* If the request was blocked, put it into wait. */ if (err == GV_ERR_ISBUSY) { bioq_disksort(p->rqueue, bp); return (-1); /* "Fail", and delay request. */ } if (err) { err = ENXIO; goto bad; } err = ENXIO; /* Find the right subdisk. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == sdno) break; i++; } /* Subdisk not found. */ if (s == NULL || s->drive_sc == NULL) goto bad; /* Now check if we can handle the request on this subdisk. */ switch (s->state) { case GV_SD_UP: /* If the subdisk is up, just continue. 
*/ break; case GV_SD_DOWN: if (bp->bio_pflags & GV_BIO_INTERNAL) G_VINUM_DEBUG(0, "subdisk must be in the stale state in" " order to perform administrative requests"); goto bad; case GV_SD_STALE: if (!(bp->bio_pflags & GV_BIO_SYNCREQ)) { G_VINUM_DEBUG(0, "subdisk stale, unable to perform " "regular requests"); goto bad; } G_VINUM_DEBUG(1, "sd %s is initializing", s->name); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); break; case GV_SD_INITIALIZING: if (bp->bio_cmd == BIO_READ) goto bad; break; default: /* All other subdisk states mean it's not accessible. */ goto bad; } /* Clone the bio and adjust the offsets and sizes. */ cbp = g_clone_bio(bp); if (cbp == NULL) { err = ENOMEM; goto bad; } cbp->bio_offset = real_off + s->drive_offset; cbp->bio_length = real_len; cbp->bio_data = addr; cbp->bio_done = gv_done; cbp->bio_caller1 = s; /* Store the sub-requests now and let others issue them. */ bioq_insert_tail(p->bqueue, cbp); return (real_len); bad: G_VINUM_LOGREQ(0, bp, "plex request failed."); /* Building the sub-request failed. If internal BIO, do not deliver. */ if (bp->bio_pflags & GV_BIO_INTERNAL) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | GV_PLEX_GROWING); return (-1); } g_io_deliver(bp, err); return (-1); } /* * Handle a completed request to a striped or concatenated plex. */ void gv_plex_normal_done(struct gv_plex *p, struct bio *bp) { struct bio *pbp; pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { /* Just set it to length since multiple plexes will * screw things up. */ pbp->bio_completed = pbp->bio_length; if (pbp->bio_pflags & GV_BIO_SYNCREQ) gv_sync_complete(p, pbp); else if (pbp->bio_pflags & GV_BIO_GROW) gv_grow_complete(p, pbp); else g_io_deliver(pbp, pbp->bio_error); } } /* * Handle a completed request to a RAID-5 plex. */ void gv_plex_raid5_done(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct bio *cbp, *pbp; struct gv_bioq *bq, *bq2; struct gv_raid5_packet *wp; off_t completed; int i; completed = 0; sc = p->vinumconf; wp = bp->bio_caller2; switch (bp->bio_parent->bio_cmd) { case BIO_READ: if (wp == NULL) { completed = bp->bio_completed; break; } TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { if (bq->bp != bp) continue; TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); for (i = 0; i < wp->length; i++) wp->data[i] ^= bp->bio_data[i]; break; } if (TAILQ_EMPTY(&wp->bits)) { completed = wp->length; if (wp->lockbase != -1) { TAILQ_REMOVE(&p->packets, wp, list); /* Bring the waiting bios back into the game. */ pbp = bioq_takefirst(p->wqueue); while (pbp != NULL) { gv_post_bio(sc, pbp); pbp = bioq_takefirst(p->wqueue); } } g_free(wp); } break; case BIO_WRITE: /* XXX can this ever happen? */ if (wp == NULL) { completed = bp->bio_completed; break; } /* Check if we need to handle parity data. */ TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { if (bq->bp != bp) continue; TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); cbp = wp->parity; if (cbp != NULL) { for (i = 0; i < wp->length; i++) cbp->bio_data[i] ^= bp->bio_data[i]; } break; } /* Handle parity data. */ if (TAILQ_EMPTY(&wp->bits)) { if (bp->bio_parent->bio_pflags & GV_BIO_CHECK) i = gv_check_parity(p, bp, wp); else i = gv_normal_parity(p, bp, wp); /* All of our sub-requests have finished. */ if (i) { completed = wp->length; TAILQ_REMOVE(&p->packets, wp, list); /* Bring the waiting bios back into the game. 
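 * They were parked on wqueue because they overlapped a stripe this packet
 * had locked; going through the worker queue again keeps the parity
 * updates serialized.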
*/ pbp = bioq_takefirst(p->wqueue); while (pbp != NULL) { gv_post_bio(sc, pbp); pbp = bioq_takefirst(p->wqueue); } g_free(wp); } } break; } pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += completed; /* When the original request is finished, we deliver it. */ pbp->bio_inbed++; if (pbp->bio_inbed == pbp->bio_children) { /* Hand it over for checking or delivery. */ if (pbp->bio_cmd == BIO_WRITE && (pbp->bio_pflags & GV_BIO_CHECK)) { gv_parity_complete(p, pbp); } else if (pbp->bio_cmd == BIO_WRITE && (pbp->bio_pflags & GV_BIO_REBUILD)) { gv_rebuild_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_INIT) { gv_init_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_SYNCREQ) { gv_sync_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_GROW) { gv_grow_complete(p, pbp); } else { g_io_deliver(pbp, pbp->bio_error); } } /* Clean up what we allocated. */ if (bp->bio_cflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); } static int gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp) { struct bio *pbp; struct gv_sd *s; int err, finished, i; err = 0; finished = 1; if (wp->waiting != NULL) { pbp = wp->waiting; wp->waiting = NULL; s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } else if (wp->parity != NULL) { pbp = wp->parity; wp->parity = NULL; /* Check if the parity is correct. */ for (i = 0; i < wp->length; i++) { if (bp->bio_data[i] != pbp->bio_data[i]) { err = 1; break; } } /* The parity is not correct... */ if (err) { bp->bio_parent->bio_error = EAGAIN; /* ... but we rebuild it. */ if (bp->bio_parent->bio_pflags & GV_BIO_PARITY) { s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } } /* * Clean up the BIO we would have used for rebuilding the * parity. */ if (finished) { bp->bio_parent->bio_inbed++; g_destroy_bio(pbp); } } return (finished); } static int gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp) { struct bio *cbp, *pbp; struct gv_sd *s; int finished, i; finished = 1; if (wp->waiting != NULL) { pbp = wp->waiting; wp->waiting = NULL; cbp = wp->parity; for (i = 0; i < wp->length; i++) cbp->bio_data[i] ^= pbp->bio_data[i]; s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } else if (wp->parity != NULL) { cbp = wp->parity; wp->parity = NULL; s = cbp->bio_caller1; g_io_request(cbp, s->drive_sc->consumer); finished = 0; } return (finished); } /* Flush the queue with delayed requests. 
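 * Requests land on rqueue when gv_plex_offset() reports GV_ERR_ISBUSY,
 * e.g. while a grow operation still owns the region they touch.
 */

/*
 * The drain pattern used below, reduced to a standalone sketch over a
 * minimal singly-linked queue (illustrative types, not the kernel API):
 */
#include <stddef.h>

struct req {
	struct req *next;
};

struct req_queue {
	struct req *head;
};

/* Pop the head request, or return NULL when the queue is empty. */
static struct req *
req_takefirst(struct req_queue *q)
{
	struct req *r;

	if ((r = q->head) != NULL)
		q->head = r->next;
	return (r);
}

/* Re-issue every delayed request in arrival order. */
static void
req_flush(struct req_queue *q, void (*start)(struct req *))
{
	struct req *r;

	while ((r = req_takefirst(q)) != NULL)
		start(r);
}

/* gv_plex_flush() below does the same over the plex's rqueue.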
*/ static void gv_plex_flush(struct gv_plex *p) { struct gv_softc *sc; struct bio *bp; sc = p->vinumconf; bp = bioq_takefirst(p->rqueue); while (bp != NULL) { gv_plex_start(p, bp); bp = bioq_takefirst(p->rqueue); } } static void gv_post_bio(struct gv_softc *sc, struct bio *bp) { KASSERT(sc != NULL, ("NULL sc")); KASSERT(bp != NULL, ("NULL bp")); mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_down, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } int gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset, off_t length, int type, caddr_t data) { struct gv_softc *sc; struct bio *bp; KASSERT(from != NULL, ("NULL from")); KASSERT(to != NULL, ("NULL to")); sc = from->vinumconf; KASSERT(sc != NULL, ("NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "sync from '%s' failed at offset " " %jd; out of memory", from->name, offset); return (ENOMEM); } bp->bio_length = length; bp->bio_done = gv_done; bp->bio_pflags |= GV_BIO_SYNCREQ; bp->bio_offset = offset; bp->bio_caller1 = from; bp->bio_caller2 = to; bp->bio_cmd = type; if (data == NULL) data = g_malloc(length, M_WAITOK); bp->bio_pflags |= GV_BIO_MALLOC; /* Free on the next run. */ bp->bio_data = data; /* Send down next. */ gv_post_bio(sc, bp); //gv_plex_start(from, bp); return (0); } /* * Handle a finished plex sync bio. */ int gv_sync_complete(struct gv_plex *to, struct bio *bp) { struct gv_plex *from, *p; struct gv_sd *s; struct gv_volume *v; struct gv_softc *sc; off_t offset; int err; g_topology_assert_not(); err = 0; KASSERT(to != NULL, ("NULL to")); KASSERT(bp != NULL, ("NULL bp")); from = bp->bio_caller2; KASSERT(from != NULL, ("NULL from")); v = to->vol_sc; KASSERT(v != NULL, ("NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("NULL sc")); /* If it was a read, write it. */ if (bp->bio_cmd == BIO_READ) { err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length, BIO_WRITE, bp->bio_data); /* If it was a write, read the next one. */ } else if (bp->bio_cmd == BIO_WRITE) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); to->synced += bp->bio_length; /* If we're finished, clean up. */ if (bp->bio_offset + bp->bio_length >= from->size) { G_VINUM_DEBUG(1, "syncing of %s from %s completed", to->name, from->name); /* Update our state. */ LIST_FOREACH(s, &to->subdisks, in_plex) gv_set_sd_state(s, GV_SD_UP, 0); gv_update_plex_state(to); to->flags &= ~GV_PLEX_SYNCING; to->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } else { offset = bp->bio_offset + bp->bio_length; err = gv_sync_request(from, to, offset, MIN(bp->bio_length, from->size - offset), BIO_READ, NULL); } } g_destroy_bio(bp); /* Clean up if there was an error. */ if (err) { to->flags &= ~GV_PLEX_SYNCING; G_VINUM_DEBUG(0, "error syncing plexes: error code %d", err); } /* Check if all plexes are synced, and lower refcounts. */ g_topology_lock(); LIST_FOREACH(p, &v->plexes, in_volume) { if (p->flags & GV_PLEX_SYNCING) { g_topology_unlock(); return (-1); } } /* If we came here, all plexes are synced, and we're free. */ gv_access(v->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(1, "plex sync completed"); gv_volume_flush(v); return (0); } /* * Create a new bio struct for the next grow request. 
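 * Growing proceeds as a read/write ping-pong: every completed read is
 * turned into a write of the same data, and every completed write starts
 * the read of the next chunk (see gv_grow_complete() further down).
 */

/*
 * The same copy loop written synchronously as a sketch; the two callbacks
 * stand in for the asynchronous BIO round trips (all names are ours):
 */
#include <stdint.h>

typedef int (*xfer_fn)(uint8_t *buf, int64_t off, int64_t len);

static int
copy_region(xfer_fn rd, xfer_fn wr, uint8_t *buf, int64_t bufsz,
    int64_t size)
{
	int64_t len, off;
	int err;

	for (off = 0; off < size; off += len) {
		/* Clamp the last chunk to what is actually left. */
		len = size - off < bufsz ? size - off : bufsz;
		if ((err = rd(buf, off, len)) != 0 ||
		    (err = wr(buf, off, len)) != 0)
			return (err);
	}
	return (0);
}

/* gv_grow_request() sets up the bio and data buffer for one such step.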
*/ int gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type, caddr_t data) { struct gv_softc *sc; struct bio *bp; KASSERT(p != NULL, ("gv_grow_request: NULL p")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_grow_request: NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "grow of %s failed creating bio: " "out of memory", p->name); return (ENOMEM); } bp->bio_cmd = type; bp->bio_done = gv_done; bp->bio_error = 0; bp->bio_caller1 = p; bp->bio_offset = offset; bp->bio_length = length; bp->bio_pflags |= GV_BIO_GROW; if (data == NULL) data = g_malloc(length, M_WAITOK); bp->bio_pflags |= GV_BIO_MALLOC; bp->bio_data = data; gv_post_bio(sc, bp); //gv_plex_start(p, bp); return (0); } /* * Finish handling of a bio to a growing plex. */ void gv_grow_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_sd *s; struct gv_volume *v; off_t origsize, offset; int sdcount, err; v = p->vol_sc; KASSERT(v != NULL, ("gv_grow_complete: NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("gv_grow_complete: NULL sc")); err = 0; /* If it was a read, write it. */ if (bp->bio_cmd == BIO_READ) { p->synced += bp->bio_length; err = gv_grow_request(p, bp->bio_offset, bp->bio_length, BIO_WRITE, bp->bio_data); /* If it was a write, read next. */ } else if (bp->bio_cmd == BIO_WRITE) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); /* Find the real size of the plex. */ sdcount = gv_sdcount(p, 1); s = LIST_FIRST(&p->subdisks); KASSERT(s != NULL, ("NULL s")); origsize = (s->size * (sdcount - 1)); if (bp->bio_offset + bp->bio_length >= origsize) { G_VINUM_DEBUG(1, "growing of %s completed", p->name); p->flags &= ~GV_PLEX_GROWING; LIST_FOREACH(s, &p->subdisks, in_plex) { s->flags &= ~GV_SD_GROW; gv_set_sd_state(s, GV_SD_UP, 0); } p->size = gv_plex_size(p); gv_update_vol_size(v, gv_vol_size(v)); gv_set_plex_state(p, GV_PLEX_UP, 0); g_topology_lock(); gv_access(v->provider, -1, -1, 0); g_topology_unlock(); p->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); /* Issue delayed requests. */ gv_plex_flush(p); } else { offset = bp->bio_offset + bp->bio_length; err = gv_grow_request(p, offset, MIN(bp->bio_length, origsize - offset), BIO_READ, NULL); } } g_destroy_bio(bp); if (err) { p->flags &= ~GV_PLEX_GROWING; G_VINUM_DEBUG(0, "error growing plex: error code %d", err); } } /* * Create an initialization BIO and send it off to the consumer. Assume that * we're given initialization data as parameter. */ void gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length) { struct gv_drive *d; struct g_consumer *cp; struct bio *bp, *cbp; KASSERT(s != NULL, ("gv_init_request: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_init_request: NULL d")); cp = d->consumer; KASSERT(cp != NULL, ("gv_init_request: NULL cp")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd" " (drive offset %jd); out of memory", s->name, (intmax_t)s->initialized, (intmax_t)start); return; /* XXX: Error codes. */ } bp->bio_cmd = BIO_WRITE; bp->bio_data = data; bp->bio_done = gv_done; bp->bio_error = 0; bp->bio_length = length; bp->bio_pflags |= GV_BIO_INIT; bp->bio_offset = start; bp->bio_caller1 = s; /* Then ofcourse, we have to clone it. */ cbp = g_clone_bio(bp); if (cbp == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd" " (drive offset %jd); out of memory", s->name, (intmax_t)s->initialized, (intmax_t)start); return; /* XXX: Error codes. 
*/ } cbp->bio_done = gv_done; cbp->bio_caller1 = s; /* Send it off to the consumer. */ g_io_request(cbp, cp); } /* * Handle a finished initialization BIO. */ void gv_init_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_drive *d; struct g_consumer *cp; struct gv_sd *s; off_t start, length; caddr_t data; int error; s = bp->bio_caller1; start = bp->bio_offset; length = bp->bio_length; error = bp->bio_error; data = bp->bio_data; KASSERT(s != NULL, ("gv_init_complete: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_init_complete: NULL d")); cp = d->consumer; KASSERT(cp != NULL, ("gv_init_complete: NULL cp")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_init_complete: NULL sc")); g_destroy_bio(bp); /* * First we need to find out if it was okay, and abort if it's not. * Then we need to free previous buffers, find out the correct subdisk, * as well as getting the correct starting point and length of the BIO. */ if (start >= s->drive_offset + s->size) { /* Free the data we initialized. */ if (data != NULL) g_free(data); g_topology_assert_not(); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); if (error) { gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); } else { gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG); s->initialized = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); G_VINUM_DEBUG(1, "subdisk '%s' init: finished " "successfully", s->name); } return; } s->initialized += length; start += length; gv_init_request(s, start, data, length); } /* * Create a new bio struct for the next parity rebuild. Used both by internal * rebuild of degraded plexes as well as user initiated rebuilds/checks. */ void gv_parity_request(struct gv_plex *p, int flags, off_t offset) { struct gv_softc *sc; struct bio *bp; KASSERT(p != NULL, ("gv_parity_request: NULL p")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_parity_request: NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "rebuild of %s failed creating bio: " "out of memory", p->name); return; } bp->bio_cmd = BIO_WRITE; bp->bio_done = gv_done; bp->bio_error = 0; bp->bio_length = p->stripesize; bp->bio_caller1 = p; /* * Check if it's a rebuild of a degraded plex or a user request of * parity rebuild. */ if (flags & GV_BIO_REBUILD) bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK); else if (flags & GV_BIO_CHECK) bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO); else { G_VINUM_DEBUG(0, "invalid flags given in rebuild"); return; } bp->bio_pflags = flags; bp->bio_pflags |= GV_BIO_MALLOC; /* We still have more parity to build. */ bp->bio_offset = offset; gv_post_bio(sc, bp); //gv_plex_start(p, bp); /* Send it down to the plex. */ } /* * Handle a finished parity write. */ void gv_parity_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; int error, flags; error = bp->bio_error; flags = bp->bio_pflags; flags &= ~GV_BIO_MALLOC; sc = p->vinumconf; KASSERT(sc != NULL, ("gv_parity_complete: NULL sc")); /* Clean up what we allocated. */ if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); if (error == EAGAIN) { G_VINUM_DEBUG(0, "parity incorrect at offset 0x%jx", (intmax_t)p->synced); } /* Any error is fatal, except EAGAIN when we're rebuilding. */ if (error && !(error == EAGAIN && (flags & GV_BIO_PARITY))) { /* Make sure we don't have the lock. 
*/ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(0, "parity check on %s failed at 0x%jx " "errno %d", p->name, (intmax_t)p->synced, error); return; } else { p->synced += p->stripesize; } if (p->synced >= p->size) { /* Make sure we don't have the lock. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); /* We're finished. */ G_VINUM_DEBUG(1, "parity operation on %s finished", p->name); p->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return; } /* Send down next. It will determine if we need to itself. */ gv_parity_request(p, flags, p->synced); } /* * Handle a finished plex rebuild bio. */ void gv_rebuild_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_sd *s; int error, flags; off_t offset; error = bp->bio_error; flags = bp->bio_pflags; offset = bp->bio_offset; flags &= ~GV_BIO_MALLOC; sc = p->vinumconf; KASSERT(sc != NULL, ("gv_rebuild_complete: NULL sc")); /* Clean up what we allocated. */ if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); if (error) { g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(0, "rebuild of %s failed at offset %jd errno: %d", p->name, (intmax_t)offset, error); p->flags &= ~GV_PLEX_REBUILDING; p->synced = 0; gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ return; } offset += (p->stripesize * (gv_sdcount(p, 1) - 1)); if (offset >= p->size) { /* We're finished. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(1, "rebuild of %s finished", p->name); gv_save_config(p->vinumconf); p->flags &= ~GV_PLEX_REBUILDING; p->synced = 0; /* Try to up all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) gv_update_sd_state(s); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ return; } /* Send down next. It will determine if we need to itself. */ gv_parity_request(p, flags, offset); } Index: head/sys/geom/vinum/geom_vinum_raid5.c =================================================================== --- head/sys/geom/vinum/geom_vinum_raid5.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_raid5.c (revision 326270) @@ -1,661 +1,663 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include static int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, int *, int *, int); static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, struct gv_raid5_packet *, caddr_t, int); static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t, int *); static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t); static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t); struct gv_raid5_packet * gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct bio *cbp; struct gv_raid5_packet *wp, *wp2; struct gv_bioq *bq, *bq2; int err, delay; delay = 0; wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); wp->bio = bp; wp->waiting = NULL; wp->parity = NULL; TAILQ_INIT(&wp->bits); if (bp->bio_pflags & GV_BIO_REBUILD) err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); else if (bp->bio_pflags & GV_BIO_CHECK) err = gv_raid5_check(p, wp, bp, addr, boff, bcount); else err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); /* Means we have a delayed request. */ if (delay) { g_free(wp); return (NULL); } /* * Building the sub-request failed, we probably need to clean up a lot. */ if (err) { G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); } if (wp->waiting != NULL) { if (wp->waiting->bio_cflags & GV_BIO_MALLOC) g_free(wp->waiting->bio_data); g_destroy_bio(wp->waiting); } if (wp->parity != NULL) { if (wp->parity->bio_cflags & GV_BIO_MALLOC) g_free(wp->parity->bio_data); g_destroy_bio(wp->parity); } g_free(wp); TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { if (wp->bio != bp) continue; TAILQ_REMOVE(&p->packets, wp, list); TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); } g_free(wp); } cbp = bioq_takefirst(p->bqueue); while (cbp != NULL) { if (cbp->bio_cflags & GV_BIO_MALLOC) g_free(cbp->bio_data); g_destroy_bio(cbp); cbp = bioq_takefirst(p->bqueue); } /* If internal, stop and reset state. */ if (bp->bio_pflags & GV_BIO_INTERNAL) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); /* Reset flags. */ p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | GV_PLEX_GROWING); return (NULL); } g_io_deliver(bp, err); return (NULL); } return (wp); } /* * Check if the stripe that the work packet wants is already being used by * some other work packet. 
*/ int gv_stripe_active(struct gv_plex *p, struct bio *bp) { struct gv_raid5_packet *wp, *owp; int overlap; wp = bp->bio_caller2; if (wp->lockbase == -1) return (0); overlap = 0; TAILQ_FOREACH(owp, &p->packets, list) { if (owp == wp) break; if ((wp->lockbase >= owp->lockbase) && (wp->lockbase <= owp->lockbase + owp->length)) { overlap++; break; } if ((wp->lockbase <= owp->lockbase) && (wp->lockbase + wp->length >= owp->lockbase)) { overlap++; break; } } return (overlap); } static int gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *parity, *s; struct gv_bioq *bq; struct bio *cbp; int i, psdno; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); /* Find the right subdisk. */ parity = NULL; i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == psdno) { parity = s; break; } i++; } /* Parity stripe not found. */ if (parity == NULL) return (ENXIO); if (parity->state != GV_SD_UP) return (ENXIO); wp->length = real_len; wp->data = addr; wp->lockbase = real_off; /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the parity subdisk. */ if (s == parity) continue; /* Skip growing subdisks. */ if (s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Read the parity data. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; wp->waiting = cbp; /* * In case we want to rebuild the parity, create an extra BIO to write * it out. It also acts as buffer for the XOR operations. */ cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); if (cbp == NULL) return (ENOMEM); wp->parity = cbp; return (0); } /* Rebuild a degraded RAID5 plex. */ static int gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *broken, *s; struct gv_bioq *bq; struct bio *cbp; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); /* Find the right subdisk. */ broken = NULL; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state != GV_SD_UP) broken = s; } /* Broken stripe not found. */ if (broken == NULL) return (ENXIO); switch (broken->state) { case GV_SD_UP: return (EINVAL); case GV_SD_STALE: if (!(bp->bio_pflags & GV_BIO_REBUILD)) return (ENXIO); G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); /* Set this bit now, but should be set at end. */ broken->flags |= GV_SD_CANGOUP; break; case GV_SD_REVIVING: break; default: /* All other subdisk states mean it's not accessible. */ return (ENXIO); } wp->length = real_len; wp->data = addr; wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; /* Skip growing subdisks. 
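 */

/*
 * Illustration: the collision test gv_stripe_active() performs above
 * against each queued work packet reduces to a one-dimensional
 * interval intersection.  A standalone restatement (hypothetical
 * helper; like the original, the bounds are treated as inclusive):
 */
#include <stdbool.h>
#include <sys/types.h>

static bool
ranges_overlap_sketch(off_t s1, off_t l1, off_t s2, off_t l2)
{
        /* Overlap iff neither range lies entirely before the other. */
        return (s1 <= s2 + l2 && s2 <= s1 + l1);
}

/* Resuming the rebuild loop: subdisks that are still growing are skipped --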
*/ if (s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Write the parity data. */ cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); wp->parity = cbp; p->synced = boff; /* Post notification that we're finished. */ return (0); } /* Build a request group to perform (part of) a RAID5 request. */ static int gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) { struct g_geom *gp; struct gv_sd *broken, *original, *parity, *s; struct gv_bioq *bq; struct bio *cbp; int i, psdno, sdno, type, grow; off_t real_len, real_off; gp = bp->bio_to->geom; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); /* We are optimistic and assume that this request will be OK. */ #define REQ_TYPE_NORMAL 0 #define REQ_TYPE_DEGRADED 1 #define REQ_TYPE_NOPARITY 2 type = REQ_TYPE_NORMAL; original = parity = broken = NULL; /* XXX: The resize won't crash with rebuild or sync, but we should still * be aware of it. Also this should perhaps be done on rebuild/check as * well? */ /* If we're over, we must use the old. */ if (boff >= p->synced) { grow = 1; /* Or if over the resized offset, we use all drives. */ } else if (boff + bcount <= p->synced) { grow = 0; /* Else, we're in the middle, and must wait a bit. */ } else { bioq_disksort(p->rqueue, bp); *delay = 1; return (0); } gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno, grow); /* Find the right subdisks. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == sdno) original = s; if (i == psdno) parity = s; if (s->state != GV_SD_UP) broken = s; i++; } if ((original == NULL) || (parity == NULL)) return (ENXIO); /* Our data stripe is missing. */ if (original->state != GV_SD_UP) type = REQ_TYPE_DEGRADED; /* If synchronizing request, just write it if disks are stale. */ if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { type = REQ_TYPE_NORMAL; /* Our parity stripe is missing. */ } else if (parity->state != GV_SD_UP) { /* We cannot take another failure if we're already degraded. */ if (type != REQ_TYPE_NORMAL) return (ENXIO); else type = REQ_TYPE_NOPARITY; } wp->length = real_len; wp->data = addr; wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) type = REQ_TYPE_NORMAL; if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { bioq_disksort(p->rqueue, bp); *delay = 1; return (0); } switch (bp->bio_cmd) { case BIO_READ: /* * For a degraded read we need to read in all stripes except * the broken one plus the parity stripe and then recalculate * the desired data. */ if (type == REQ_TYPE_DEGRADED) { bzero(wp->data, wp->length); LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; /* Skip growing if within offset. */ if (grow && s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* A normal read can be fulfilled with the original subdisk. 
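 */

/*
 * Illustration: the request classification used above, reduced to a
 * pure function over the REQ_TYPE_* constants defined earlier in
 * gv_raid5_request() (hypothetical helper; the stale-subdisk sync
 * special case shown above is omitted here).
 */
static int
classify_sketch(int data_up, int parity_up)
{
        if (!data_up && !parity_up)
                return (-1);    /* double failure: ENXIO in the driver */
        if (!data_up)
                return (REQ_TYPE_DEGRADED);
        if (!parity_up)
                return (REQ_TYPE_NOPARITY);
        return (REQ_TYPE_NORMAL);
}

/* When the data subdisk is healthy, a normal read goes straight to it --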
*/ } else { cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); } wp->lockbase = -1; break; case BIO_WRITE: /* * A degraded write means we cannot write to the original data * subdisk. Thus we need to read in all valid stripes, * recalculate the parity from the original data, and then * write the parity stripe back out. */ if (type == REQ_TYPE_DEGRADED) { /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken and the parity subdisk. */ if ((s == broken) || (s == parity)) continue; /* Skip growing if within offset. */ if (grow && s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Write the parity data. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); bcopy(addr, cbp->bio_data, wp->length); wp->parity = cbp; /* * When the parity stripe is missing we just write out the data. */ } else if (type == REQ_TYPE_NOPARITY) { cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* * A normal write request goes to the original subdisk, then we * read in all other stripes, recalculate the parity and write * out the parity again. */ } else { /* Read old parity. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Read old data. */ cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Write new data. */ cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); /* * We must not write the new data until the old data * was read, so hold this BIO back until we're ready * for it. */ wp->waiting = cbp; /* The final bio for the parity. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); /* Remember that this is the BIO for the parity data. */ wp->parity = cbp; } break; default: return (EINVAL); } return (0); } /* * Calculate the offsets in the various subdisks for a RAID5 request. Also take * care of new subdisks in an expanded RAID5 array. * XXX: This assumes that the new subdisks are inserted after the others (which * is okay as long as plex_offset is larger). If subdisks are inserted into the * plexlist before, we get problems. */ static int gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, off_t *real_len, int *sdno, int *psdno, int growing) { struct gv_sd *s; int sd, psd, sdcount; off_t len_left, stripeend, stripeoff, stripestart; sdcount = p->sdcount; if (growing) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } } /* The number of the subdisk containing the parity stripe. */ psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % sdcount; KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); /* Offset of the start address from the start of the stripe. 
*/ stripeoff = boff % (p->stripesize * (sdcount - 1)); KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); /* The number of the subdisk where the stripe resides. */ sd = stripeoff / p->stripesize; KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); /* At or past parity subdisk. */ if (sd >= psd) sd++; /* The offset of the stripe on this subdisk. */ stripestart = (boff - stripeoff) / (sdcount - 1); KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); stripeoff %= p->stripesize; /* The offset of the request on this subdisk. */ *real_off = stripestart + stripeoff; stripeend = stripestart + p->stripesize; len_left = stripeend - *real_off; KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); *real_len = (bcount <= len_left) ? bcount : len_left; if (sdno != NULL) *sdno = sd; if (psdno != NULL) *psdno = psd; return (0); } static struct bio * gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, caddr_t addr, int use_wp) { struct bio *cbp; cbp = g_clone_bio(bp); if (cbp == NULL) return (NULL); if (addr == NULL) { cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); cbp->bio_cflags |= GV_BIO_MALLOC; } else cbp->bio_data = addr; cbp->bio_offset = wp->lockbase + s->drive_offset; cbp->bio_length = wp->length; cbp->bio_done = gv_done; cbp->bio_caller1 = s; if (use_wp) cbp->bio_caller2 = wp; return (cbp); } Index: head/sys/geom/vinum/geom_vinum_raid5.h =================================================================== --- head/sys/geom/vinum/geom_vinum_raid5.h (revision 326269) +++ head/sys/geom/vinum/geom_vinum_raid5.h (revision 326270) @@ -1,55 +1,57 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _GEOM_VINUM_RAID5_H_ #define _GEOM_VINUM_RAID5_H_ /* * A single RAID5 request usually needs more than one I/O transaction, * depending on the state of the associated subdisks and the direction of the * transaction (read or write). */ struct gv_raid5_packet { caddr_t data; /* Data buffer of this sub-request- */ off_t length; /* Size of data buffer. */ off_t lockbase; /* Deny access to our plex offset. */ struct bio *bio; /* Pointer to the original bio. */ struct bio *parity; /* The bio containing the parity data. 
*/ struct bio *waiting; /* A bio that needs to wait for other bios. */ TAILQ_HEAD(,gv_bioq) bits; /* List of subrequests. */ TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. */ }; struct gv_raid5_packet * gv_raid5_start(struct gv_plex *, struct bio *, caddr_t, off_t, off_t); int gv_stripe_active(struct gv_plex *, struct bio *); #endif /* !_GEOM_VINUM_RAID5_H_ */ Index: head/sys/geom/vinum/geom_vinum_rename.c =================================================================== --- head/sys/geom/vinum/geom_vinum_rename.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_rename.c (revision 326270) @@ -1,261 +1,263 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Chris Jones * All rights reserved. * * This software was developed for the FreeBSD Project by Chris Jones * thanks to the support of Google's Summer of Code program and * mentoring by Lukas Ertl. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include void gv_rename(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; char *newname, *object, *name; int *flags, type; sc = gp->softc; flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } newname = gctl_get_param(req, "newname", NULL); if (newname == NULL) { gctl_error(req, "no new name given"); return; } object = gctl_get_param(req, "object", NULL); if (object == NULL) { gctl_error(req, "no object given"); return; } type = gv_object_type(sc, object); switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, object); if (v == NULL) { gctl_error(req, "unknown volume '%s'", object); return; } name = g_malloc(GV_MAXVOLNAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXVOLNAME); gv_post_event(sc, GV_EVENT_RENAME_VOL, v, name, *flags, 0); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, object); if (p == NULL) { gctl_error(req, "unknown plex '%s'", object); return; } name = g_malloc(GV_MAXPLEXNAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXPLEXNAME); gv_post_event(sc, GV_EVENT_RENAME_PLEX, p, name, *flags, 0); break; case GV_TYPE_SD: s = gv_find_sd(sc, object); if (s == NULL) { gctl_error(req, "unknown subdisk '%s'", object); return; } name = g_malloc(GV_MAXSDNAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXSDNAME); gv_post_event(sc, GV_EVENT_RENAME_SD, s, name, *flags, 0); break; case GV_TYPE_DRIVE: d = gv_find_drive(sc, object); if (d == NULL) { gctl_error(req, "unknown drive '%s'", object); return; } name = g_malloc(GV_MAXDRIVENAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXDRIVENAME); gv_post_event(sc, GV_EVENT_RENAME_DRIVE, d, name, *flags, 0); break; default: gctl_error(req, "unknown object '%s'", object); return; } } int gv_rename_drive(struct gv_softc *sc, struct gv_drive *d, char *newname, int flags) { struct gv_sd *s; KASSERT(d != NULL, ("gv_rename_drive: NULL d")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "drive name '%s' already in use", newname); return (GV_ERR_NAMETAKEN); } strlcpy(d->name, newname, sizeof(d->name)); if (d->hdr != NULL) strlcpy(d->hdr->label.name, newname, sizeof(d->hdr->label.name)); LIST_FOREACH(s, &d->subdisks, from_drive) strlcpy(s->drive, d->name, sizeof(s->drive)); return (0); } int gv_rename_plex(struct gv_softc *sc, struct gv_plex *p, char *newname, int flags) { char newsd[GV_MAXSDNAME]; struct gv_sd *s; char *ptr; int err; KASSERT(p != NULL, ("gv_rename_plex: NULL p")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "plex name '%s' already in use", newname); return (GV_ERR_NAMETAKEN); } /* * Locate the plex number part of the plex names. * XXX: might be a good idea to sanitize input a bit more */ ptr = strrchr(newname, '.'); if (ptr == NULL) { G_VINUM_DEBUG(0, "proposed plex name '%s' is not a valid plex " "name", newname); return (GV_ERR_INVNAME); } strlcpy(p->name, newname, sizeof(p->name)); /* Fix up references and potentially rename subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { strlcpy(s->plex, p->name, sizeof(s->plex)); if (flags & GV_FLAG_R) { /* * Look for the two last dots in the string, and assume * that the old value was ok. 
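 */

/*
 * Illustration: the renaming step below grafts the trailing ".sX"
 * component of the old subdisk name onto the new plex name, e.g.
 * subdisk "vol0.p0.s2" under a plex renamed to "vol0.p1" becomes
 * "vol0.p1.s2".  A standalone restatement (hypothetical helper):
 */
#include <stdio.h>
#include <string.h>

static void
derive_sd_name_sketch(char *out, size_t outlen, const char *plexname,
    const char *old_sd_name)
{
        const char *tail = strrchr(old_sd_name, '.');   /* e.g. ".s2" */

        if (tail != NULL)
                snprintf(out, outlen, "%s%s", plexname, tail);
}

/* As described above: take the tail of the old subdisk name --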
*/ ptr = strrchr(s->name, '.'); if (ptr == NULL) return (GV_ERR_INVNAME); ptr++; snprintf(newsd, sizeof(newsd), "%s.%s", p->name, ptr); err = gv_rename_sd(sc, s, newsd, flags); if (err) return (err); } } return (0); } /* * gv_rename_sd: renames a subdisk. Note that the 'flags' argument is ignored, * since there are no structures below a subdisk. Similarly, we don't have to * clean up any references elsewhere to the subdisk's name. */ int gv_rename_sd(struct gv_softc *sc, struct gv_sd *s, char *newname, int flags) { char *dot1, *dot2; KASSERT(s != NULL, ("gv_rename_sd: NULL s")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "subdisk name %s already in use", newname); return (GV_ERR_NAMETAKEN); } /* Locate the sd number part of the sd names. */ dot1 = strchr(newname, '.'); if (dot1 == NULL || (dot2 = strchr(dot1 + 1, '.')) == NULL) { G_VINUM_DEBUG(0, "proposed sd name '%s' is not a valid sd name", newname); return (GV_ERR_INVNAME); } strlcpy(s->name, newname, sizeof(s->name)); return (0); } int gv_rename_vol(struct gv_softc *sc, struct gv_volume *v, char *newname, int flags) { struct g_provider *pp; struct gv_plex *p; char newplex[GV_MAXPLEXNAME], *ptr; int err; KASSERT(v != NULL, ("gv_rename_vol: NULL v")); pp = v->provider; KASSERT(pp != NULL, ("gv_rename_vol: NULL pp")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "volume name %s already in use", newname); return (GV_ERR_NAMETAKEN); } /* Rename the volume. */ strlcpy(v->name, newname, sizeof(v->name)); /* Fix up references and potentially rename plexes. */ LIST_FOREACH(p, &v->plexes, in_volume) { strlcpy(p->volume, v->name, sizeof(p->volume)); if (flags & GV_FLAG_R) { /* * Look for the last dot in the string, and assume that * the old value was ok. */ ptr = strrchr(p->name, '.'); ptr++; snprintf(newplex, sizeof(newplex), "%s.%s", v->name, ptr); err = gv_rename_plex(sc, p, newplex, flags); if (err) return (err); } } return (0); } Index: head/sys/geom/vinum/geom_vinum_rm.c =================================================================== --- head/sys/geom/vinum/geom_vinum_rm.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_rm.c (revision 326270) @@ -1,387 +1,389 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include /* General 'remove' routine. */ void gv_remove(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; int *argc, *flags; char *argv, buf[20]; int i, type; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); if (argc == NULL || *argc == 0) { gctl_error(req, "no arguments given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } sc = gp->softc; /* XXX config locking */ for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); argv = gctl_get_param(req, buf, NULL); if (argv == NULL) continue; type = gv_object_type(sc, argv); switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, argv); /* * If this volume has plexes, we want a recursive * removal. */ if (!LIST_EMPTY(&v->plexes) && !(*flags & GV_FLAG_R)) { gctl_error(req, "volume '%s' has attached " "plexes - need recursive removal", v->name); return; } gv_post_event(sc, GV_EVENT_RM_VOLUME, v, NULL, 0, 0); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, argv); /* * If this plex has subdisks, we want a recursive * removal. */ if (!LIST_EMPTY(&p->subdisks) && !(*flags & GV_FLAG_R)) { gctl_error(req, "plex '%s' has attached " "subdisks - need recursive removal", p->name); return; } /* Don't allow removal of the only plex of a volume. */ if (p->vol_sc != NULL && p->vol_sc->plexcount == 1) { gctl_error(req, "plex '%s' is still attached " "to volume '%s'", p->name, p->volume); return; } gv_post_event(sc, GV_EVENT_RM_PLEX, p, NULL, 0, 0); break; case GV_TYPE_SD: s = gv_find_sd(sc, argv); /* Don't allow removal if attached to a plex. */ if (s->plex_sc != NULL) { gctl_error(req, "subdisk '%s' is still attached" " to plex '%s'", s->name, s->plex_sc->name); return; } gv_post_event(sc, GV_EVENT_RM_SD, s, NULL, 0, 0); break; case GV_TYPE_DRIVE: d = gv_find_drive(sc, argv); /* We don't allow to remove open drives. */ if (gv_consumer_is_open(d->consumer) && !(*flags & GV_FLAG_F)) { gctl_error(req, "drive '%s' is open", d->name); return; } /* A drive with subdisks needs a recursive removal. */ /* if (!LIST_EMPTY(&d->subdisks) && !(*flags & GV_FLAG_R)) { gctl_error(req, "drive '%s' still has subdisks" " - need recursive removal", d->name); return; }*/ gv_post_event(sc, GV_EVENT_RM_DRIVE, d, NULL, *flags, 0); break; default: gctl_error(req, "unknown object '%s'", argv); return; } } gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } /* Resets configuration */ int gv_resetconfig(struct gv_softc *sc) { struct gv_drive *d, *d2; struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; /* First make sure nothing is open. */ LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) { if (gv_consumer_is_open(d->consumer)) { return (GV_ERR_ISBUSY); } } /* Make sure nothing is going on internally. 
*/ LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) { if (p->flags & (GV_PLEX_REBUILDING | GV_PLEX_GROWING)) return (GV_ERR_ISBUSY); } /* Then if not, we remove everything. */ LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2) gv_rm_sd(sc, s); LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) gv_rm_drive(sc, d, 0); LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) gv_rm_plex(sc, p); LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) gv_rm_vol(sc, v); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return (0); } /* Remove a volume. */ void gv_rm_vol(struct gv_softc *sc, struct gv_volume *v) { struct g_provider *pp; struct gv_plex *p, *p2; KASSERT(v != NULL, ("gv_rm_vol: NULL v")); pp = v->provider; KASSERT(pp != NULL, ("gv_rm_vol: NULL pp")); /* Check if any of our consumers is open. */ if (gv_provider_is_open(pp)) { G_VINUM_DEBUG(0, "unable to remove %s: volume still in use", v->name); return; } /* Remove the plexes our volume has. */ LIST_FOREACH_SAFE(p, &v->plexes, in_volume, p2) gv_rm_plex(sc, p); /* Clean up. */ LIST_REMOVE(v, volume); g_free(v); /* Get rid of the volume's provider. */ if (pp != NULL) { g_topology_lock(); g_wither_provider(pp, ENXIO); g_topology_unlock(); } } /* Remove a plex. */ void gv_rm_plex(struct gv_softc *sc, struct gv_plex *p) { struct gv_volume *v; struct gv_sd *s, *s2; KASSERT(p != NULL, ("gv_rm_plex: NULL p")); v = p->vol_sc; /* Check if any of our consumers is open. */ if (v != NULL && gv_provider_is_open(v->provider) && v->plexcount < 2) { G_VINUM_DEBUG(0, "unable to remove %s: volume still in use", p->name); return; } /* Remove the subdisks our plex has. */ LIST_FOREACH_SAFE(s, &p->subdisks, in_plex, s2) gv_rm_sd(sc, s); v = p->vol_sc; /* Clean up and let our geom fade away. */ LIST_REMOVE(p, plex); if (p->vol_sc != NULL) { p->vol_sc->plexcount--; LIST_REMOVE(p, in_volume); p->vol_sc = NULL; /* Correctly update the volume size. */ gv_update_vol_size(v, gv_vol_size(v)); } g_free(p); } /* Remove a subdisk. */ void gv_rm_sd(struct gv_softc *sc, struct gv_sd *s) { struct gv_plex *p; struct gv_volume *v; KASSERT(s != NULL, ("gv_rm_sd: NULL s")); p = s->plex_sc; v = NULL; /* Clean up. */ if (p != NULL) { LIST_REMOVE(s, in_plex); s->plex_sc = NULL; p->sdcount--; /* Update the plexsize. */ p->size = gv_plex_size(p); v = p->vol_sc; if (v != NULL) { /* Update the size of our plex' volume. */ gv_update_vol_size(v, gv_vol_size(v)); } } if (s->drive_sc && !(s->drive_sc->flags & GV_DRIVE_REFERENCED)) LIST_REMOVE(s, from_drive); LIST_REMOVE(s, sd); gv_free_sd(s); g_free(s); } /* Remove a drive. */ void gv_rm_drive(struct gv_softc *sc, struct gv_drive *d, int flags) { struct g_consumer *cp; struct gv_freelist *fl, *fl2; struct gv_plex *p; struct gv_sd *s, *s2; struct gv_volume *v; struct gv_drive *d2; int err; KASSERT(d != NULL, ("gv_rm_drive: NULL d")); cp = d->consumer; if (cp != NULL) { g_topology_lock(); err = g_access(cp, 0, 1, 0); g_topology_unlock(); if (err) { G_VINUM_DEBUG(0, "%s: unable to access '%s', " "errno: %d", __func__, cp->provider->name, err); return; } /* Clear the Vinum Magic. */ d->hdr->magic = GV_NOMAGIC; err = gv_write_header(cp, d->hdr); if (err) G_VINUM_DEBUG(0, "gv_rm_drive: error writing header to" " '%s', errno: %d", cp->provider->name, err); g_topology_lock(); g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); } /* Remove all associated subdisks, plexes, volumes. 
*/ if (flags & GV_FLAG_R) { if (!LIST_EMPTY(&d->subdisks)) { LIST_FOREACH_SAFE(s, &d->subdisks, from_drive, s2) { p = s->plex_sc; if (p != NULL) { v = p->vol_sc; if (v != NULL) gv_rm_vol(sc, v); } } } } /* Clean up. */ LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { LIST_REMOVE(fl, freelist); g_free(fl); } LIST_REMOVE(d, drive); g_free(d->hdr); /* Put ourself into referenced state if we have subdisks. */ if (d->sdcount > 0) { d->consumer = NULL; d->hdr = NULL; d->flags |= GV_DRIVE_REFERENCED; snprintf(d->device, sizeof(d->device), "???"); d->size = 0; d->avail = 0; d->freelist_entries = 0; LIST_FOREACH(s, &d->subdisks, from_drive) { s->flags |= GV_SD_TASTED; gv_set_sd_state(s, GV_SD_DOWN, GV_SETSTATE_FORCE); } /* Shuffle around so we keep gv_is_newer happy. */ LIST_REMOVE(d, drive); d2 = LIST_FIRST(&sc->drives); if (d2 == NULL) LIST_INSERT_HEAD(&sc->drives, d, drive); else LIST_INSERT_AFTER(d2, d, drive); return; } g_free(d); gv_save_config(sc); } Index: head/sys/geom/vinum/geom_vinum_share.h =================================================================== --- head/sys/geom/vinum/geom_vinum_share.h (revision 326269) +++ head/sys/geom/vinum/geom_vinum_share.h (revision 326270) @@ -1,67 +1,69 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _GEOM_VINUM_SHARE_H_ #define _GEOM_VINUM_SHARE_H_ /* Maximum number of arguments for a single command. 
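 */

/*
 * Illustration: how the tokenizer declared below is typically driven.
 * This sketch assumes gv_tokenize() fills the array and returns the
 * number of tokens parsed, as its use together with GV_MAXARGS
 * (defined just below) suggests; the function and variable names here
 * are hypothetical.
 */
static void
parse_line_sketch(char *line)
{
        char *token[GV_MAXARGS];
        int i, ntokens;

        ntokens = gv_tokenize(line, token, GV_MAXARGS);
        for (i = 0; i < ntokens; i++) {
                /* Hand token[i] to gv_new_sd(), gv_new_plex(), ... */
        }
}

/* The argument cap referenced above --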
*/ #define GV_MAXARGS 64 enum { KILOBYTE = 1024, MEGABYTE = 1048576, GIGABYTE = 1073741824 }; off_t gv_sizespec(char *); int gv_tokenize(char *, char **, int); struct gv_sd *gv_alloc_sd(void); struct gv_volume *gv_alloc_volume(void); struct gv_plex *gv_alloc_plex(void); struct gv_drive *gv_alloc_drive(void); struct gv_drive *gv_new_drive(int, char **); struct gv_plex *gv_new_plex(int, char **); struct gv_sd *gv_new_sd(int, char **); struct gv_volume *gv_new_volume(int, char **); int gv_drivestatei(char *); int gv_plexorgi(char *); int gv_plexstatei(char *); int gv_sdstatei(char *); int gv_volstatei(char *); const char *gv_drivestate(int); const char *gv_plexorg(int); const char *gv_plexorg_short(int); const char *gv_plexstate(int); const char *gv_sdstate(int); const char *gv_volstate(int); const char *gv_roughlength(off_t, int); #endif /* _GEOM_VINUM_SHARE_H_ */ Index: head/sys/geom/vinum/geom_vinum_state.c =================================================================== --- head/sys/geom/vinum/geom_vinum_state.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_state.c (revision 326270) @@ -1,534 +1,536 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include void gv_setstate(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_sd *s; struct gv_drive *d; struct gv_volume *v; struct gv_plex *p; char *obj, *state; int f, *flags, type; f = 0; obj = gctl_get_param(req, "object", NULL); if (obj == NULL) { gctl_error(req, "no object given"); return; } state = gctl_get_param(req, "state", NULL); if (state == NULL) { gctl_error(req, "no state given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } if (*flags & GV_FLAG_F) f = GV_SETSTATE_FORCE; sc = gp->softc; type = gv_object_type(sc, obj); switch (type) { case GV_TYPE_VOL: if (gv_volstatei(state) < 0) { gctl_error(req, "invalid volume state '%s'", state); break; } v = gv_find_vol(sc, obj); gv_post_event(sc, GV_EVENT_SET_VOL_STATE, v, NULL, gv_volstatei(state), f); break; case GV_TYPE_PLEX: if (gv_plexstatei(state) < 0) { gctl_error(req, "invalid plex state '%s'", state); break; } p = gv_find_plex(sc, obj); gv_post_event(sc, GV_EVENT_SET_PLEX_STATE, p, NULL, gv_plexstatei(state), f); break; case GV_TYPE_SD: if (gv_sdstatei(state) < 0) { gctl_error(req, "invalid subdisk state '%s'", state); break; } s = gv_find_sd(sc, obj); gv_post_event(sc, GV_EVENT_SET_SD_STATE, s, NULL, gv_sdstatei(state), f); break; case GV_TYPE_DRIVE: if (gv_drivestatei(state) < 0) { gctl_error(req, "invalid drive state '%s'", state); break; } d = gv_find_drive(sc, obj); gv_post_event(sc, GV_EVENT_SET_DRIVE_STATE, d, NULL, gv_drivestatei(state), f); break; default: gctl_error(req, "unknown object '%s'", obj); break; } } /* Update drive state; return 0 if the state changes, otherwise error. */ int gv_set_drive_state(struct gv_drive *d, int newstate, int flags) { struct gv_sd *s; int oldstate; KASSERT(d != NULL, ("gv_set_drive_state: NULL d")); oldstate = d->state; if (newstate == oldstate) return (0); /* We allow to take down an open drive only with force. */ if ((newstate == GV_DRIVE_DOWN) && gv_consumer_is_open(d->consumer) && (!(flags & GV_SETSTATE_FORCE))) return (GV_ERR_ISBUSY); d->state = newstate; if (d->state != oldstate) { LIST_FOREACH(s, &d->subdisks, from_drive) gv_update_sd_state(s); } /* Save the config back to disk. */ if (flags & GV_SETSTATE_CONFIG) gv_save_config(d->vinumconf); return (0); } int gv_set_sd_state(struct gv_sd *s, int newstate, int flags) { struct gv_drive *d; struct gv_plex *p; int oldstate, status; KASSERT(s != NULL, ("gv_set_sd_state: NULL s")); oldstate = s->state; /* We are optimistic and assume it will work. */ status = 0; if (newstate == oldstate) return (0); switch (newstate) { case GV_SD_DOWN: /* * If we're attached to a plex, we won't go down without use of * force. */ if ((s->plex_sc != NULL) && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_ISATTACHED); break; case GV_SD_REVIVING: case GV_SD_INITIALIZING: /* * Only do this if we're forced, since it usually is done * internally, and then we do use the force flag. */ if (!(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); break; case GV_SD_UP: /* We can't bring the subdisk up if our drive is dead. */ d = s->drive_sc; if ((d == NULL) || (d->state != GV_DRIVE_UP)) return (GV_ERR_SETSTATE); /* Check from where we want to be brought up. */ switch (s->state) { case GV_SD_REVIVING: case GV_SD_INITIALIZING: /* * The subdisk was initializing. We allow it to be * brought up. */ break; case GV_SD_DOWN: /* * The subdisk is currently down. 
We allow it to be * brought up if it is not attached to a plex. */ p = s->plex_sc; if (p == NULL) break; /* * If this subdisk is attached to a plex, we allow it * to be brought up if the plex if it's not a RAID5 * plex, otherwise it's made 'stale'. */ if (p->org != GV_PLEX_RAID5) break; else if (s->flags & GV_SD_CANGOUP) { s->flags &= ~GV_SD_CANGOUP; break; } else if (flags & GV_SETSTATE_FORCE) break; else s->state = GV_SD_STALE; status = GV_ERR_SETSTATE; break; case GV_SD_STALE: /* * A stale subdisk can be brought up only if it's part * of a concat or striped plex that's the only one in a * volume, or if the subdisk isn't attached to a plex. * Otherwise it needs to be revived or initialized * first. */ p = s->plex_sc; if (p == NULL || flags & GV_SETSTATE_FORCE) break; if ((p->org != GV_PLEX_RAID5 && p->vol_sc->plexcount == 1) || (p->flags & GV_PLEX_SYNCING && p->synced > 0 && p->org == GV_PLEX_RAID5)) break; else return (GV_ERR_SETSTATE); default: return (GV_ERR_INVSTATE); } break; /* Other state transitions are only possible with force. */ default: if (!(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); } /* We can change the state and do it. */ if (status == 0) s->state = newstate; /* Update our plex, if we're attached to one. */ if (s->plex_sc != NULL) gv_update_plex_state(s->plex_sc); /* Save the config back to disk. */ if (flags & GV_SETSTATE_CONFIG) gv_save_config(s->vinumconf); return (status); } int gv_set_plex_state(struct gv_plex *p, int newstate, int flags) { struct gv_volume *v; int oldstate, plexdown; KASSERT(p != NULL, ("gv_set_plex_state: NULL p")); oldstate = p->state; v = p->vol_sc; plexdown = 0; if (newstate == oldstate) return (0); switch (newstate) { case GV_PLEX_UP: /* Let update_plex handle if the plex can come up */ gv_update_plex_state(p); if (p->state != GV_PLEX_UP && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); p->state = newstate; break; case GV_PLEX_DOWN: /* * Set state to GV_PLEX_DOWN only if no-one is using the plex, * or if the state is forced. */ if (v != NULL) { /* If the only one up, force is needed. */ plexdown = gv_plexdown(v); if ((v->plexcount == 1 || (v->plexcount - plexdown == 1)) && ((flags & GV_SETSTATE_FORCE) == 0)) return (GV_ERR_SETSTATE); } p->state = newstate; break; case GV_PLEX_DEGRADED: /* Only used internally, so we have to be forced. */ if (flags & GV_SETSTATE_FORCE) p->state = newstate; break; } /* Update our volume if we have one. */ if (v != NULL) gv_update_vol_state(v); /* Save config. */ if (flags & GV_SETSTATE_CONFIG) gv_save_config(p->vinumconf); return (0); } int gv_set_vol_state(struct gv_volume *v, int newstate, int flags) { int oldstate; KASSERT(v != NULL, ("gv_set_vol_state: NULL v")); oldstate = v->state; if (newstate == oldstate) return (0); switch (newstate) { case GV_VOL_UP: /* Let update handle if the volume can come up. */ gv_update_vol_state(v); if (v->state != GV_VOL_UP && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_SETSTATE); v->state = newstate; break; case GV_VOL_DOWN: /* * Set state to GV_VOL_DOWN only if no-one is using the volume, * or if the state should be forced. */ if (!gv_provider_is_open(v->provider) && !(flags & GV_SETSTATE_FORCE)) return (GV_ERR_ISBUSY); v->state = newstate; break; } /* Save config */ if (flags & GV_SETSTATE_CONFIG) gv_save_config(v->vinumconf); return (0); } /* Update the state of a subdisk based on its environment. 
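 */

/*
 * Illustration: the decision tree of the function below as a pure
 * function (hypothetical helper; the flag bookkeeping -- clearing
 * GV_SD_NEWBORN and GV_SD_CANGOUP -- is left out).
 */
static int
sd_next_state_sketch(int drive_up, int newborn, int cur_state, int cangoup)
{
        if (!drive_up)
                return (GV_SD_DOWN);    /* a dead drive takes the sd down */
        if (newborn)
                return (GV_SD_UP);      /* just-created sds are trusted */
        if (cur_state != GV_SD_UP)
                return (cangoup ? GV_SD_UP : GV_SD_STALE);
        return (GV_SD_UP);
}

/* The real implementation: derive the subdisk state from its environment --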
*/ void gv_update_sd_state(struct gv_sd *s) { struct gv_drive *d; int oldstate; KASSERT(s != NULL, ("gv_update_sd_state: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_update_sd_state: NULL d")); oldstate = s->state; /* If our drive isn't up we cannot be up either. */ if (d->state != GV_DRIVE_UP) { s->state = GV_SD_DOWN; /* If this subdisk was just created, we assume it is good.*/ } else if (s->flags & GV_SD_NEWBORN) { s->state = GV_SD_UP; s->flags &= ~GV_SD_NEWBORN; } else if (s->state != GV_SD_UP) { if (s->flags & GV_SD_CANGOUP) { s->state = GV_SD_UP; s->flags &= ~GV_SD_CANGOUP; } else s->state = GV_SD_STALE; } else s->state = GV_SD_UP; if (s->state != oldstate) G_VINUM_DEBUG(1, "subdisk %s state change: %s -> %s", s->name, gv_sdstate(oldstate), gv_sdstate(s->state)); /* Update the plex, if we have one. */ if (s->plex_sc != NULL) gv_update_plex_state(s->plex_sc); } /* Update the state of a plex based on its environment. */ void gv_update_plex_state(struct gv_plex *p) { struct gv_sd *s; int sdstates; int oldstate; KASSERT(p != NULL, ("gv_update_plex_state: NULL p")); oldstate = p->state; /* First, check the state of our subdisks. */ sdstates = gv_sdstatemap(p); /* If all subdisks are up, our plex can be up, too. */ if (sdstates == GV_SD_UPSTATE) p->state = GV_PLEX_UP; /* One or more of our subdisks are down. */ else if (sdstates & GV_SD_DOWNSTATE) { /* A RAID5 plex can handle one dead subdisk. */ if ((p->org == GV_PLEX_RAID5) && (p->sddown == 1)) p->state = GV_PLEX_DEGRADED; else p->state = GV_PLEX_DOWN; /* Some of our subdisks are initializing. */ } else if (sdstates & GV_SD_INITSTATE) { if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING) p->state = GV_PLEX_DEGRADED; else p->state = GV_PLEX_DOWN; } else p->state = GV_PLEX_DOWN; if (p->state == GV_PLEX_UP) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { p->state = GV_PLEX_GROWABLE; break; } } } if (p->state != oldstate) G_VINUM_DEBUG(1, "plex %s state change: %s -> %s", p->name, gv_plexstate(oldstate), gv_plexstate(p->state)); /* Update our volume, if we have one. */ if (p->vol_sc != NULL) gv_update_vol_state(p->vol_sc); } /* Update the volume state based on its plexes. */ void gv_update_vol_state(struct gv_volume *v) { struct gv_plex *p; KASSERT(v != NULL, ("gv_update_vol_state: NULL v")); /* The volume can't be up without plexes. */ if (v->plexcount == 0) { v->state = GV_VOL_DOWN; return; } LIST_FOREACH(p, &v->plexes, in_volume) { /* One of our plexes is accessible, and so are we. */ if (p->state > GV_PLEX_DEGRADED) { v->state = GV_VOL_UP; return; /* We can handle a RAID5 plex with one dead subdisk as well. */ } else if ((p->org == GV_PLEX_RAID5) && (p->state == GV_PLEX_DEGRADED)) { v->state = GV_VOL_UP; return; } } /* Not one of our plexes is up, so we can't be either. */ v->state = GV_VOL_DOWN; } /* Return a state map for the subdisks of a plex. */ int gv_sdstatemap(struct gv_plex *p) { struct gv_sd *s; int statemap; KASSERT(p != NULL, ("gv_sdstatemap: NULL p")); statemap = 0; p->sddown = 0; /* No subdisks down yet. */ LIST_FOREACH(s, &p->subdisks, in_plex) { switch (s->state) { case GV_SD_DOWN: case GV_SD_STALE: statemap |= GV_SD_DOWNSTATE; p->sddown++; /* Another unusable subdisk. */ break; case GV_SD_UP: statemap |= GV_SD_UPSTATE; break; case GV_SD_INITIALIZING: statemap |= GV_SD_INITSTATE; break; case GV_SD_REVIVING: statemap |= GV_SD_INITSTATE; p->sddown++; /* XXX: Another unusable subdisk? 
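 */

/*
 * Illustration: the map is built by OR-ing one bit per subdisk state,
 * so the "all subdisks up" test in gv_update_plex_state() above is an
 * exact equality, not a bit test (hypothetical helper):
 */
static int
plex_all_sds_up_sketch(int statemap)
{
        /* Any DOWN or INIT bit mixed in breaks the equality. */
        return (statemap == GV_SD_UPSTATE);
}

/* Back in gv_sdstatemap: a reviving subdisk still counts toward sddown --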
*/ break; } } return (statemap); } Index: head/sys/geom/vinum/geom_vinum_volume.c =================================================================== --- head/sys/geom/vinum/geom_vinum_volume.c (revision 326269) +++ head/sys/geom/vinum/geom_vinum_volume.c (revision 326270) @@ -1,164 +1,166 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include void gv_volume_flush(struct gv_volume *v) { struct gv_softc *sc; struct bio *bp; KASSERT(v != NULL, ("NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("NULL sc")); bp = bioq_takefirst(v->wqueue); while (bp != NULL) { gv_volume_start(sc, bp); bp = bioq_takefirst(v->wqueue); } } void gv_volume_start(struct gv_softc *sc, struct bio *bp) { struct g_geom *gp; struct gv_volume *v; struct gv_plex *p, *lp; int numwrites; gp = sc->geom; v = bp->bio_to->private; if (v == NULL || v->state != GV_VOL_UP) { g_io_deliver(bp, ENXIO); return; } switch (bp->bio_cmd) { case BIO_READ: /* * Try to find a good plex where we can send the request to, * round-robin-style. The plex either has to be up, or it's a * degraded RAID5 plex. Check if we have delayed requests. Put * this request on the delayed queue if so. This makes sure that * we don't read old values. */ if (bioq_first(v->wqueue) != NULL) { bioq_insert_tail(v->wqueue, bp); break; } lp = v->last_read_plex; if (lp == NULL) lp = LIST_FIRST(&v->plexes); p = LIST_NEXT(lp, in_volume); if (p == NULL) p = LIST_FIRST(&v->plexes); do { if (p == NULL) { p = lp; break; } if ((p->state > GV_PLEX_DEGRADED) || (p->state >= GV_PLEX_DEGRADED && p->org == GV_PLEX_RAID5)) break; p = LIST_NEXT(p, in_volume); if (p == NULL) p = LIST_FIRST(&v->plexes); } while (p != lp); if ((p == NULL) || (p->org == GV_PLEX_RAID5 && p->state < GV_PLEX_DEGRADED) || (p->org != GV_PLEX_RAID5 && p->state <= GV_PLEX_DEGRADED)) { g_io_deliver(bp, ENXIO); return; } v->last_read_plex = p; /* Hand it down to the plex logic. */ gv_plex_start(p, bp); break; case BIO_WRITE: case BIO_DELETE: /* Delay write-requests if any plex is synchronizing. 
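 */

/*
 * Illustration: the read path above walks the plex list round-robin,
 * starting after the plex it used last.  Stripped of the state checks,
 * the traversal pattern is (hypothetical helper):
 */
static struct gv_plex *
next_plex_sketch(struct gv_volume *v, struct gv_plex *last)
{
        struct gv_plex *p;

        p = (last != NULL) ? LIST_NEXT(last, in_volume) : NULL;
        if (p == NULL)
                p = LIST_FIRST(&v->plexes);     /* wrap around */
        return (p);
}

/* Resuming the write path: delay writes while any plex is syncing --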
*/ LIST_FOREACH(p, &v->plexes, in_volume) { if (p->flags & GV_PLEX_SYNCING) { bioq_insert_tail(v->wqueue, bp); return; } } numwrites = 0; /* Give the BIO to each plex of this volume. */ LIST_FOREACH(p, &v->plexes, in_volume) { if (p->state < GV_PLEX_DEGRADED) continue; gv_plex_start(p, bp); numwrites++; } if (numwrites == 0) g_io_deliver(bp, ENXIO); break; } } void gv_bio_done(struct gv_softc *sc, struct bio *bp) { struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; s = bp->bio_caller1; KASSERT(s != NULL, ("gv_bio_done: NULL s")); p = s->plex_sc; KASSERT(p != NULL, ("gv_bio_done: NULL p")); v = p->vol_sc; KASSERT(v != NULL, ("gv_bio_done: NULL v")); switch (p->org) { case GV_PLEX_CONCAT: case GV_PLEX_STRIPED: gv_plex_normal_done(p, bp); break; case GV_PLEX_RAID5: gv_plex_raid5_done(p, bp); break; } } Index: head/sys/geom/virstor/binstream.c =================================================================== --- head/sys/geom/virstor/binstream.c (revision 326269) +++ head/sys/geom/virstor/binstream.c (revision 326270) @@ -1,185 +1,187 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ // $Id: binstream.c,v 1.1 2006/07/05 10:47:54 ivoras Exp $ #include __FBSDID("$FreeBSD$"); #include #include #include /* "Open" a binary stream for reading */ void bs_open(bin_stream_t * bs, void *data) { bs->data = (char *)data; bs->pos = 0; } /* "Reset" position in binary stream to zero */ void bs_reset(bin_stream_t * bs) { bs->pos = 0; } /* Write a zero-terminated string; return next position */ unsigned bs_write_str(bin_stream_t * bs, char *data) { int len = 0; do { *(bs->data + bs->pos + len) = *data; len++; } while (*(data++) != '\0'); bs->pos += len; return bs->pos; } /* Write an arbitrary buffer; return next position */ unsigned bs_write_buf(bin_stream_t * bs, char *data, unsigned data_size) { unsigned i; for (i = 0; i < data_size; i++) *(bs->data + bs->pos + i) = *(data + i); bs->pos += data_size; return bs->pos; } /* Write a 8bit uint; return next position. */ unsigned bs_write_u8(bin_stream_t * bs, uint8_t data) { *((uint8_t *) (bs->data + bs->pos)) = data; return ++(bs->pos); } /* Write a 16bit uint; return next position. 
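 */

/*
 * Illustration: a typical round-trip through this small stream API
 * (buffer size and values are arbitrary; every call is a function
 * defined in this file):
 */
static void
binstream_example(void)
{
        char buf[16];
        bin_stream_t bs;

        bs_open(&bs, buf);
        bs_write_u8(&bs, 0x01);
        bs_write_u32(&bs, 0xdeadbeefU);         /* stored little-endian */

        bs_reset(&bs);                          /* rewind for reading */
        (void)bs_read_u8(&bs);
        (void)bs_read_u32(&bs);
}

/* The 16-bit writer, little-endian like its siblings --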
*/ unsigned bs_write_u16(bin_stream_t * bs, uint16_t data) { le16enc(bs->data + bs->pos, data); return (bs->pos += 2); } /* Write a 32bit uint; return next position. */ unsigned bs_write_u32(bin_stream_t * bs, uint32_t data) { le32enc(bs->data + bs->pos, data); return (bs->pos += 4); } /* Write a 64bit uint; return next position. */ unsigned bs_write_u64(bin_stream_t * bs, uint64_t data) { le64enc(bs->data + bs->pos, data); return (bs->pos += 8); } /* Read a 8bit uint & return it */ uint8_t bs_read_u8(bin_stream_t * bs) { uint8_t data = *((uint8_t *) (bs->data + bs->pos)); bs->pos++; return data; } /* * Read a null-terminated string from stream into a buffer; buf_size is size * of the buffer, including the final \0. Returns buf pointer or NULL if * garbage input. */ char* bs_read_str(bin_stream_t * bs, char *buf, unsigned buf_size) { unsigned len = 0; char *work_buf = buf; if (buf == NULL || buf_size < 1) return NULL; do { *work_buf = *(bs->data + bs->pos + len); } while (len++ < buf_size - 1 && *(work_buf++) != '\0'); *(buf + buf_size - 1) = '\0'; bs->pos += len; return buf; } /* Read an arbitrary buffer. */ void bs_read_buf(bin_stream_t * bs, char *buf, unsigned buf_size) { unsigned i; for (i = 0; i < buf_size; i++) *(buf + i) = *(bs->data + bs->pos + i); bs->pos += buf_size; } /* Read a 16bit uint & return it */ uint16_t bs_read_u16(bin_stream_t * bs) { uint16_t data = le16dec(bs->data + bs->pos); bs->pos += 2; return data; } /* Read a 32bit uint & return it */ uint32_t bs_read_u32(bin_stream_t * bs) { uint32_t data = le32dec(bs->data + bs->pos); bs->pos += 4; return data; } /* Read a 64bit uint & return it */ uint64_t bs_read_u64(bin_stream_t * bs) { uint64_t data = le64dec(bs->data + bs->pos); bs->pos += 8; return data; } Index: head/sys/geom/virstor/binstream.h =================================================================== --- head/sys/geom/virstor/binstream.h (revision 326269) +++ head/sys/geom/virstor/binstream.h (revision 326270) @@ -1,93 +1,95 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ // $Id: binstream.h,v 1.1 2006/07/05 10:47:54 ivoras Exp $ #ifndef _BIN_STREAM_ #define _BIN_STREAM_ #ifndef uint8_t #define uint8_t unsigned char #endif typedef struct { unsigned char *data; int pos; } bin_stream_t; /* "Open" a binary stream for reading */ void bs_open (bin_stream_t * bs, void *data); /* "Reset" position in binary stream to zero */ void bs_reset (bin_stream_t * bs); /* Write a zero-terminated string; return next position */ unsigned bs_write_str(bin_stream_t * bs, char *data); /* Write an arbitrary buffer; return next position */ unsigned bs_write_buf(bin_stream_t * bs, char *data, unsigned data_size); /* Write a 8bit uint; return next position. */ unsigned bs_write_u8(bin_stream_t * bs, uint8_t data); /* Write a 16bit uint; return next position. */ unsigned bs_write_u16(bin_stream_t * bs, uint16_t data); /* Write a 32bit uint; return next position. */ unsigned bs_write_u32(bin_stream_t * bs, uint32_t data); /* Write a 64bit uint; return next position. */ unsigned bs_write_u64(bin_stream_t * bs, uint64_t data); /* * Read a null-terminated string from stream into a buffer; buf_size is size * of the buffer, including the final \0. Returns buf pointer or NULL if * garbage input. */ char *bs_read_str(bin_stream_t * bs, char *buf, unsigned buf_size); /* Read an arbitrary buffer. */ void bs_read_buf(bin_stream_t * bs, char *buf, unsigned buf_size); /* Read a 8bit uint & return it */ uint8_t bs_read_u8(bin_stream_t * bs); /* Read a 16bit uint & return it */ uint16_t bs_read_u16(bin_stream_t * bs); /* Read a 32bit uint & return it */ uint32_t bs_read_u32(bin_stream_t * bs); /* Read a 64bit uint & return it */ uint64_t bs_read_u64(bin_stream_t * bs); #endif Index: head/sys/geom/virstor/g_virstor.c =================================================================== --- head/sys/geom/virstor/g_virstor.c (revision 326269) +++ head/sys/geom/virstor/g_virstor.c (revision 326270) @@ -1,1890 +1,1892 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006-2007 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Implementation notes: * - "Components" are wrappers around providers that make up the * virtual storage (i.e.
a virstor has "physical" components) */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(g_virstor, "GEOM virtual storage support"); /* Declare malloc(9) label */ static MALLOC_DEFINE(M_GVIRSTOR, "gvirstor", "GEOM_VIRSTOR Data"); /* GEOM class methods */ static g_init_t g_virstor_init; static g_fini_t g_virstor_fini; static g_taste_t g_virstor_taste; static g_ctl_req_t g_virstor_config; static g_ctl_destroy_geom_t g_virstor_destroy_geom; /* Declare & initialize class structure ("geom class") */ struct g_class g_virstor_class = { .name = G_VIRSTOR_CLASS_NAME, .version = G_VERSION, .init = g_virstor_init, .fini = g_virstor_fini, .taste = g_virstor_taste, .ctlreq = g_virstor_config, .destroy_geom = g_virstor_destroy_geom /* The .dumpconf and the rest are only usable for a geom instance, so * they will be set when such instance is created. */ }; /* Declare sysctl's and loader tunables */ SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, virstor, CTLFLAG_RW, 0, "GEOM_GVIRSTOR information"); static u_int g_virstor_debug = 2; /* XXX: lower to 2 when released to public */ SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, debug, CTLFLAG_RWTUN, &g_virstor_debug, 0, "Debug level (2=production, 5=normal, 15=excessive)"); static u_int g_virstor_chunk_watermark = 100; SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, chunk_watermark, CTLFLAG_RWTUN, &g_virstor_chunk_watermark, 0, "Minimum number of free chunks before issuing administrative warning"); static u_int g_virstor_component_watermark = 1; SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, component_watermark, CTLFLAG_RWTUN, &g_virstor_component_watermark, 0, "Minimum number of free components before issuing administrative warning"); static int read_metadata(struct g_consumer *, struct g_virstor_metadata *); static void write_metadata(struct g_consumer *, struct g_virstor_metadata *); static int clear_metadata(struct g_virstor_component *); static int add_provider_to_geom(struct g_virstor_softc *, struct g_provider *, struct g_virstor_metadata *); static struct g_geom *create_virstor_geom(struct g_class *, struct g_virstor_metadata *); static void virstor_check_and_run(struct g_virstor_softc *); static u_int virstor_valid_components(struct g_virstor_softc *); static int virstor_geom_destroy(struct g_virstor_softc *, boolean_t, boolean_t); static void remove_component(struct g_virstor_softc *, struct g_virstor_component *, boolean_t); static void bioq_dismantle(struct bio_queue_head *); static int allocate_chunk(struct g_virstor_softc *, struct g_virstor_component **, u_int *, u_int *); static void delay_destroy_consumer(void *, int); static void dump_component(struct g_virstor_component *comp); #if 0 static void dump_me(struct virstor_map_entry *me, unsigned int nr); #endif static void virstor_ctl_stop(struct gctl_req *, struct g_class *); static void virstor_ctl_add(struct gctl_req *, struct g_class *); static void virstor_ctl_remove(struct gctl_req *, struct g_class *); static struct g_virstor_softc * virstor_find_geom(const struct g_class *, const char *); static void update_metadata(struct g_virstor_softc *); static void fill_metadata(struct g_virstor_softc *, struct g_virstor_metadata *, u_int, u_int); static void g_virstor_orphan(struct g_consumer *); static int g_virstor_access(struct g_provider *, int, int, int); static void g_virstor_start(struct bio *); static void 
g_virstor_dumpconf(struct sbuf *, const char *, struct g_geom *, struct g_consumer *, struct g_provider *); static void g_virstor_done(struct bio *); static void invalid_call(void); /* * Initialise GEOM class (per-class callback) */ static void g_virstor_init(struct g_class *mp __unused) { /* Catch map struct size mismatch at compile time; Map entries must * fit into MAXPHYS exactly, with no wasted space. */ CTASSERT(VIRSTOR_MAP_BLOCK_ENTRIES*VIRSTOR_MAP_ENTRY_SIZE == MAXPHYS); /* Init UMA zones, TAILQ's, other global vars */ } /* * Finalise GEOM class (per-class callback) */ static void g_virstor_fini(struct g_class *mp __unused) { /* Deinit UMA zones & global vars */ } /* * Config (per-class callback) */ static void g_virstor_config(struct gctl_req *req, struct g_class *cp, char const *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "Failed to get 'version' argument"); return; } if (*version != G_VIRSTOR_VERSION) { gctl_error(req, "Userland and kernel versions out of sync"); return; } g_topology_unlock(); if (strcmp(verb, "add") == 0) virstor_ctl_add(req, cp); else if (strcmp(verb, "stop") == 0 || strcmp(verb, "destroy") == 0) virstor_ctl_stop(req, cp); else if (strcmp(verb, "remove") == 0) virstor_ctl_remove(req, cp); else gctl_error(req, "unknown verb: '%s'", verb); g_topology_lock(); } /* * "stop" verb from userland */ static void virstor_ctl_stop(struct gctl_req *req, struct g_class *cp) { int *force, *nargs; int i; nargs = gctl_get_paraml(req, "nargs", sizeof *nargs); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Invalid number of arguments"); return; } force = gctl_get_paraml(req, "force", sizeof *force); if (force == NULL) { gctl_error(req, "Error fetching argument '%s'", "force"); return; } g_topology_lock(); for (i = 0; i < *nargs; i++) { char param[8]; const char *name; struct g_virstor_softc *sc; int error; sprintf(param, "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); g_topology_unlock(); return; } sc = virstor_find_geom(cp, name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", name); g_topology_unlock(); return; } LOG_MSG(LVL_INFO, "Stopping %s by the userland command", sc->geom->name); update_metadata(sc); if ((error = virstor_geom_destroy(sc, TRUE, TRUE)) != 0) { LOG_MSG(LVL_ERROR, "Cannot destroy %s: %d", sc->geom->name, error); } } g_topology_unlock(); } /* * "add" verb from userland - add new component(s) to the structure. * This will be done all at once in here, without going through the * .taste function for new components. */ static void virstor_ctl_add(struct gctl_req *req, struct g_class *cp) { /* Note: while this is going on, I/O is being done on * the g_up and g_down threads. The idea is to make changes * to softc members in a way that can atomically activate * them all at once. 
*/ struct g_virstor_softc *sc; int *hardcode, *nargs; const char *geom_name; /* geom to add a component to */ struct g_consumer *fcp; struct g_virstor_bio_q *bq; u_int added; int error; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "Error fetching argument '%s'", "hardcode"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot add components to incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } fcp = sc->components[0].gcons; added = 0; g_topology_lock(); for (i = 1; i < *nargs; i++) { struct g_virstor_metadata md; char aname[8]; const char *prov_name; struct g_provider *pp; struct g_consumer *cp; u_int nc; u_int j; snprintf(aname, sizeof aname, "arg%d", i); prov_name = gctl_get_asciiparam(req, aname); if (prov_name == NULL) { gctl_error(req, "Error fetching argument '%s'", aname); g_topology_unlock(); return; } if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) prov_name += sizeof(_PATH_DEV) - 1; pp = g_provider_by_name(prov_name); if (pp == NULL) { /* This is the most common error so be verbose about it */ if (added != 0) { gctl_error(req, "Invalid provider: '%s' (added" " %u components)", prov_name, added); update_metadata(sc); } else { gctl_error(req, "Invalid provider: '%s'", prov_name); } g_topology_unlock(); return; } cp = g_new_consumer(sc->geom); if (cp == NULL) { gctl_error(req, "Cannot create consumer"); g_topology_unlock(); return; } error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach a consumer to %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } if (fcp->acr != 0 || fcp->acw != 0 || fcp->ace != 0) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { gctl_error(req, "Access request failed for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } if (fcp->provider->sectorsize != pp->sectorsize) { gctl_error(req, "Sector size doesn't fit for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, pp->name) == 0) { gctl_error(req, "Component %s already in %s", pp->name, sc->geom->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } sc->components = realloc(sc->components, sizeof(*sc->components) * (sc->n_components + 1), M_GVIRSTOR, M_WAITOK); nc = sc->n_components; sc->components[nc].gcons = cp; sc->components[nc].sc = sc; sc->components[nc].index = nc; sc->components[nc].chunk_count = cp->provider->mediasize / sc->chunk_size; sc->components[nc].chunk_next = 0; sc->components[nc].chunk_reserved = 0; if (sc->components[nc].chunk_count < 4) { gctl_error(req, "Provider too small: %s", cp->provider->name); g_destroy_consumer(cp); g_topology_unlock(); return; } fill_metadata(sc, &md, nc, *hardcode); write_metadata(cp, &md); /* The new component becomes visible when n_components is * incremented */ 
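/*
 * The publish step below, restated as a minimal sketch (names are
 * illustrative, not from this file): initialize every field of the new
 * array slot first and bump the element count last, so a reader that
 * sees the new count never sees a half-initialized entry:
 *
 *	struct item *slot = &array[n_items];
 *	slot->a = ...;			// fill in all fields first
 *	slot->b = ...;
 *	n_items++;			// publish; readers loop i < n_items
 *
 * This only works because readers here tolerate a momentarily stale
 * count and never dereference entries at indices >= n_items.
 */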
sc->n_components++; added++; } /* This call to update_metadata() is critical. In case there's a * power failure in the middle of it and some components are updated * while others are not, there will be trouble on next .taste() iff * a non-updated component is detected first */ update_metadata(sc); g_topology_unlock(); LOG_MSG(LVL_INFO, "Added %d component(s) to %s", added, sc->geom->name); /* Fire off BIOs previously queued because there wasn't any * physical space left. If the BIOs still can't be satisfied * they will again be added to the end of the queue (during * which the mutex will be recursed) */ bq = malloc(sizeof(*bq), M_GVIRSTOR, M_WAITOK); bq->bio = NULL; mtx_lock(&sc->delayed_bio_q_mtx); /* First, insert a sentinel to the queue end, so we don't * end up in an infinite loop if there's still no free * space available. */ STAILQ_INSERT_TAIL(&sc->delayed_bio_q, bq, linkage); while (!STAILQ_EMPTY(&sc->delayed_bio_q)) { bq = STAILQ_FIRST(&sc->delayed_bio_q); if (bq->bio != NULL) { g_virstor_start(bq->bio); STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); } else { STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); break; } } mtx_unlock(&sc->delayed_bio_q_mtx); } /* * Find a geom handled by the class */ static struct g_virstor_softc * virstor_find_geom(const struct g_class *cp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &cp->geom, geom) { if (strcmp(name, gp->name) == 0) return (gp->softc); } return (NULL); } /* * Update metadata on all components to reflect the current state * of these fields: * - chunk_next * - flags * - md_count * Expects things to be set up so write_metadata() can work, i.e. * the topology lock must be held. */ static void update_metadata(struct g_virstor_softc *sc) { struct g_virstor_metadata md; u_int n; if (virstor_valid_components(sc) != sc->n_components) return; /* Incomplete device */ LOG_MSG(LVL_DEBUG, "Updating metadata on components for %s", sc->geom->name); /* Update metadata on components */ g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, sc->geom->class->name, sc->geom->name); g_topology_assert(); for (n = 0; n < sc->n_components; n++) { read_metadata(sc->components[n].gcons, &md); md.chunk_next = sc->components[n].chunk_next; md.flags = sc->components[n].flags; md.md_count = sc->n_components; write_metadata(sc->components[n].gcons, &md); } } /* * Fills metadata (struct md) from information stored in softc and the nc'th * component of virstor */ static void fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md, u_int nc, u_int hardcode) { struct g_virstor_component *c; bzero(md, sizeof *md); c = &sc->components[nc]; strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof md->md_magic); md->md_version = G_VIRSTOR_VERSION; strncpy(md->md_name, sc->geom->name, sizeof md->md_name); md->md_id = sc->id; md->md_virsize = sc->virsize; md->md_chunk_size = sc->chunk_size; md->md_count = sc->n_components; if (hardcode) { strncpy(md->provider, c->gcons->provider->name, sizeof md->provider); } md->no = nc; md->provsize = c->gcons->provider->mediasize; md->chunk_count = c->chunk_count; md->chunk_next = c->chunk_next; md->chunk_reserved = c->chunk_reserved; md->flags = c->flags; } /* * Remove a component from virstor device. * Can only be done if the component is unallocated. */ static void virstor_ctl_remove(struct gctl_req *req, struct g_class *cp) { /* As this is executed in parallel to I/O, operations on virstor * structures must be as atomic as possible. 
*/ struct g_virstor_softc *sc; int *nargs; const char *geom_name; u_int removed; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot remove components from incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } removed = 0; for (i = 1; i < *nargs; i++) { char param[8]; const char *prov_name; int j, found; struct g_virstor_component *newcomp, *compbak; sprintf(param, "arg%d", i); prov_name = gctl_get_asciiparam(req, param); if (prov_name == NULL) { gctl_error(req, "Error fetching argument '%s'", param); return; } if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) prov_name += sizeof(_PATH_DEV) - 1; found = -1; for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, prov_name) == 0) { found = j; break; } } if (found == -1) { LOG_MSG(LVL_ERROR, "No %s component in %s", prov_name, sc->geom->name); continue; } compbak = sc->components; newcomp = malloc(sc->n_components * sizeof(*sc->components), M_GVIRSTOR, M_WAITOK | M_ZERO); bcopy(sc->components, newcomp, found * sizeof(*sc->components)); bcopy(&sc->components[found + 1], newcomp + found, (sc->n_components - (found + 1)) * sizeof(*sc->components)); if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) != 0) { LOG_MSG(LVL_ERROR, "Allocated provider %s cannot be " "removed from %s", prov_name, sc->geom->name); free(newcomp, M_GVIRSTOR); /* We'll consider this a non-fatal error */ continue; } /* Renumber unallocated components */ for (j = 0; j < sc->n_components-1; j++) { if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) == 0) { sc->components[j].index = j; } } /* This is the critical section. If a component allocation * event happens while both variables are not yet set, * there will be trouble. Something will panic on encountering * NULL sc->components[x].gcomp member. * Luckily, component allocation happens very rarely and * removing components is an abnormal action in any case. */ sc->components = newcomp; sc->n_components--; /* End critical section */ g_topology_lock(); if (clear_metadata(&compbak[found]) != 0) { LOG_MSG(LVL_WARNING, "Trouble ahead: cannot clear " "metadata on %s", prov_name); } g_detach(compbak[found].gcons); g_destroy_consumer(compbak[found].gcons); g_topology_unlock(); free(compbak, M_GVIRSTOR); removed++; }
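/*
 * The loop above deletes element 'found' by compacting the array into a
 * fresh buffer. As a stand-alone sketch (illustrative names), removing
 * element 'found' from an array of 'n' elements:
 *
 *	bcopy(old, new, found * sizeof(*old));
 *	bcopy(&old[found + 1], new + found,
 *	    (n - (found + 1)) * sizeof(*old));
 *
 * The second count is the number of elements that follow the removed
 * one; any other count copies the wrong number of trailing elements.
 */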
/* This call to update_metadata() is critical. In case there's a * power failure in the middle of it and some components are updated * while others are not, there will be trouble on next .taste() iff * a non-updated component is detected first */ g_topology_lock(); update_metadata(sc); g_topology_unlock(); LOG_MSG(LVL_INFO, "Removed %d component(s) from %s", removed, sc->geom->name); } /* * Clear metadata sector on component */ static int clear_metadata(struct g_virstor_component *comp) { char *buf; int error; LOG_MSG(LVL_INFO, "Clearing metadata on %s", comp->gcons->provider->name); g_topology_assert(); error = g_access(comp->gcons, 0, 1, 0); if (error != 0) return (error); buf = malloc(comp->gcons->provider->sectorsize, M_GVIRSTOR, M_WAITOK | M_ZERO); error = g_write_data(comp->gcons, comp->gcons->provider->mediasize - comp->gcons->provider->sectorsize, buf, comp->gcons->provider->sectorsize); free(buf, M_GVIRSTOR); g_access(comp->gcons, 0, -1, 0); return (error); } /* * Destroy geom forcibly. */ static int g_virstor_destroy_geom(struct gctl_req *req __unused, struct g_class *mp, struct g_geom *gp) { struct g_virstor_softc *sc; int exitval; sc = gp->softc; KASSERT(sc != NULL, ("%s: NULL sc", __func__)); exitval = 0; LOG_MSG(LVL_DEBUG, "%s called for %s, sc=%p", __func__, gp->name, gp->softc); if (sc != NULL) { #ifdef INVARIANTS char *buf; int error; off_t off; int isclean, count; int n; LOG_MSG(LVL_INFO, "INVARIANTS detected"); LOG_MSG(LVL_INFO, "Verifying allocation " "table for %s", sc->geom->name); count = 0; for (n = 0; n < sc->chunk_count; n++) { if ((sc->map[n].flags & VIRSTOR_MAP_ALLOCATED) != 0) count++; } LOG_MSG(LVL_INFO, "Device %s has %d allocated chunks", sc->geom->name, count); n = off = count = 0; isclean = 1; if (virstor_valid_components(sc) != sc->n_components) { /* This is an incomplete virstor device (not all * components have been found) */ LOG_MSG(LVL_ERROR, "Device %s is incomplete", sc->geom->name); goto bailout; } error = g_access(sc->components[0].gcons, 1, 0, 0); KASSERT(error == 0, ("%s: g_access failed (%d)", __func__, error)); /* Compare the whole on-disk allocation table with what's * currently in memory */ while (n < sc->chunk_count) { buf = g_read_data(sc->components[0].gcons, off, sc->sectorsize, &error); KASSERT(buf != NULL, ("g_read_data returned NULL (%d) " "for read at %jd", error, off)); if (bcmp(buf, &sc->map[n], sc->sectorsize) != 0) { LOG_MSG(LVL_ERROR, "ERROR in allocation table, " "entry %d, offset %jd", n, off); isclean = 0; count++; } n += sc->me_per_sector; off += sc->sectorsize; g_free(buf); } error = g_access(sc->components[0].gcons, -1, 0, 0); KASSERT(error == 0, ("%s: g_access failed (%d) on exit", __func__, error)); if (isclean != 1) { LOG_MSG(LVL_ERROR, "ALLOCATION TABLE CORRUPTED FOR %s " "(%d sectors don't match, max %zu allocations)", sc->geom->name, count, count * sc->me_per_sector); } else { LOG_MSG(LVL_INFO, "Allocation table ok for %s", sc->geom->name); } bailout: #endif update_metadata(sc); virstor_geom_destroy(sc, FALSE, FALSE); exitval = EAGAIN; } else exitval = 0; return (exitval); }
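/*
 * A note on the allocation-count test in the INVARIANTS block above
 * (minimal sketch, not from this file): testing a flag bit needs a
 * bitwise AND, with the comparison applied to the masked result:
 *
 *	if ((flags & VIRSTOR_MAP_ALLOCATED) != 0)	// is the bit set?
 *		count++;
 *
 * The superficially similar "flags || VIRSTOR_MAP_ALLOCATED != 0" is a
 * logical OR with a nonzero constant, which is always true and would
 * count every chunk as allocated.
 */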
/* * Taste event (per-class callback) * Examines a provider and creates geom instances if needed */ static struct g_geom * g_virstor_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_virstor_metadata md; struct g_geom *gp; struct g_consumer *cp; struct g_virstor_softc *sc; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); LOG_MSG(LVL_DEBUG, "Tasting %s", pp->name); /* We need a dummy geom to attach a consumer to the given provider */ gp = g_new_geomf(mp, "virstor:taste.helper"); gp->start = (void *)invalid_call; /* XXX: hacked up so the */ gp->access = (void *)invalid_call; /* compiler doesn't complain. */ gp->orphan = (void *)invalid_call; /* I really want these to fail. */ cp = g_new_consumer(gp); g_attach(cp, pp); error = read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); if (strcmp(md.md_magic, G_VIRSTOR_MAGIC) != 0) return (NULL); if (md.md_version != G_VIRSTOR_VERSION) { LOG_MSG(LVL_ERROR, "Kernel module version invalid " "to handle %s (%s) : %d should be %d", md.md_name, pp->name, md.md_version, G_VIRSTOR_VERSION); return (NULL); } if (md.provsize != pp->mediasize) return (NULL); /* If the provider name is hardcoded, use the offered provider only * if it's been offered with its proper name (the one used in * the label command). */ if (md.provider[0] != '\0' && !g_compare_names(md.provider, pp->name)) return (NULL); /* Iterate all geoms this class already knows about to see if a new * geom instance of this class needs to be created (in case the provider * is first from a (possibly) multi-consumer geom) or it just needs * to be added to an existing instance. */ sc = NULL; gp = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(md.md_name, sc->geom->name) != 0) continue; if (md.md_id != sc->id) continue; break; } if (gp != NULL) { /* We found an existing geom instance; add to it */ LOG_MSG(LVL_INFO, "Adding %s to %s", pp->name, md.md_name); error = add_provider_to_geom(sc, pp, &md); if (error != 0) { LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)", pp->name, md.md_name, error); return (NULL); } } else { /* New geom instance needs to be created */ gp = create_virstor_geom(mp, &md); if (gp == NULL) { LOG_MSG(LVL_ERROR, "Error creating new instance of " "class %s: %s", mp->name, md.md_name); LOG_MSG(LVL_DEBUG, "Error creating %s at %s", md.md_name, pp->name); return (NULL); } sc = gp->softc; LOG_MSG(LVL_INFO, "Adding %s to %s (first found)", pp->name, md.md_name); error = add_provider_to_geom(sc, pp, &md); if (error != 0) { LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)", pp->name, md.md_name, error); virstor_geom_destroy(sc, TRUE, FALSE); return (NULL); } } return (gp); } /* * Destroys the consumer passed to it in arguments. Used as a callback * on g_event queue.
*/ static void delay_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *c = arg; KASSERT(c != NULL, ("%s: invalid consumer", __func__)); LOG_MSG(LVL_DEBUG, "Consumer %s destroyed with delay", c->provider->name); g_detach(c); g_destroy_consumer(c); } /* * Remove a component (consumer) from geom instance; If it's the first * component being removed, orphan the provider to announce geom's being * dismantled */ static void remove_component(struct g_virstor_softc *sc, struct g_virstor_component *comp, boolean_t delay) { struct g_consumer *c; KASSERT(comp->gcons != NULL, ("Component with no consumer in %s", sc->geom->name)); c = comp->gcons; comp->gcons = NULL; KASSERT(c->provider != NULL, ("%s: no provider", __func__)); LOG_MSG(LVL_DEBUG, "Component %s removed from %s", c->provider->name, sc->geom->name); if (sc->provider != NULL) { LOG_MSG(LVL_INFO, "Removing provider %s", sc->provider->name); g_wither_provider(sc->provider, ENXIO); sc->provider = NULL; } if (c->acr > 0 || c->acw > 0 || c->ace > 0) g_access(c, -c->acr, -c->acw, -c->ace); if (delay) { /* Destroy consumer after it's tasted */ g_post_event(delay_destroy_consumer, c, M_WAITOK, NULL); } else { g_detach(c); g_destroy_consumer(c); } } /* * Destroy geom - called internally * See g_virstor_destroy_geom for the other one */ static int virstor_geom_destroy(struct g_virstor_softc *sc, boolean_t force, boolean_t delay) { struct g_provider *pp; struct g_geom *gp; u_int n; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { LOG_MSG(force ? LVL_WARNING : LVL_ERROR, "Device %s is still open.", pp->name); if (!force) return (EBUSY); } for (n = 0; n < sc->n_components; n++) { if (sc->components[n].gcons != NULL) remove_component(sc, &sc->components[n], delay); } gp = sc->geom; gp->softc = NULL; KASSERT(sc->provider == NULL, ("Provider still exists for %s", gp->name)); /* XXX: This might or might not work, since we're called with * the topology lock held. Also, it might panic the kernel if * the error'd BIO is in softupdates code. */ mtx_lock(&sc->delayed_bio_q_mtx); while (!STAILQ_EMPTY(&sc->delayed_bio_q)) { struct g_virstor_bio_q *bq; bq = STAILQ_FIRST(&sc->delayed_bio_q); bq->bio->bio_error = ENOSPC; g_io_deliver(bq->bio, EIO); STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); } mtx_unlock(&sc->delayed_bio_q_mtx); mtx_destroy(&sc->delayed_bio_q_mtx); free(sc->map, M_GVIRSTOR); free(sc->components, M_GVIRSTOR); bzero(sc, sizeof *sc); free(sc, M_GVIRSTOR); pp = LIST_FIRST(&gp->provider); /* We only offer one provider */ if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)) LOG_MSG(LVL_DEBUG, "Device %s destroyed", gp->name); g_wither_geom(gp, ENXIO); return (0); } /* * Utility function: read metadata & decode. Wants topology lock to be * held. */ static int read_metadata(struct g_consumer *cp, struct g_virstor_metadata *md) { struct g_provider *pp; char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); virstor_metadata_decode(buf, md); g_free(buf); return (0); } /** * Utility function: encode & write metadata. Assumes topology lock is * held. * * There is no useful way of recovering from errors in this function, * not involving panicking the kernel. 
If the metadata cannot be written * the most we can do is notify the operator and hope he spots it and * replaces the broken drive. */ static void write_metadata(struct g_consumer *cp, struct g_virstor_metadata *md) { struct g_provider *pp; char *buf; int error; KASSERT(cp != NULL && md != NULL && cp->provider != NULL, ("Something's fishy in %s", __func__)); LOG_MSG(LVL_DEBUG, "Writing metadata on %s", cp->provider->name); g_topology_assert(); error = g_access(cp, 0, 1, 0); if (error != 0) { LOG_MSG(LVL_ERROR, "g_access(0,1,0) failed for %s: %d", cp->provider->name, error); return; } pp = cp->provider; buf = malloc(pp->sectorsize, M_GVIRSTOR, M_WAITOK); virstor_metadata_encode(md, buf); g_topology_unlock(); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, 0, -1, 0); free(buf, M_GVIRSTOR); if (error != 0) LOG_MSG(LVL_ERROR, "Error %d writing metadata to %s", error, cp->provider->name); }
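/*
 * Both read_metadata() and write_metadata() rely on the common GEOM
 * convention of keeping class metadata in the provider's last sector.
 * A minimal sketch of the offset arithmetic (pp and cp stand for any
 * valid provider/consumer pair; error handling omitted):
 *
 *	off_t md_offset = pp->mediasize - pp->sectorsize;
 *	buf = g_read_data(cp, md_offset, pp->sectorsize, &error);
 *
 * For example, a 1 GB provider with 512-byte sectors keeps its
 * metadata at byte offset 1073741824 - 512 = 1073741312.
 */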
Tune " "kern.geom.virstor.debug sysctl up " "for more information.", sc->geom->name); } return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); gp->softc = NULL; /* to circumevent races that test softc */ gp->start = g_virstor_start; gp->spoiled = g_virstor_orphan; gp->orphan = g_virstor_orphan; gp->access = g_virstor_access; gp->dumpconf = g_virstor_dumpconf; sc = malloc(sizeof(*sc), M_GVIRSTOR, M_WAITOK | M_ZERO); sc->id = md->md_id; sc->n_components = md->md_count; sc->components = malloc(sizeof(struct g_virstor_component) * md->md_count, M_GVIRSTOR, M_WAITOK | M_ZERO); sc->chunk_size = md->md_chunk_size; sc->virsize = md->md_virsize; STAILQ_INIT(&sc->delayed_bio_q); mtx_init(&sc->delayed_bio_q_mtx, "gvirstor_delayed_bio_q_mtx", "gvirstor", MTX_DEF | MTX_RECURSE); sc->geom = gp; sc->provider = NULL; /* virstor_check_and_run will create it */ gp->softc = sc; LOG_MSG(LVL_ANNOUNCE, "Device %s created", sc->geom->name); return (gp); } /* * Add provider to a GEOM class instance */ static int add_provider_to_geom(struct g_virstor_softc *sc, struct g_provider *pp, struct g_virstor_metadata *md) { struct g_virstor_component *component; struct g_consumer *cp, *fcp; struct g_geom *gp; int error; if (md->no >= sc->n_components) return (EINVAL); /* "Current" compontent */ component = &(sc->components[md->no]); if (component->gcons != NULL) return (EEXIST); gp = sc->geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL) { if (fcp->provider->sectorsize != pp->sectorsize) { /* TODO: this can be made to work */ LOG_MSG(LVL_ERROR, "Provider %s of %s has invalid " "sector size (%d)", pp->name, sc->geom->name, pp->sectorsize); return (EINVAL); } if (fcp->acr > 0 || fcp->acw || fcp->ace > 0) { /* Replicate access permissions from first "live" consumer * to the new one */ error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } } /* Bring up a new component */ cp->private = component; component->gcons = cp; component->sc = sc; component->index = md->no; component->chunk_count = md->chunk_count; component->chunk_next = md->chunk_next; component->chunk_reserved = md->chunk_reserved; component->flags = md->flags; LOG_MSG(LVL_DEBUG, "%s attached to %s", pp->name, sc->geom->name); virstor_check_and_run(sc); return (0); } /* * Check if everything's ready to create the geom provider & device entry, * create and start provider. 
* Called ultimately by .taste, from g_event thread */ static void virstor_check_and_run(struct g_virstor_softc *sc) { off_t off; size_t n, count; int index; int error; if (virstor_valid_components(sc) != sc->n_components) return; if (virstor_valid_components(sc) == 0) { /* This is actually a candidate for panic() */ LOG_MSG(LVL_ERROR, "No valid components for %s?", sc->geom->name); return; } sc->sectorsize = sc->components[0].gcons->provider->sectorsize; /* Initialise allocation map from the first consumer */ sc->chunk_count = sc->virsize / sc->chunk_size; if (sc->chunk_count * (off_t)sc->chunk_size != sc->virsize) { LOG_MSG(LVL_WARNING, "Device %s truncated to %ju bytes", sc->geom->name, sc->chunk_count * (off_t)sc->chunk_size); } sc->map_size = sc->chunk_count * sizeof *(sc->map); /* The following allocation is in order of 4MB - 8MB */ sc->map = malloc(sc->map_size, M_GVIRSTOR, M_WAITOK); KASSERT(sc->map != NULL, ("%s: Memory allocation error (%zu bytes) for %s", __func__, sc->map_size, sc->geom->name)); sc->map_sectors = sc->map_size / sc->sectorsize; count = 0; for (n = 0; n < sc->n_components; n++) count += sc->components[n].chunk_count; LOG_MSG(LVL_INFO, "Device %s has %zu physical chunks and %zu virtual " "(%zu KB chunks)", sc->geom->name, count, sc->chunk_count, sc->chunk_size / 1024); error = g_access(sc->components[0].gcons, 1, 0, 0); if (error != 0) { LOG_MSG(LVL_ERROR, "Cannot acquire read access for %s to " "read allocation map for %s", sc->components[0].gcons->provider->name, sc->geom->name); return; } /* Read in the allocation map */ LOG_MSG(LVL_DEBUG, "Reading map for %s from %s", sc->geom->name, sc->components[0].gcons->provider->name); off = count = n = 0; while (count < sc->map_size) { struct g_virstor_map_entry *mapbuf; size_t bs; bs = MIN(MAXPHYS, sc->map_size - count); if (bs % sc->sectorsize != 0) { /* Check for alignment errors */ bs = rounddown(bs, sc->sectorsize); if (bs == 0) break; LOG_MSG(LVL_ERROR, "Trouble: map is not sector-aligned " "for %s on %s", sc->geom->name, sc->components[0].gcons->provider->name); } mapbuf = g_read_data(sc->components[0].gcons, off, bs, &error); if (mapbuf == NULL) { free(sc->map, M_GVIRSTOR); LOG_MSG(LVL_ERROR, "Error reading allocation map " "for %s from %s (offset %ju) (error %d)", sc->geom->name, sc->components[0].gcons->provider->name, off, error); return; } bcopy(mapbuf, &sc->map[n], bs); off += bs; count += bs; n += bs / sizeof *(sc->map); g_free(mapbuf); } g_access(sc->components[0].gcons, -1, 0, 0); LOG_MSG(LVL_DEBUG, "Read map for %s", sc->geom->name); /* find first component with allocatable chunks */ index = -1; for (n = 0; n < sc->n_components; n++) { if (sc->components[n].chunk_next < sc->components[n].chunk_count) { index = n; break; } } if (index == -1) /* not found?
set it to the last component and handle it * later */ index = sc->n_components - 1; if (index >= sc->n_components - g_virstor_component_watermark - 1) { LOG_MSG(LVL_WARNING, "Device %s running out of components " "(%d/%u: %s)", sc->geom->name, index+1, sc->n_components, sc->components[index].gcons->provider->name); } sc->curr_component = index; if (sc->components[index].chunk_next >= sc->components[index].chunk_count - g_virstor_chunk_watermark) { LOG_MSG(LVL_WARNING, "Component %s of %s is running out of free space " "(%u chunks left)", sc->components[index].gcons->provider->name, sc->geom->name, sc->components[index].chunk_count - sc->components[index].chunk_next); } sc->me_per_sector = sc->sectorsize / sizeof *(sc->map); if (sc->sectorsize % sizeof *(sc->map) != 0) { LOG_MSG(LVL_ERROR, "%s: Map entries don't fit exactly in a sector (%s)", __func__, sc->geom->name); return; } /* Recalculate allocated chunks in components & at the same time * verify map data is sane. We could trust metadata on this, but * we want to make sure. */ for (n = 0; n < sc->n_components; n++) sc->components[n].chunk_next = sc->components[n].chunk_reserved; for (n = 0; n < sc->chunk_count; n++) { if (sc->map[n].provider_no >= sc->n_components || sc->map[n].provider_chunk >= sc->components[sc->map[n].provider_no].chunk_count) { LOG_MSG(LVL_ERROR, "%s: Invalid entry %u in map for %s", __func__, (u_int)n, sc->geom->name); LOG_MSG(LVL_ERROR, "%s: provider_no: %u, n_components: %u" " provider_chunk: %u, chunk_count: %u", __func__, sc->map[n].provider_no, sc->n_components, sc->map[n].provider_chunk, sc->components[sc->map[n].provider_no].chunk_count); return; } if (sc->map[n].flags & VIRSTOR_MAP_ALLOCATED) sc->components[sc->map[n].provider_no].chunk_next++; } sc->provider = g_new_providerf(sc->geom, "virstor/%s", sc->geom->name); sc->provider->sectorsize = sc->sectorsize; sc->provider->mediasize = sc->virsize; g_error_provider(sc->provider, 0); LOG_MSG(LVL_INFO, "%s activated", sc->provider->name); LOG_MSG(LVL_DEBUG, "%s starting with current component %u, starting " "chunk %u", sc->provider->name, sc->curr_component, sc->components[sc->curr_component].chunk_next); } /* * Returns count of active providers in this geom instance */ static u_int virstor_valid_components(struct g_virstor_softc *sc) { unsigned int nc, i; nc = 0; KASSERT(sc != NULL, ("%s: softc is NULL", __func__)); KASSERT(sc->components != NULL, ("%s: sc->components is NULL", __func__)); for (i = 0; i < sc->n_components; i++) if (sc->components[i].gcons != NULL) nc++; return (nc); } /* * Called when the consumer gets orphaned (?) 
*/ static void g_virstor_orphan(struct g_consumer *cp) { struct g_virstor_softc *sc; struct g_virstor_component *comp; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; comp = cp->private; KASSERT(comp != NULL, ("%s: No component in private part of consumer", __func__)); remove_component(sc, comp, FALSE); if (virstor_valid_components(sc) == 0) virstor_geom_destroy(sc, TRUE, FALSE); } /* * Called to notify geom when it's been opened, and for what intent */ static int g_virstor_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *c; struct g_virstor_softc *sc; struct g_geom *gp; int error; KASSERT(pp != NULL, ("%s: NULL provider", __func__)); gp = pp->geom; KASSERT(gp != NULL, ("%s: NULL geom", __func__)); sc = gp->softc; if (sc == NULL) { /* It seems that .access can be called with negative dr,dw,de * in this case but I want to check for myself */ LOG_MSG(LVL_WARNING, "access(%d, %d, %d) for %s", dr, dw, de, pp->name); /* This should only happen when geom is withered so * allow only negative requests */ KASSERT(dr <= 0 && dw <= 0 && de <= 0, ("%s: Positive access for %s", __func__, pp->name)); if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) LOG_MSG(LVL_DEBUG, "Device %s definitely destroyed", pp->name); return (0); } /* Grab an exclusive bit to propagate on our consumers on first open */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... drop it on close */ if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) { de--; update_metadata(sc); /* Writes statistical information */ } error = ENXIO; LIST_FOREACH(c, &gp->consumer, consumer) { KASSERT(c != NULL, ("%s: consumer is NULL", __func__)); error = g_access(c, dr, dw, de); if (error != 0) { struct g_consumer *c2; /* Backout earlier changes */ LIST_FOREACH(c2, &gp->consumer, consumer) { if (c2 == c) /* all earlier components fixed */ return (error); g_access(c2, -dr, -dw, -de); } } } return (error); } /* * Generate XML dump of current state */ static void g_virstor_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_virstor_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL || pp != NULL) return; if (cp != NULL) { /* For each component */ struct g_virstor_component *comp; comp = cp->private; if (comp == NULL) return; sbuf_printf(sb, "%s%u\n", indent, comp->index); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_count); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_next); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_reserved); sbuf_printf(sb, "%s%u%%\n", indent, comp->chunk_next > 0 ? 100 - ((comp->chunk_next + comp->chunk_reserved) * 100) / comp->chunk_count : 100); } else { /* For the whole thing */ u_int count, used, i; off_t size; count = used = size = 0; for (i = 0; i < sc->n_components; i++) { if (sc->components[i].gcons != NULL) { count += sc->components[i].chunk_count; used += sc->components[i].chunk_next + sc->components[i].chunk_reserved; size += sc->components[i].gcons-> provider->mediasize; } } sbuf_printf(sb, "%s" "Components=%u, Online=%u\n", indent, sc->n_components, virstor_valid_components(sc)); sbuf_printf(sb, "%s%u%% physical free\n", indent, 100-(used * 100) / count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_size); sbuf_printf(sb, "%s%u%%\n", indent, used > 0 ? 100 - (used * 100) / count : 100); sbuf_printf(sb, "%s%u\n", indent, count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_count); sbuf_printf(sb, "%s%zu%%\n", indent, (count * 100) / sc->chunk_count); sbuf_printf(sb, "%s%jd\n", indent, size); sbuf_printf(sb, "%s%jd\n", indent, sc->virsize); } }
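/*
 * g_virstor_start() below maps a virtual (offset, length) request onto
 * chunks with plain integer division. A sketch with concrete numbers,
 * assuming a 4 MB (4194304-byte) chunk_size:
 *
 *	chunk_index     = offset / chunk_size;   // 10485760 / 4194304 = 2
 *	in_chunk_offset = offset % chunk_size;   // 10485760 % 4194304 = 2097152
 *	in_chunk_length = min(length, chunk_size - in_chunk_offset);
 *
 * So a 4 MB request at virtual offset 10 MB first touches chunk 2 from
 * its 2 MB midpoint for 2 MB, then loops and continues with the
 * remaining 2 MB at the start of chunk 3.
 */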
/* * GEOM .done handler * Can't use standard handler because one requested IO may * fork into additional data IOs */ static void g_virstor_done(struct bio *b) { struct g_virstor_softc *sc; struct bio *parent_b; parent_b = b->bio_parent; sc = parent_b->bio_to->geom->softc; if (b->bio_error != 0) { LOG_MSG(LVL_ERROR, "Error %d for offset=%ju, length=%ju, %s", b->bio_error, b->bio_offset, b->bio_length, b->bio_to->name); if (parent_b->bio_error == 0) parent_b->bio_error = b->bio_error; } parent_b->bio_inbed++; parent_b->bio_completed += b->bio_completed; if (parent_b->bio_children == parent_b->bio_inbed) { parent_b->bio_completed = parent_b->bio_length; g_io_deliver(parent_b, parent_b->bio_error); } g_destroy_bio(b); } /* * I/O starts here * Called in g_down thread */ static void g_virstor_start(struct bio *b) { struct g_virstor_softc *sc; struct g_virstor_component *comp; struct bio *cb; struct g_provider *pp; char *addr; off_t offset, length; struct bio_queue_head bq; size_t chunk_size; /* cached for convenience */ u_int count; pp = b->bio_to; sc = pp->geom->softc; KASSERT(sc != NULL, ("%s: no softc (error=%d, device=%s)", __func__, b->bio_to->error, b->bio_to->name)); LOG_REQ(LVL_MOREDEBUG, b, "%s", __func__); switch (b->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; default: g_io_deliver(b, EOPNOTSUPP); return; } LOG_MSG(LVL_DEBUG2, "BIO arrived, size=%ju", b->bio_length); bioq_init(&bq); chunk_size = sc->chunk_size; addr = b->bio_data; offset = b->bio_offset; /* virtual offset and length */ length = b->bio_length; while (length > 0) { size_t chunk_index, in_chunk_offset, in_chunk_length; struct virstor_map_entry *me; chunk_index = offset / chunk_size; /* round downwards */ in_chunk_offset = offset % chunk_size; in_chunk_length = min(length, chunk_size - in_chunk_offset); LOG_MSG(LVL_DEBUG, "Mapped %s(%ju, %ju) to (%zu,%zu,%zu)", b->bio_cmd == BIO_READ ? "R" : "W", offset, length, chunk_index, in_chunk_offset, in_chunk_length); me = &sc->map[chunk_index]; if (b->bio_cmd == BIO_READ || b->bio_cmd == BIO_DELETE) { if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) { /* Reads from unallocated chunks return zeroed * buffers */ if (b->bio_cmd == BIO_READ) bzero(addr, in_chunk_length); } else { comp = &sc->components[me->provider_no]; cb = g_clone_bio(b); if (cb == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } cb->bio_to = comp->gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = (off_t)me->provider_chunk * (off_t)chunk_size + in_chunk_offset; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = comp; bioq_disksort(&bq, cb); } } else { /* handle BIO_WRITE */ KASSERT(b->bio_cmd == BIO_WRITE, ("%s: Unknown command %d", __func__, b->bio_cmd)); if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) { /* We have a virtual chunk, represented by * the "me" entry, but it's not yet allocated * (tied to) a physical chunk. So do it now. */ struct virstor_map_entry *data_me; u_int phys_chunk, comp_no; off_t s_offset; int error; error = allocate_chunk(sc, &comp, &comp_no, &phys_chunk); if (error != 0) { /* We cannot allocate a physical chunk * to satisfy this request, so we'll * delay it to when we can...
* XXX: this will prevent the fs from * being umounted! */ struct g_virstor_bio_q *biq; biq = malloc(sizeof *biq, M_GVIRSTOR, M_NOWAIT); if (biq == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } biq->bio = b; mtx_lock(&sc->delayed_bio_q_mtx); STAILQ_INSERT_TAIL(&sc->delayed_bio_q, biq, linkage); mtx_unlock(&sc->delayed_bio_q_mtx); LOG_MSG(LVL_WARNING, "Delaying BIO " "(size=%ju) until free physical " "space can be found on %s", b->bio_length, sc->provider->name); return; } LOG_MSG(LVL_DEBUG, "Allocated chunk %u on %s " "for %s", phys_chunk, comp->gcons->provider->name, sc->provider->name); me->provider_no = comp_no; me->provider_chunk = phys_chunk; me->flags |= VIRSTOR_MAP_ALLOCATED; cb = g_clone_bio(b); if (cb == NULL) { me->flags &= ~VIRSTOR_MAP_ALLOCATED; me->provider_no = 0; me->provider_chunk = 0; bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } /* The allocation table is stored contiguously * at the start of the drive. We need to * calculate the offset of the sector that holds * this map entry both on the drive and in the * map array. * s_offset will end up pointing to the drive * sector. */ s_offset = chunk_index * sizeof *me; s_offset = rounddown(s_offset, sc->sectorsize); /* data_me points to map entry sector * in memory (analogous to offset) */ data_me = &sc->map[rounddown(chunk_index, sc->me_per_sector)]; /* Commit sector with map entry to storage */ cb->bio_to = sc->components[0].gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = s_offset; cb->bio_data = (char *)data_me; cb->bio_length = sc->sectorsize; cb->bio_caller1 = &sc->components[0]; bioq_disksort(&bq, cb); } comp = &sc->components[me->provider_no]; cb = g_clone_bio(b); if (cb == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } /* Finally, handle the data */ cb->bio_to = comp->gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = (off_t)me->provider_chunk*(off_t)chunk_size + in_chunk_offset; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = comp; bioq_disksort(&bq, cb); } addr += in_chunk_length; length -= in_chunk_length; offset += in_chunk_length; } /* Fire off bio's here */ count = 0; for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) { bioq_remove(&bq, cb); LOG_REQ(LVL_MOREDEBUG, cb, "Firing request"); comp = cb->bio_caller1; cb->bio_caller1 = NULL; LOG_MSG(LVL_DEBUG, " firing bio, offset=%ju, length=%ju", cb->bio_offset, cb->bio_length); g_io_request(cb, comp->gcons); count++; } if (count == 0) { /* We handled everything locally */ b->bio_completed = b->bio_length; g_io_deliver(b, 0); } } /* * Allocate a chunk from a physical provider. Returns physical component, * chunk index relative to the component and the component's index. */ static int allocate_chunk(struct g_virstor_softc *sc, struct g_virstor_component **comp, u_int *comp_no_p, u_int *chunk) { u_int comp_no; KASSERT(sc->curr_component < sc->n_components, ("%s: Invalid curr_component: %u", __func__, sc->curr_component)); comp_no = sc->curr_component; *comp = &sc->components[comp_no]; dump_component(*comp); if ((*comp)->chunk_next >= (*comp)->chunk_count) { /* This component is full.
Allocate next component */ if (comp_no >= sc->n_components-1) { LOG_MSG(LVL_ERROR, "All physical space allocated for %s", sc->geom->name); return (-1); } (*comp)->flags &= ~VIRSTOR_PROVIDER_CURRENT; sc->curr_component = ++comp_no; *comp = &sc->components[comp_no]; if (comp_no >= sc->n_components - g_virstor_component_watermark-1) LOG_MSG(LVL_WARNING, "Device %s running out of components " "(switching to %u/%u: %s)", sc->geom->name, comp_no+1, sc->n_components, (*comp)->gcons->provider->name); /* Take care not to overwrite reserved chunks */ if ( (*comp)->chunk_reserved > 0 && (*comp)->chunk_next < (*comp)->chunk_reserved) (*comp)->chunk_next = (*comp)->chunk_reserved; (*comp)->flags |= VIRSTOR_PROVIDER_ALLOCATED | VIRSTOR_PROVIDER_CURRENT; dump_component(*comp); *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } else { *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } return (0); } /* Dump a component */ static void dump_component(struct g_virstor_component *comp) { if (g_virstor_debug < LVL_DEBUG2) return; printf("Component %d: %s\n", comp->index, comp->gcons->provider->name); printf(" chunk_count: %u\n", comp->chunk_count); printf(" chunk_next: %u\n", comp->chunk_next); printf(" flags: %u\n", comp->flags); } #if 0 /* Dump a map entry */ static void dump_me(struct virstor_map_entry *me, unsigned int nr) { if (g_virstor_debug < LVL_DEBUG) return; printf("VIRT. CHUNK #%d: ", nr); if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) printf("(unallocated)\n"); else printf("allocated at provider %u, provider_chunk %u\n", me->provider_no, me->provider_chunk); } #endif /* * Dismantle bio_queue and destroy its components */ static void bioq_dismantle(struct bio_queue_head *bq) { struct bio *b; for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) { bioq_remove(bq, b); g_destroy_bio(b); } } /* * The function that shouldn't be called. * When this is called, the stack is already garbled because of * argument mismatch. There's nothing to do now but panic, which is * accidentally the whole purpose of this function. * Motivation: to guard from accidentally calling geom methods when * they shouldn't be called. (see g_..._taste) */ static void invalid_call(void) { panic("invalid_call() has just been called. Something's fishy here."); } DECLARE_GEOM_CLASS(g_virstor_class, g_virstor); /* Let there be light */ Index: head/sys/geom/virstor/g_virstor.h =================================================================== --- head/sys/geom/virstor/g_virstor.h (revision 326269) +++ head/sys/geom/virstor/g_virstor.h (revision 326270) @@ -1,135 +1,137 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006-2007 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_VIRSTOR_H_ #define _G_VIRSTOR_H_ #define G_VIRSTOR_CLASS_NAME "VIRSTOR" #define VIRSTOR_MAP_ALLOCATED 1 struct virstor_map_entry { uint16_t flags; uint16_t provider_no; uint32_t provider_chunk; }; #define VIRSTOR_MAP_ENTRY_SIZE (sizeof(struct virstor_map_entry)) #define VIRSTOR_MAP_BLOCK_ENTRIES (MAXPHYS / VIRSTOR_MAP_ENTRY_SIZE) /* Struct size is guarded by CTASSERT in main source */ #ifdef _KERNEL #define LOG_MSG(lvl, ...) do { \ if (g_virstor_debug >= (lvl)) { \ printf("GEOM_" G_VIRSTOR_CLASS_NAME); \ if ((lvl) > 0) \ printf("[%u]", (lvl)); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define LOG_MESSAGE LOG_MSG #define LOG_REQ(lvl, bp, ...) do { \ if (g_virstor_debug >= (lvl)) { \ printf("GEOM_" G_VIRSTOR_CLASS_NAME); \ if ((lvl) > 0) \ printf("[%u]", (lvl)); \ printf(": "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) #define LOG_REQUEST LOG_REQ /* "critical" system announcements (e.g. "geom is up") */ #define LVL_ANNOUNCE 0 /* errors */ #define LVL_ERROR 1 /* warnings */ #define LVL_WARNING 2 /* info, noncritical for system operation (user doesn't have to see it) */ #define LVL_INFO 5 /* debug info */ #define LVL_DEBUG 10 /* more debug info */ #define LVL_DEBUG2 12 /* superfluous debug info (large volumes of data) */ #define LVL_MOREDEBUG 15 /* Component data */ struct g_virstor_component { struct g_consumer *gcons; struct g_virstor_softc *sc; unsigned int index; /* Component index in array */ unsigned int chunk_count; unsigned int chunk_next; unsigned int chunk_reserved; unsigned int flags; }; /* Internal geom instance data */ struct g_virstor_softc { struct g_geom *geom; struct g_provider *provider; struct g_virstor_component *components; u_int n_components; u_int curr_component; /* Component currently used */ uint32_t id; /* Unique ID of this geom */ off_t virsize; /* Total size of virstor */ off_t sectorsize; size_t chunk_size; size_t chunk_count; /* governs map_size */ struct virstor_map_entry *map; size_t map_size; /* (in bytes) */ size_t map_sectors; /* Size of map in sectors */ size_t me_per_sector; /* # map entries in a sector */ STAILQ_HEAD(, g_virstor_bio_q) delayed_bio_q; /* Queue of delayed BIOs */ struct mtx delayed_bio_q_mtx; }; /* "delayed BIOs" Queue element */ struct g_virstor_bio_q { struct bio *bio; STAILQ_ENTRY(g_virstor_bio_q) linkage; }; #endif /* _KERNEL */ #ifndef _PATH_DEV #define _PATH_DEV "/dev/" #endif #endif /* !_G_VIRSTOR_H_ */ Index: head/sys/geom/virstor/g_virstor_md.c =================================================================== --- head/sys/geom/virstor/g_virstor_md.c (revision 326269) +++ head/sys/geom/virstor/g_virstor_md.c (revision 326270) @@ -1,91 +1,93 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include /* * Encode data from g_virstor_metadata structure into an endian-independent * byte stream. */ void virstor_metadata_encode(struct g_virstor_metadata *md, unsigned char *data) { bin_stream_t bs; bs_open(&bs, data); bs_write_buf(&bs, md->md_magic, sizeof(md->md_magic)); bs_write_u32(&bs, md->md_version); bs_write_buf(&bs, md->md_name, sizeof(md->md_name)); bs_write_u64(&bs, md->md_virsize); bs_write_u32(&bs, md->md_chunk_size); bs_write_u32(&bs, md->md_id); bs_write_u16(&bs, md->md_count); bs_write_buf(&bs, md->provider, sizeof(md->provider)); bs_write_u16(&bs, md->no); bs_write_u64(&bs, md->provsize); bs_write_u32(&bs, md->chunk_count); bs_write_u32(&bs, md->chunk_next); bs_write_u16(&bs, md->chunk_reserved); bs_write_u16(&bs, md->flags); } /* * Decode data from an endian-independent byte stream into g_virstor_metadata * structure. */ void virstor_metadata_decode(unsigned char *data, struct g_virstor_metadata *md) { bin_stream_t bs; bs_open(&bs, (char *)(data)); bs_read_buf(&bs, md->md_magic, sizeof(md->md_magic)); md->md_version = bs_read_u32(&bs); bs_read_buf(&bs, md->md_name, sizeof(md->md_name)); md->md_virsize = bs_read_u64(&bs); md->md_chunk_size = bs_read_u32(&bs); md->md_id = bs_read_u32(&bs); md->md_count = bs_read_u16(&bs); bs_read_buf(&bs, md->provider, sizeof(md->provider)); md->no = bs_read_u16(&bs); md->provsize = bs_read_u64(&bs); md->chunk_count = bs_read_u32(&bs); md->chunk_next = bs_read_u32(&bs); md->chunk_reserved = bs_read_u16(&bs); md->flags = bs_read_u16(&bs); }
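/*
 * A round-trip usage sketch of the bin_stream API used above (the
 * scratch buffer is hypothetical, not from the source):
 *
 *	unsigned char buf[64];
 *	bin_stream_t bs;
 *	uint32_t v;
 *
 *	bs_open(&bs, buf);
 *	bs_write_u32(&bs, 0xcafebabe);	// stored little-endian via le32enc
 *	bs_reset(&bs);			// rewind to position 0
 *	v = bs_read_u32(&bs);		// v == 0xcafebabe on any host
 *
 * Because every integer goes through the le*enc/le*dec routines,
 * metadata written on a little-endian machine decodes identically on a
 * big-endian one.
 */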
Index: head/sys/geom/virstor/g_virstor_md.h
===================================================================
--- head/sys/geom/virstor/g_virstor_md.h	(revision 326269)
+++ head/sys/geom/virstor/g_virstor_md.h	(revision 326270)
@@ -1,69 +1,71 @@
/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
 * Copyright (c) 2005 Ivan Voras
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef _G_VIRSTOR_MD_H_
#define _G_VIRSTOR_MD_H_

/*
 * Metadata declaration
 */

#define G_VIRSTOR_MAGIC         "GEOM::VIRSTOR"
#define G_VIRSTOR_VERSION       1

/* flag: provider is allocated */
#define VIRSTOR_PROVIDER_ALLOCATED      1
/* flag: provider is currently being filled (usually it's the last
 * provider with the VIRSTOR_PROVIDER_ALLOCATED flag) */
#define VIRSTOR_PROVIDER_CURRENT        2

struct g_virstor_metadata {
        /* Data global to the virstor device */
        char            md_magic[16];   /* Magic value. */
        uint32_t        md_version;     /* Version number. */
        char            md_name[16];    /* Device name (e.g. "mydata") */
        uint32_t        md_id;          /* Unique ID. */
        uint64_t        md_virsize;     /* Virtual device's size */
        uint32_t        md_chunk_size;  /* Chunk size in bytes */
        uint16_t        md_count;       /* Total number of providers */

        /* Data local to this provider */
        char            provider[16];   /* Hardcoded provider name */
        uint16_t        no;             /* Provider number/index */
        uint64_t        provsize;       /* Provider's size */
        uint32_t        chunk_count;    /* Number of chunks in this pr. */
        uint32_t        chunk_next;     /* Next chunk to allocate */
        uint16_t        chunk_reserved; /* Count of "reserved" chunks */
        uint16_t        flags;          /* Provider's flags */
};

void virstor_metadata_encode(struct g_virstor_metadata *md,
    unsigned char *data);
void virstor_metadata_decode(unsigned char *data,
    struct g_virstor_metadata *md);

#endif  /* !_G_VIRSTOR_MD_H_ */
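[Editor's note] The global fields of g_virstor_metadata determine the size of the allocation map kept in the softc: one 8-byte map entry per chunk, stored on disk rounded up to whole sectors. A small sketch of that arithmetic follows; the helper name is hypothetical, and the authoritative computation lives in g_virstor.c and may round differently.

#include <stdint.h>
#include <stddef.h>

#define MAP_ENTRY_SIZE  8       /* sizeof(struct virstor_map_entry) */

static void
map_geometry(uint64_t virsize, size_t chunk_size, size_t sectorsize,
    size_t *chunk_count, size_t *map_size, size_t *map_sectors)
{
        /* One entry per chunk, rounding the last partial chunk up. */
        *chunk_count = (virsize + chunk_size - 1) / chunk_size;
        *map_size = *chunk_count * MAP_ENTRY_SIZE;
        /* The map occupies whole sectors on disk. */
        *map_sectors = (*map_size + sectorsize - 1) / sectorsize;
}

Under these assumptions, a 1 TiB virtual device with 4 MiB chunks needs 262144 entries, i.e. a 2 MiB map occupying 4096 sectors of 512 bytes.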
Index: head/sys/geom/zero/g_zero.c
===================================================================
--- head/sys/geom/zero/g_zero.c	(revision 326269)
+++ head/sys/geom/zero/g_zero.c	(revision 326270)
@@ -1,143 +1,145 @@
/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
 * Copyright (c) 2005 Pawel Jakub Dawidek
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include

#define G_ZERO_CLASS_NAME "ZERO"

static int g_zero_clear_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, zero, CTLFLAG_RW, 0,
    "GEOM_ZERO stuff");
static int g_zero_clear = 1;
SYSCTL_PROC(_kern_geom_zero, OID_AUTO, clear, CTLTYPE_INT|CTLFLAG_RW,
    &g_zero_clear, 0, g_zero_clear_sysctl, "I", "Clear read data buffer");
static int g_zero_byte = 0;
SYSCTL_INT(_kern_geom_zero, OID_AUTO, byte, CTLFLAG_RW,
    &g_zero_byte, 0, "Byte (octet) value to clear the buffers with");

static struct g_provider *gpp;

static int
g_zero_clear_sysctl(SYSCTL_HANDLER_ARGS)
{
        int error;

        error = sysctl_handle_int(oidp, &g_zero_clear, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (gpp == NULL)
                return (ENXIO);
        if (g_zero_clear)
                gpp->flags &= ~G_PF_ACCEPT_UNMAPPED;
        else
                gpp->flags |= G_PF_ACCEPT_UNMAPPED;
        return (0);
}

static void
g_zero_start(struct bio *bp)
{
        int error = ENXIO;

        switch (bp->bio_cmd) {
        case BIO_READ:
                if (g_zero_clear && (bp->bio_flags & BIO_UNMAPPED) == 0)
                        memset(bp->bio_data, g_zero_byte, bp->bio_length);
                /* FALLTHROUGH */
        case BIO_DELETE:
        case BIO_WRITE:
                bp->bio_completed = bp->bio_length;
                error = 0;
                break;
        case BIO_GETATTR:
        default:
                error = EOPNOTSUPP;
                break;
        }
        g_io_deliver(bp, error);
}

static void
g_zero_init(struct g_class *mp)
{
        struct g_geom *gp;
        struct g_provider *pp;

        g_topology_assert();
        gp = g_new_geomf(mp, "gzero");
        gp->start = g_zero_start;
        gp->access = g_std_access;
        gpp = pp = g_new_providerf(gp, "%s", gp->name);
        pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
        if (!g_zero_clear)
                pp->flags |= G_PF_ACCEPT_UNMAPPED;
        pp->mediasize = 1152921504606846976LLU;
        pp->sectorsize = 512;
        g_error_provider(pp, 0);
}

static int
g_zero_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
    struct g_geom *gp)
{
        struct g_provider *pp;

        g_topology_assert();
        if (gp == NULL)
                return (0);
        pp = LIST_FIRST(&gp->provider);
        if (pp == NULL)
                return (0);
        if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0)
                return (EBUSY);
        gpp = NULL;
        g_wither_geom(gp, ENXIO);
        return (0);
}

static struct g_class g_zero_class = {
        .name = G_ZERO_CLASS_NAME,
        .version = G_VERSION,
        .init = g_zero_init,
        .destroy_geom = g_zero_destroy_geom
};

DECLARE_GEOM_CLASS(g_zero_class, g_zero);
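[Editor's note] g_zero creates a single provider, gzero, that advertises a 1152921504606846976-byte (2^60, i.e. 1 EiB) medium, completes writes and deletes without touching the data, and fills reads with g_zero_byte unless clearing is disabled via kern.geom.zero.clear. A hypothetical user-space check of that behavior (not part of this commit), assuming geom_zero is loaded and the default sysctl values are in effect:

#include <sys/types.h>
#include <assert.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
        unsigned char buf[512];
        ssize_t n;
        int fd;

        fd = open("/dev/gzero", O_RDONLY);
        if (fd == -1)
                exit(1);        /* geom_zero.ko not loaded */
        n = read(fd, buf, sizeof(buf));
        assert(n == (ssize_t)sizeof(buf));
        /* Defaults: kern.geom.zero.clear=1, kern.geom.zero.byte=0. */
        for (size_t i = 0; i < sizeof(buf); i++)
                assert(buf[i] == 0);
        close(fd);
        return (0);
}

Writing a different value to kern.geom.zero.byte changes the fill pattern of subsequent reads; disabling kern.geom.zero.clear leaves read buffers untouched, which is what makes gzero useful as a maximum-throughput benchmark target.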