Index: stable/12/sys/geom/vinum/geom_vinum.h =================================================================== --- stable/12/sys/geom/vinum/geom_vinum.h (revision 356575) +++ stable/12/sys/geom/vinum/geom_vinum.h (revision 356576) @@ -1,184 +1,185 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _GEOM_VINUM_H_ #define _GEOM_VINUM_H_ /* geom_vinum_create.c */ void gv_concat(struct g_geom *gp, struct gctl_req *); void gv_mirror(struct g_geom *gp, struct gctl_req *); void gv_stripe(struct g_geom *gp, struct gctl_req *); void gv_raid5(struct g_geom *gp, struct gctl_req *); int gv_create_drive(struct gv_softc *, struct gv_drive *); int gv_create_volume(struct gv_softc *, struct gv_volume *); int gv_create_plex(struct gv_softc *, struct gv_plex *); int gv_create_sd(struct gv_softc *, struct gv_sd *); /* geom_vinum_drive.c */ void gv_save_config(struct gv_softc *); int gv_read_header(struct g_consumer *, struct gv_hdr *); int gv_write_header(struct g_consumer *, struct gv_hdr *); /* geom_vinum_init.c */ void gv_start_obj(struct g_geom *, struct gctl_req *); int gv_start_plex(struct gv_plex *); int gv_start_vol(struct gv_volume *); /* geom_vinum_list.c */ void gv_ld(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_lp(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_ls(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_lv(struct g_geom *, struct gctl_req *, struct sbuf *); void gv_list(struct g_geom *, struct gctl_req *); /* geom_vinum_move.c */ void gv_move(struct g_geom *, struct gctl_req *); int gv_move_sd(struct gv_softc *, struct gv_sd *, struct gv_drive *, int); /* geom_vinum_rename.c */ void gv_rename(struct g_geom *, struct gctl_req *); int gv_rename_drive(struct gv_softc *, struct gv_drive *, char *, int); int gv_rename_plex(struct gv_softc *, struct gv_plex *, char *, int); int gv_rename_sd(struct gv_softc *, struct gv_sd *, char *, int); int gv_rename_vol(struct gv_softc *, struct gv_volume *, char *, int); /* geom_vinum_rm.c */ void gv_remove(struct g_geom *, struct gctl_req *); int gv_resetconfig(struct gv_softc *); void gv_rm_sd(struct gv_softc *sc, struct gv_sd *s); void gv_rm_drive(struct gv_softc *, struct gv_drive *, int); void gv_rm_plex(struct gv_softc *, struct gv_plex *); void 
gv_rm_vol(struct gv_softc *, struct gv_volume *); /* geom_vinum_state.c */ int gv_sdstatemap(struct gv_plex *); void gv_setstate(struct g_geom *, struct gctl_req *); int gv_set_drive_state(struct gv_drive *, int, int); int gv_set_sd_state(struct gv_sd *, int, int); int gv_set_vol_state(struct gv_volume *, int, int); int gv_set_plex_state(struct gv_plex *, int, int); void gv_update_sd_state(struct gv_sd *); void gv_update_plex_state(struct gv_plex *); void gv_update_vol_state(struct gv_volume *); /* geom_vinum_subr.c */ void gv_adjust_freespace(struct gv_sd *, off_t); void gv_free_sd(struct gv_sd *); struct gv_drive *gv_find_drive(struct gv_softc *, char *); struct gv_drive *gv_find_drive_device(struct gv_softc *, char *); struct gv_plex *gv_find_plex(struct gv_softc *, char *); struct gv_sd *gv_find_sd(struct gv_softc *, char *); struct gv_volume *gv_find_vol(struct gv_softc *, char *); void gv_format_config(struct gv_softc *, struct sbuf *, int, char *); int gv_is_striped(struct gv_plex *); int gv_consumer_is_open(struct g_consumer *); int gv_provider_is_open(struct g_provider *); int gv_object_type(struct gv_softc *, char *); void gv_parse_config(struct gv_softc *, char *, struct gv_drive *); int gv_sd_to_drive(struct gv_sd *, struct gv_drive *); int gv_sd_to_plex(struct gv_sd *, struct gv_plex *); int gv_sdcount(struct gv_plex *, int); void gv_update_plex_config(struct gv_plex *); void gv_update_vol_size(struct gv_volume *, off_t); off_t gv_vol_size(struct gv_volume *); off_t gv_plex_size(struct gv_plex *); int gv_plexdown(struct gv_volume *); int gv_attach_plex(struct gv_plex *, struct gv_volume *, int); int gv_attach_sd(struct gv_sd *, struct gv_plex *, off_t, int); int gv_detach_plex(struct gv_plex *, int); int gv_detach_sd(struct gv_sd *, int); /* geom_vinum.c */ void gv_worker(void *); void gv_post_event(struct gv_softc *, int, void *, void *, intmax_t, intmax_t); void gv_worker_exit(struct gv_softc *); struct gv_event *gv_get_event(struct gv_softc *); void 
gv_remove_event(struct gv_softc *, struct gv_event *); +void gv_drive_done(struct gv_drive *); void gv_drive_tasted(struct gv_softc *, struct g_provider *); void gv_drive_lost(struct gv_softc *, struct gv_drive *); void gv_setup_objects(struct gv_softc *); void gv_start(struct bio *); int gv_access(struct g_provider *, int, int, int); void gv_cleanup(struct gv_softc *); /* geom_vinum_volume.c */ void gv_done(struct bio *); void gv_volume_start(struct gv_softc *, struct bio *); void gv_volume_flush(struct gv_volume *); void gv_bio_done(struct gv_softc *, struct bio *); /* geom_vinum_plex.c */ void gv_plex_start(struct gv_plex *, struct bio *); void gv_plex_raid5_done(struct gv_plex *, struct bio *); void gv_plex_normal_done(struct gv_plex *, struct bio *); int gv_grow_request(struct gv_plex *, off_t, off_t, int, caddr_t); void gv_grow_complete(struct gv_plex *, struct bio *); void gv_init_request(struct gv_sd *, off_t, caddr_t, off_t); void gv_init_complete(struct gv_plex *, struct bio *); void gv_parity_request(struct gv_plex *, int, off_t); void gv_parity_complete(struct gv_plex *, struct bio *); void gv_rebuild_complete(struct gv_plex *, struct bio *); int gv_sync_request(struct gv_plex *, struct gv_plex *, off_t, off_t, int, caddr_t); int gv_sync_complete(struct gv_plex *, struct bio *); extern u_int g_vinum_debug; #define G_VINUM_DEBUG(lvl, ...) do { \ if (g_vinum_debug >= (lvl)) { \ printf("GEOM_VINUM"); \ if (g_vinum_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) #define G_VINUM_LOGREQ(lvl, bp, ...) 
do { \ if (g_vinum_debug >= (lvl)) { \ printf("GEOM_VINUM"); \ if (g_vinum_debug > 0) \ printf("[%u]", lvl); \ printf(": "); \ printf(__VA_ARGS__); \ printf(" "); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) #endif /* !_GEOM_VINUM_H_ */ Index: stable/12/sys/geom/vinum/geom_vinum_events.c =================================================================== --- stable/12/sys/geom/vinum/geom_vinum_events.c (revision 356575) +++ stable/12/sys/geom/vinum/geom_vinum_events.c (revision 356576) @@ -1,262 +1,276 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include
/* NOTE(review): the header names after each #include above look stripped in this copy -- confirm against the tree. */

/*
 * Queue an event on the softc's event queue.
 *
 * A zeroed event is allocated (M_WAITOK; the result is used without a NULL
 * check), 'event' is stored as its type and arg1..arg4 are stored verbatim.
 * The event is appended to the tail of sc->equeue and wakeup() is called on
 * 'sc', both while sc->equeue_mtx is held.
 */
void
gv_post_event(struct gv_softc *sc, int event, void *arg1, void *arg2,
    intmax_t arg3, intmax_t arg4)
{
	struct gv_event *ev;

	ev = g_malloc(sizeof(*ev), M_WAITOK | M_ZERO);
	ev->type = event;
	ev->arg1 = arg1;
	ev->arg2 = arg2;
	ev->arg3 = arg3;
	ev->arg4 = arg4;

	mtx_lock(&sc->equeue_mtx);
	TAILQ_INSERT_TAIL(&sc->equeue, ev, events);
	wakeup(sc);
	mtx_unlock(&sc->equeue_mtx);
}

/*
 * Queue a GV_EVENT_THREAD_EXIT event and then sleep on sc->worker.
 *
 * The mutex is taken here but never unlocked explicitly: msleep() is called
 * with PDROP, which leaves the mutex released when the sleep returns.
 * Presumably the worker thread wakes sc->worker when it has exited --
 * TODO confirm against the worker loop.
 */
void
gv_worker_exit(struct gv_softc *sc)
{
	struct gv_event *ev;

	ev = g_malloc(sizeof(*ev), M_WAITOK | M_ZERO);
	ev->type = GV_EVENT_THREAD_EXIT;

	mtx_lock(&sc->equeue_mtx);
	TAILQ_INSERT_TAIL(&sc->equeue, ev, events);
	wakeup(sc);
	/* No timeout (0): sleep until woken. */
	msleep(sc->worker, &sc->equeue_mtx, PDROP, "gv_wor", 0);
}

/*
 * Return the event at the head of the queue without removing it.
 * The queue is only inspected while sc->equeue_mtx is held; the returned
 * pointer is used by the caller after the mutex has been dropped.
 */
struct gv_event *
gv_get_event(struct gv_softc *sc)
{
	struct gv_event *ev;

	KASSERT(sc != NULL, ("NULL sc"));
	mtx_lock(&sc->equeue_mtx);
	ev = TAILQ_FIRST(&sc->equeue);
	mtx_unlock(&sc->equeue_mtx);
	return (ev);
}

/*
 * Unlink 'ev' from the event queue under sc->equeue_mtx.
 * The event itself is not freed here.
 */
void
gv_remove_event(struct gv_softc *sc, struct gv_event *ev)
{

	KASSERT(sc != NULL, ("NULL sc"));
	KASSERT(ev != NULL, ("NULL ev"));
	mtx_lock(&sc->equeue_mtx);
	TAILQ_REMOVE(&sc->equeue, ev, events);
	mtx_unlock(&sc->equeue_mtx);
}

/*
 * Handle a taste event for provider 'pp': create a consumer on the vinum
 * geom, attach it to the provider and open it for reading.  Every failure
 * below undoes the steps taken so far, logs at debug level 0 and returns.
 */
void
gv_drive_tasted(struct gv_softc *sc, struct g_provider *pp)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct gv_hdr *hdr;
	struct gv_drive *d;
	char *buf;
	int error;

	hdr = NULL;
	buf = NULL;

	G_VINUM_DEBUG(2, "tasted drive on '%s'", pp->name);
	/*
	 * Both the configuration offset and the configuration length must be
	 * whole multiples of the provider's sector size.
	 */
	if ((GV_CFG_OFFSET % pp->sectorsize) != 0 ||
	    (GV_CFG_LEN % pp->sectorsize) != 0) {
		G_VINUM_DEBUG(0, "provider %s has unsupported sectorsize.",
		    pp->name);
		return;
	}

	gp = sc->geom;
	/* Consumer creation, attach and open all run under the topology lock. */
	g_topology_lock();
	cp = g_new_consumer(gp);
	if (g_attach(cp, pp) != 0) {
		g_destroy_consumer(cp);
		g_topology_unlock();
		G_VINUM_DEBUG(0, "failed to attach to provider on taste event");
		return;
	}
	/* Ask for one read reference only (read 1, write 0, exclusive 0). */
	if (g_access(cp, 1, 0, 0) != 0) {
		g_detach(cp);
		g_destroy_consumer(cp);
		g_topology_unlock();
		G_VINUM_DEBUG(0, "failed to access consumer on taste event");
return; } g_topology_unlock(); hdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO); /* Read header and on-disk configuration. */ error = gv_read_header(cp, hdr); if (error) { G_VINUM_DEBUG(0, "failed to read header during taste"); goto failed; } /* * Setup the drive before we parse the on-disk configuration, so that * we already know about the drive then. */ d = gv_find_drive(sc, hdr->label.name); if (d == NULL) { d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); strlcpy(d->name, hdr->label.name, sizeof(d->name)); strlcpy(d->device, pp->name, sizeof(d->device)); } else if (d->flags & GV_DRIVE_REFERENCED) { strlcpy(d->device, pp->name, sizeof(d->device)); d->flags &= ~GV_DRIVE_REFERENCED; } else { G_VINUM_DEBUG(2, "drive '%s' is already known", d->name); goto failed; } /* Add the consumer and header to the new drive. */ d->consumer = cp; d->hdr = hdr; gv_create_drive(sc, d); buf = g_read_data(cp, GV_CFG_OFFSET, GV_CFG_LEN, NULL); if (buf == NULL) { G_VINUM_DEBUG(0, "failed to read config during taste"); goto failed; } gv_parse_config(sc, buf, d); g_free(buf); g_topology_lock(); g_access(cp, -1, 0, 0); g_topology_unlock(); gv_setup_objects(sc); gv_set_drive_state(d, GV_DRIVE_UP, 0); return; failed: if (hdr != NULL) g_free(hdr); g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); } /* + * Count completed BIOs and handle orphanization when all are done. + */ +void +gv_drive_done(struct gv_drive *d) +{ + + KASSERT(d->active >= 0, ("Negative number of BIOs (%d)", d->active)); + if (--d->active == 0 && (d->flags & GV_DRIVE_ORPHANED)) { + d->flags &= ~GV_DRIVE_ORPHANED; + gv_post_event(d->vinumconf, GV_EVENT_DRIVE_LOST, d, NULL, 0, 0); + } +} + +/* * When losing a drive (e.g. hardware failure), we cut down the consumer * attached to the underlying device and bring the drive itself to a * "referenced" state so that normal tasting could bring it up cleanly if it * possibly arrives again. 
*/ void gv_drive_lost(struct gv_softc *sc, struct gv_drive *d) { struct g_consumer *cp; struct gv_drive *d2; struct gv_sd *s, *s2; struct gv_freelist *fl, *fl2; gv_set_drive_state(d, GV_DRIVE_DOWN, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); cp = d->consumer; if (cp != NULL) { - if (cp->nstart != cp->nend) { - G_VINUM_DEBUG(0, "dead drive '%s' has still active " + if (d->active > 0) { + G_VINUM_DEBUG(2, "dead drive '%s' has still active " "requests, unable to detach consumer", d->name); - gv_post_event(sc, GV_EVENT_DRIVE_LOST, d, NULL, 0, 0); + d->flags |= GV_DRIVE_ORPHANED; return; } g_topology_lock(); if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); } LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { LIST_REMOVE(fl, freelist); g_free(fl); } d->consumer = NULL; g_free(d->hdr); d->hdr = NULL; d->flags |= GV_DRIVE_REFERENCED; snprintf(d->device, sizeof(d->device), "???"); d->size = 0; d->avail = 0; d->freelist_entries = 0; d->sdcount = 0; /* Put the subdisk in tasted mode, and remove from drive list. */ LIST_FOREACH_SAFE(s, &d->subdisks, from_drive, s2) { LIST_REMOVE(s, from_drive); s->flags |= GV_SD_TASTED; } /* * Don't forget that gv_is_newer wants a "real" drive at the beginning * of the list, so, just to be safe, we shuffle around. */ LIST_REMOVE(d, drive); d2 = LIST_FIRST(&sc->drives); if (d2 == NULL) LIST_INSERT_HEAD(&sc->drives, d, drive); else LIST_INSERT_AFTER(d2, d, drive); gv_save_config(sc); } Index: stable/12/sys/geom/vinum/geom_vinum_plex.c =================================================================== --- stable/12/sys/geom/vinum/geom_vinum_plex.c (revision 356575) +++ stable/12/sys/geom/vinum/geom_vinum_plex.c (revision 356576) @@ -1,1050 +1,1052 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include static int gv_check_parity(struct gv_plex *, struct bio *, struct gv_raid5_packet *); static int gv_normal_parity(struct gv_plex *, struct bio *, struct gv_raid5_packet *); static void gv_plex_flush(struct gv_plex *); static int gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, int *, int); static int gv_plex_normal_request(struct gv_plex *, struct bio *, off_t, off_t, caddr_t); static void gv_post_bio(struct gv_softc *, struct bio *); void gv_plex_start(struct gv_plex *p, struct bio *bp) { struct bio *cbp; struct gv_sd *s; struct gv_raid5_packet *wp; caddr_t addr; off_t bcount, boff, len; bcount = bp->bio_length; addr = bp->bio_data; boff = bp->bio_offset; /* Walk over the whole length of the request, we might split it up. */ while (bcount > 0) { wp = NULL; /* * RAID5 plexes need special treatment, as a single request * might involve several read/write sub-requests. */ if (p->org == GV_PLEX_RAID5) { wp = gv_raid5_start(p, bp, addr, boff, bcount); if (wp == NULL) return; len = wp->length; if (TAILQ_EMPTY(&wp->bits)) g_free(wp); else if (wp->lockbase != -1) TAILQ_INSERT_TAIL(&p->packets, wp, list); /* * Requests to concatenated and striped plexes go straight * through. */ } else { len = gv_plex_normal_request(p, bp, boff, bcount, addr); } if (len < 0) return; bcount -= len; addr += len; boff += len; } /* * Fire off all sub-requests. We get the correct consumer (== drive) * to send each request to via the subdisk that was stored in * cbp->bio_caller1. */ cbp = bioq_takefirst(p->bqueue); while (cbp != NULL) { /* * RAID5 sub-requests need to come in correct order, otherwise * we trip over the parity, as it might be overwritten by * another sub-request. We abuse cbp->bio_caller2 to mark * potential overlap situations. */ if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) { /* Park the bio on the waiting queue. 
*/ cbp->bio_pflags |= GV_BIO_ONHOLD; bioq_disksort(p->wqueue, cbp); } else { s = cbp->bio_caller1; g_io_request(cbp, s->drive_sc->consumer); } cbp = bioq_takefirst(p->bqueue); } } static int gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, off_t *real_len, int *sdno, int growing) { struct gv_sd *s; int i, sdcount; off_t len_left, stripeend, stripeno, stripestart; switch (p->org) { case GV_PLEX_CONCAT: /* * Find the subdisk where this request starts. The subdisks in * this list must be ordered by plex_offset. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->plex_offset <= boff && s->plex_offset + s->size > boff) { *sdno = i; break; } i++; } if (s == NULL || s->drive_sc == NULL) return (GV_ERR_NOTFOUND); /* Calculate corresponding offsets on disk. */ *real_off = boff - s->plex_offset; len_left = s->size - (*real_off); KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); *real_len = (bcount > len_left) ? len_left : bcount; break; case GV_PLEX_STRIPED: /* The number of the stripe where the request starts. */ stripeno = boff / p->stripesize; KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0")); /* Take growing subdisks into account when calculating. */ sdcount = gv_sdcount(p, (boff >= p->synced)); if (!(boff + bcount <= p->synced) && (p->flags & GV_PLEX_GROWING) && !growing) return (GV_ERR_ISBUSY); *sdno = stripeno % sdcount; KASSERT(sdno >= 0, ("gv_plex_offset: sdno < 0")); stripestart = (stripeno / sdcount) * p->stripesize; KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0")); stripeend = stripestart + p->stripesize; *real_off = boff - (stripeno * p->stripesize) + stripestart; len_left = stripeend - *real_off; KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); *real_len = (bcount <= len_left) ? bcount : len_left; break; default: return (GV_ERR_PLEXORG); } return (0); } /* * Prepare a normal plex request. 
*/ static int gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff, off_t bcount, caddr_t addr) { struct gv_sd *s; struct bio *cbp; off_t real_len, real_off; int i, err, sdno; s = NULL; sdno = -1; real_len = real_off = 0; err = ENXIO; if (p == NULL || LIST_EMPTY(&p->subdisks)) goto bad; err = gv_plex_offset(p, boff, bcount, &real_off, &real_len, &sdno, (bp->bio_pflags & GV_BIO_GROW)); /* If the request was blocked, put it into wait. */ if (err == GV_ERR_ISBUSY) { bioq_disksort(p->rqueue, bp); return (-1); /* "Fail", and delay request. */ } if (err) { err = ENXIO; goto bad; } err = ENXIO; /* Find the right subdisk. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == sdno) break; i++; } /* Subdisk not found. */ if (s == NULL || s->drive_sc == NULL) goto bad; /* Now check if we can handle the request on this subdisk. */ switch (s->state) { case GV_SD_UP: /* If the subdisk is up, just continue. */ break; case GV_SD_DOWN: if (bp->bio_pflags & GV_BIO_INTERNAL) G_VINUM_DEBUG(0, "subdisk must be in the stale state in" " order to perform administrative requests"); goto bad; case GV_SD_STALE: if (!(bp->bio_pflags & GV_BIO_SYNCREQ)) { G_VINUM_DEBUG(0, "subdisk stale, unable to perform " "regular requests"); goto bad; } G_VINUM_DEBUG(1, "sd %s is initializing", s->name); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); break; case GV_SD_INITIALIZING: if (bp->bio_cmd == BIO_READ) goto bad; break; default: /* All other subdisk states mean it's not accessible. */ goto bad; } /* Clone the bio and adjust the offsets and sizes. */ cbp = g_clone_bio(bp); if (cbp == NULL) { err = ENOMEM; goto bad; } cbp->bio_offset = real_off + s->drive_offset; cbp->bio_length = real_len; cbp->bio_data = addr; cbp->bio_done = gv_done; cbp->bio_caller1 = s; + s->drive_sc->active++; /* Store the sub-requests now and let others issue them. 
*/
	bioq_insert_tail(p->bqueue, cbp);
	return (real_len);
bad:
	G_VINUM_LOGREQ(0, bp, "plex request failed.");
	/* Building the sub-request failed. If internal BIO, do not deliver. */
	if (bp->bio_pflags & GV_BIO_INTERNAL) {
		/* Release the data buffer only if it is flagged as malloc'ed. */
		if (bp->bio_pflags & GV_BIO_MALLOC)
			g_free(bp->bio_data);
		g_destroy_bio(bp);
		/* The failed internal operation is no longer in progress. */
		p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
		    GV_PLEX_GROWING);
		return (-1);
	}
	g_io_deliver(bp, err);
	return (-1);
}

/*
 * Handle a completed request to a striped or concatenated plex.
 *
 * The child's error is copied to the parent only if the parent has no
 * error recorded yet; the child is then destroyed and counted in the
 * parent's bio_inbed.  When bio_inbed reaches bio_children the parent is
 * handed to gv_sync_complete() (GV_BIO_SYNCREQ), gv_grow_complete()
 * (GV_BIO_GROW) or delivered with g_io_deliver().
 */
void
gv_plex_normal_done(struct gv_plex *p, struct bio *bp)
{
	struct bio *pbp;

	pbp = bp->bio_parent;
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	g_destroy_bio(bp);
	pbp->bio_inbed++;
	if (pbp->bio_children == pbp->bio_inbed) {
		/*
		 * Just set it to length since multiple plexes will
		 * screw things up.
		 */
		pbp->bio_completed = pbp->bio_length;
		if (pbp->bio_pflags & GV_BIO_SYNCREQ)
			gv_sync_complete(p, pbp);
		else if (pbp->bio_pflags & GV_BIO_GROW)
			gv_grow_complete(p, pbp);
		else
			g_io_deliver(pbp, pbp->bio_error);
	}
}

/*
 * Handle a completed request to a RAID-5 plex.
 *
 * 'bp' is a finished sub-request; its RAID5 packet, if any, is taken from
 * bp->bio_caller2.  Depending on the parent's command the sub-request's
 * data is XORed into the packet data (read) or into the parity bio (write)
 * and its entry is removed from the packet's list of pending bits.  Once no
 * bits are pending the packet is finished, bios parked on p->wqueue are
 * re-posted through gv_post_bio() and the packet is freed.  Finally the
 * parent is updated and, when all children are in bed, handed to the
 * matching completion routine or delivered.
 */
void
gv_plex_raid5_done(struct gv_plex *p, struct bio *bp)
{
	struct gv_softc *sc;
	struct bio *cbp, *pbp;
	struct gv_bioq *bq, *bq2;
	struct gv_raid5_packet *wp;
	off_t completed;
	int i;

	completed = 0;
	sc = p->vinumconf;
	wp = bp->bio_caller2;

	switch (bp->bio_parent->bio_cmd) {
	case BIO_READ:
		/* No packet attached: account for what this bio completed. */
		if (wp == NULL) {
			completed = bp->bio_completed;
			break;
		}

		/* Drop this bio's entry and fold its data into the packet. */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			if (bq->bp != bp)
				continue;
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
			for (i = 0; i < wp->length; i++)
				wp->data[i] ^= bp->bio_data[i];
			break;
		}
		/* All bits of the packet have come back. */
		if (TAILQ_EMPTY(&wp->bits)) {
			completed = wp->length;
			/* Only packets with a lock base are on p->packets. */
			if (wp->lockbase != -1) {
				TAILQ_REMOVE(&p->packets, wp, list);
				/* Bring the waiting bios back into the game. */
				pbp = bioq_takefirst(p->wqueue);
				while (pbp != NULL) {
					gv_post_bio(sc, pbp);
					pbp = bioq_takefirst(p->wqueue);
				}
			}
			g_free(wp);
		}

		break;

	case BIO_WRITE:
		/* XXX can this ever happen? */
		if (wp == NULL) {
			completed = bp->bio_completed;
			break;
		}

		/* Check if we need to handle parity data. */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			if (bq->bp != bp)
				continue;
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
			cbp = wp->parity;
			/* Fold this bio's data into the parity buffer. */
			if (cbp != NULL) {
				for (i = 0; i < wp->length; i++)
					cbp->bio_data[i] ^= bp->bio_data[i];
			}
			break;
		}

		/* Handle parity data. */
		if (TAILQ_EMPTY(&wp->bits)) {
			/*
			 * 'i' is reused here as the "finished" result of the
			 * parity helper: non-zero means nothing further was
			 * issued for this packet.
			 */
			if (bp->bio_parent->bio_pflags & GV_BIO_CHECK)
				i = gv_check_parity(p, bp, wp);
			else
				i = gv_normal_parity(p, bp, wp);

			/* All of our sub-requests have finished. */
			if (i) {
				completed = wp->length;
				TAILQ_REMOVE(&p->packets, wp, list);
				/* Bring the waiting bios back into the game. */
				pbp = bioq_takefirst(p->wqueue);
				while (pbp != NULL) {
					gv_post_bio(sc, pbp);
					pbp = bioq_takefirst(p->wqueue);
				}
				g_free(wp);
			}
		}

		break;
	}

	/* Propagate the first error and the completed byte count upwards. */
	pbp = bp->bio_parent;
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	pbp->bio_completed += completed;

	/* When the original request is finished, we deliver it. */
	pbp->bio_inbed++;
	if (pbp->bio_inbed == pbp->bio_children) {
		/* Hand it over for checking or delivery. */
		if (pbp->bio_cmd == BIO_WRITE &&
		    (pbp->bio_pflags & GV_BIO_CHECK)) {
			gv_parity_complete(p, pbp);
		} else if (pbp->bio_cmd == BIO_WRITE &&
		    (pbp->bio_pflags & GV_BIO_REBUILD)) {
			gv_rebuild_complete(p, pbp);
		} else if (pbp->bio_pflags & GV_BIO_INIT) {
			gv_init_complete(p, pbp);
		} else if (pbp->bio_pflags & GV_BIO_SYNCREQ) {
			gv_sync_complete(p, pbp);
		} else if (pbp->bio_pflags & GV_BIO_GROW) {
			gv_grow_complete(p, pbp);
		} else {
			g_io_deliver(pbp, pbp->bio_error);
		}
	}

	/*
	 * Clean up what we allocated.  Note that this tests the GV_BIO_MALLOC
	 * bit in bio_cflags, not in bio_pflags.
	 */
	if (bp->bio_cflags & GV_BIO_MALLOC)
		g_free(bp->bio_data);
	g_destroy_bio(bp);
}

/*
 * Parity step for a parity check (GV_BIO_CHECK) once all bits of packet
 * 'wp' have completed.
 *
 * If the packet still has a waiting bio, that bio is issued.  Otherwise, if
 * it has a parity bio, the data of 'bp' is compared with the parity bio's
 * data over wp->length bytes; a mismatch sets EAGAIN on the parent and, if
 * the parent carries GV_BIO_PARITY, the parity bio is issued.
 *
 * Returns 0 if another request was issued for this packet, 1 otherwise.
 */
static int
gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
{
	struct bio *pbp;
	struct gv_sd *s;
	int err, finished, i;

	err = 0;
	finished = 1;

	if (wp->waiting != NULL) {
		/* Detach the waiting bio from the packet and send it off. */
		pbp = wp->waiting;
		wp->waiting = NULL;
		s = pbp->bio_caller1;
		g_io_request(pbp, s->drive_sc->consumer);
		finished = 0;

	} else if (wp->parity != NULL) {
		pbp = wp->parity;
		wp->parity = NULL;

		/* Check if the parity is correct. */
		for (i = 0; i < wp->length; i++) {
			if (bp->bio_data[i] != pbp->bio_data[i]) {
				err = 1;
				break;
			}
		}

		/* The parity is not correct... */
		if (err) {
			bp->bio_parent->bio_error = EAGAIN;

			/* ... but we rebuild it. */
			if (bp->bio_parent->bio_pflags & GV_BIO_PARITY) {
				s = pbp->bio_caller1;
				g_io_request(pbp, s->drive_sc->consumer);
				finished = 0;
			}
		}

		/*
		 * Clean up the BIO we would have used for rebuilding the
		 * parity.  It is never issued, so it is counted as in bed on
		 * the parent here before being destroyed.
		 */
		if (finished) {
			bp->bio_parent->bio_inbed++;
			g_destroy_bio(pbp);
		}

	}

	return (finished);
}

/*
 * Parity step for a regular RAID5 write once all bits of packet 'wp' have
 * completed.
 *
 * A waiting bio, if present, has its data XORed into the parity bio's data
 * and is then issued; otherwise a pending parity bio is issued.
 *
 * Returns 0 if a request was issued, 1 if nothing was left to do.
 */
static int
gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
{
	struct bio *cbp, *pbp;
	struct gv_sd *s;
	int finished, i;

	finished = 1;

	if (wp->waiting != NULL) {
		pbp = wp->waiting;
		wp->waiting = NULL;
		/* wp->parity is dereferenced here without a NULL check. */
		cbp = wp->parity;
		for (i = 0; i < wp->length; i++)
			cbp->bio_data[i] ^= pbp->bio_data[i];
		s = pbp->bio_caller1;
		g_io_request(pbp, s->drive_sc->consumer);
		finished = 0;

	} else if (wp->parity != NULL) {
		cbp = wp->parity;
		wp->parity = NULL;
		s = cbp->bio_caller1;
		g_io_request(cbp, s->drive_sc->consumer);
		finished = 0;
	}

	return (finished);
}

/* Flush the queue with delayed requests.
*/ static void gv_plex_flush(struct gv_plex *p) { struct gv_softc *sc; struct bio *bp; sc = p->vinumconf; bp = bioq_takefirst(p->rqueue); while (bp != NULL) { gv_plex_start(p, bp); bp = bioq_takefirst(p->rqueue); } } static void gv_post_bio(struct gv_softc *sc, struct bio *bp) { KASSERT(sc != NULL, ("NULL sc")); KASSERT(bp != NULL, ("NULL bp")); mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_down, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } int gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset, off_t length, int type, caddr_t data) { struct gv_softc *sc; struct bio *bp; KASSERT(from != NULL, ("NULL from")); KASSERT(to != NULL, ("NULL to")); sc = from->vinumconf; KASSERT(sc != NULL, ("NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "sync from '%s' failed at offset " " %jd; out of memory", from->name, offset); return (ENOMEM); } bp->bio_length = length; - bp->bio_done = gv_done; + bp->bio_done = NULL; bp->bio_pflags |= GV_BIO_SYNCREQ; bp->bio_offset = offset; - bp->bio_caller1 = from; + bp->bio_caller1 = from; bp->bio_caller2 = to; bp->bio_cmd = type; if (data == NULL) data = g_malloc(length, M_WAITOK); bp->bio_pflags |= GV_BIO_MALLOC; /* Free on the next run. */ bp->bio_data = data; /* Send down next. */ gv_post_bio(sc, bp); //gv_plex_start(from, bp); return (0); } /* * Handle a finished plex sync bio. */ int gv_sync_complete(struct gv_plex *to, struct bio *bp) { struct gv_plex *from, *p; struct gv_sd *s; struct gv_volume *v; struct gv_softc *sc; off_t offset; int err; g_topology_assert_not(); err = 0; KASSERT(to != NULL, ("NULL to")); KASSERT(bp != NULL, ("NULL bp")); from = bp->bio_caller2; KASSERT(from != NULL, ("NULL from")); v = to->vol_sc; KASSERT(v != NULL, ("NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("NULL sc")); /* If it was a read, write it. 
*/ if (bp->bio_cmd == BIO_READ) { err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length, BIO_WRITE, bp->bio_data); /* If it was a write, read the next one. */ } else if (bp->bio_cmd == BIO_WRITE) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); to->synced += bp->bio_length; /* If we're finished, clean up. */ if (bp->bio_offset + bp->bio_length >= from->size) { G_VINUM_DEBUG(1, "syncing of %s from %s completed", to->name, from->name); /* Update our state. */ LIST_FOREACH(s, &to->subdisks, in_plex) gv_set_sd_state(s, GV_SD_UP, 0); gv_update_plex_state(to); to->flags &= ~GV_PLEX_SYNCING; to->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } else { offset = bp->bio_offset + bp->bio_length; err = gv_sync_request(from, to, offset, MIN(bp->bio_length, from->size - offset), BIO_READ, NULL); } } g_destroy_bio(bp); /* Clean up if there was an error. */ if (err) { to->flags &= ~GV_PLEX_SYNCING; G_VINUM_DEBUG(0, "error syncing plexes: error code %d", err); } /* Check if all plexes are synced, and lower refcounts. */ g_topology_lock(); LIST_FOREACH(p, &v->plexes, in_volume) { if (p->flags & GV_PLEX_SYNCING) { g_topology_unlock(); return (-1); } } /* If we came here, all plexes are synced, and we're free. */ gv_access(v->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(1, "plex sync completed"); gv_volume_flush(v); return (0); } /* * Create a new bio struct for the next grow request. 
*/ int gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type, caddr_t data) { struct gv_softc *sc; struct bio *bp; KASSERT(p != NULL, ("gv_grow_request: NULL p")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_grow_request: NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "grow of %s failed creating bio: " "out of memory", p->name); return (ENOMEM); } bp->bio_cmd = type; - bp->bio_done = gv_done; + bp->bio_done = NULL; bp->bio_error = 0; bp->bio_caller1 = p; bp->bio_offset = offset; bp->bio_length = length; bp->bio_pflags |= GV_BIO_GROW; if (data == NULL) data = g_malloc(length, M_WAITOK); bp->bio_pflags |= GV_BIO_MALLOC; bp->bio_data = data; gv_post_bio(sc, bp); //gv_plex_start(p, bp); return (0); } /* * Finish handling of a bio to a growing plex. */ void gv_grow_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_sd *s; struct gv_volume *v; off_t origsize, offset; int sdcount, err; v = p->vol_sc; KASSERT(v != NULL, ("gv_grow_complete: NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("gv_grow_complete: NULL sc")); err = 0; /* If it was a read, write it. */ if (bp->bio_cmd == BIO_READ) { p->synced += bp->bio_length; err = gv_grow_request(p, bp->bio_offset, bp->bio_length, BIO_WRITE, bp->bio_data); /* If it was a write, read next. */ } else if (bp->bio_cmd == BIO_WRITE) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); /* Find the real size of the plex. 
*/ sdcount = gv_sdcount(p, 1); s = LIST_FIRST(&p->subdisks); KASSERT(s != NULL, ("NULL s")); origsize = (s->size * (sdcount - 1)); if (bp->bio_offset + bp->bio_length >= origsize) { G_VINUM_DEBUG(1, "growing of %s completed", p->name); p->flags &= ~GV_PLEX_GROWING; LIST_FOREACH(s, &p->subdisks, in_plex) { s->flags &= ~GV_SD_GROW; gv_set_sd_state(s, GV_SD_UP, 0); } p->size = gv_plex_size(p); gv_update_vol_size(v, gv_vol_size(v)); gv_set_plex_state(p, GV_PLEX_UP, 0); g_topology_lock(); gv_access(v->provider, -1, -1, 0); g_topology_unlock(); p->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); /* Issue delayed requests. */ gv_plex_flush(p); } else { offset = bp->bio_offset + bp->bio_length; err = gv_grow_request(p, offset, MIN(bp->bio_length, origsize - offset), BIO_READ, NULL); } } g_destroy_bio(bp); if (err) { p->flags &= ~GV_PLEX_GROWING; G_VINUM_DEBUG(0, "error growing plex: error code %d", err); } } /* * Create an initialization BIO and send it off to the consumer. Assume that * we're given initialization data as parameter. */ void gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length) { struct gv_drive *d; struct g_consumer *cp; struct bio *bp, *cbp; KASSERT(s != NULL, ("gv_init_request: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_init_request: NULL d")); cp = d->consumer; KASSERT(cp != NULL, ("gv_init_request: NULL cp")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd" " (drive offset %jd); out of memory", s->name, (intmax_t)s->initialized, (intmax_t)start); return; /* XXX: Error codes. */ } bp->bio_cmd = BIO_WRITE; bp->bio_data = data; - bp->bio_done = gv_done; + bp->bio_done = NULL; bp->bio_error = 0; bp->bio_length = length; bp->bio_pflags |= GV_BIO_INIT; bp->bio_offset = start; bp->bio_caller1 = s; /* Then ofcourse, we have to clone it. 
*/ cbp = g_clone_bio(bp); if (cbp == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd" " (drive offset %jd); out of memory", s->name, (intmax_t)s->initialized, (intmax_t)start); return; /* XXX: Error codes. */ } cbp->bio_done = gv_done; cbp->bio_caller1 = s; + d->active++; /* Send it off to the consumer. */ g_io_request(cbp, cp); } /* * Handle a finished initialization BIO. */ void gv_init_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_drive *d; struct g_consumer *cp; struct gv_sd *s; off_t start, length; caddr_t data; int error; s = bp->bio_caller1; start = bp->bio_offset; length = bp->bio_length; error = bp->bio_error; data = bp->bio_data; KASSERT(s != NULL, ("gv_init_complete: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_init_complete: NULL d")); cp = d->consumer; KASSERT(cp != NULL, ("gv_init_complete: NULL cp")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_init_complete: NULL sc")); g_destroy_bio(bp); /* * First we need to find out if it was okay, and abort if it's not. * Then we need to free previous buffers, find out the correct subdisk, * as well as getting the correct starting point and length of the BIO. */ if (start >= s->drive_offset + s->size) { /* Free the data we initialized. */ if (data != NULL) g_free(data); g_topology_assert_not(); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); if (error) { gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); } else { gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG); s->initialized = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); G_VINUM_DEBUG(1, "subdisk '%s' init: finished " "successfully", s->name); } return; } s->initialized += length; start += length; gv_init_request(s, start, data, length); } /* * Create a new bio struct for the next parity rebuild. Used both by internal * rebuild of degraded plexes as well as user initiated rebuilds/checks. 
*/ void gv_parity_request(struct gv_plex *p, int flags, off_t offset) { struct gv_softc *sc; struct bio *bp; KASSERT(p != NULL, ("gv_parity_request: NULL p")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_parity_request: NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "rebuild of %s failed creating bio: " "out of memory", p->name); return; } bp->bio_cmd = BIO_WRITE; - bp->bio_done = gv_done; + bp->bio_done = NULL; bp->bio_error = 0; bp->bio_length = p->stripesize; bp->bio_caller1 = p; /* * Check if it's a rebuild of a degraded plex or a user request of * parity rebuild. */ if (flags & GV_BIO_REBUILD) bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK); else if (flags & GV_BIO_CHECK) bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO); else { G_VINUM_DEBUG(0, "invalid flags given in rebuild"); return; } bp->bio_pflags = flags; bp->bio_pflags |= GV_BIO_MALLOC; /* We still have more parity to build. */ bp->bio_offset = offset; gv_post_bio(sc, bp); //gv_plex_start(p, bp); /* Send it down to the plex. */ } /* * Handle a finished parity write. */ void gv_parity_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; int error, flags; error = bp->bio_error; flags = bp->bio_pflags; flags &= ~GV_BIO_MALLOC; sc = p->vinumconf; KASSERT(sc != NULL, ("gv_parity_complete: NULL sc")); /* Clean up what we allocated. */ if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); if (error == EAGAIN) { G_VINUM_DEBUG(0, "parity incorrect at offset 0x%jx", (intmax_t)p->synced); } /* Any error is fatal, except EAGAIN when we're rebuilding. */ if (error && !(error == EAGAIN && (flags & GV_BIO_PARITY))) { /* Make sure we don't have the lock. 
*/ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(0, "parity check on %s failed at 0x%jx " "errno %d", p->name, (intmax_t)p->synced, error); return; } else { p->synced += p->stripesize; } if (p->synced >= p->size) { /* Make sure we don't have the lock. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); /* We're finished. */ G_VINUM_DEBUG(1, "parity operation on %s finished", p->name); p->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return; } /* Send down next. It will determine if we need to itself. */ gv_parity_request(p, flags, p->synced); } /* * Handle a finished plex rebuild bio. */ void gv_rebuild_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_sd *s; int error, flags; off_t offset; error = bp->bio_error; flags = bp->bio_pflags; offset = bp->bio_offset; flags &= ~GV_BIO_MALLOC; sc = p->vinumconf; KASSERT(sc != NULL, ("gv_rebuild_complete: NULL sc")); /* Clean up what we allocated. */ if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); if (error) { g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(0, "rebuild of %s failed at offset %jd errno: %d", p->name, (intmax_t)offset, error); p->flags &= ~GV_PLEX_REBUILDING; p->synced = 0; gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ return; } offset += (p->stripesize * (gv_sdcount(p, 1) - 1)); if (offset >= p->size) { /* We're finished. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(1, "rebuild of %s finished", p->name); gv_save_config(p->vinumconf); p->flags &= ~GV_PLEX_REBUILDING; p->synced = 0; /* Try to up all subdisks. 
*/ LIST_FOREACH(s, &p->subdisks, in_plex) gv_update_sd_state(s); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ return; } /* Send down next. It will determine if we need to itself. */ gv_parity_request(p, flags, offset); } Index: stable/12/sys/geom/vinum/geom_vinum_raid5.c =================================================================== --- stable/12/sys/geom/vinum/geom_vinum_raid5.c (revision 356575) +++ stable/12/sys/geom/vinum/geom_vinum_raid5.c (revision 356576) @@ -1,663 +1,667 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include static int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, int *, int *, int); static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, struct gv_raid5_packet *, caddr_t, int); static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t, int *); static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t); static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t); /* * Build the RAID5 work packet for (part of) bp on plex p. The helper is chosen from bp->bio_pflags: GV_BIO_REBUILD -> gv_raid5_rebuild(), GV_BIO_CHECK -> gv_raid5_check(), anything else -> gv_raid5_request(). Returns the new packet, or NULL when the request was delayed by the helper or when building it failed (everything built so far is torn down in that case). */ struct gv_raid5_packet * gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct bio *cbp; struct gv_raid5_packet *wp, *wp2; struct gv_bioq *bq, *bq2; int err, delay; delay = 0; wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); wp->bio = bp; wp->waiting = NULL; wp->parity = NULL; TAILQ_INIT(&wp->bits); if (bp->bio_pflags & GV_BIO_REBUILD) err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); else if (bp->bio_pflags & GV_BIO_CHECK) err = gv_raid5_check(p, wp, bp, addr, boff, bcount); else err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); /* Means we have a delayed request: gv_raid5_request() queued bp on p->rqueue, so the packet is not needed. */ if (delay) { g_free(wp); return (NULL); } /* * Building the sub-request failed, we probably need to clean up a lot.
*/ if (err) { G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); } /* Release the held-back and parity bios; gv_drive_done() presumably undoes the active++ taken in gv_raid5_clone_bio() - TODO confirm. */ if (wp->waiting != NULL) { if (wp->waiting->bio_cflags & GV_BIO_MALLOC) g_free(wp->waiting->bio_data); + gv_drive_done(wp->waiting->bio_caller1); g_destroy_bio(wp->waiting); } if (wp->parity != NULL) { if (wp->parity->bio_cflags & GV_BIO_MALLOC) g_free(wp->parity->bio_data); + gv_drive_done(wp->parity->bio_caller1); g_destroy_bio(wp->parity); } g_free(wp); /* Also discard the packets already queued on the plex for this same bio. */ TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { if (wp->bio != bp) continue; TAILQ_REMOVE(&p->packets, wp, list); TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); } g_free(wp); } /* Drain every sub-request still sitting on the plex bio queue. */ cbp = bioq_takefirst(p->bqueue); while (cbp != NULL) { if (cbp->bio_cflags & GV_BIO_MALLOC) g_free(cbp->bio_data); + gv_drive_done(cbp->bio_caller1); g_destroy_bio(cbp); cbp = bioq_takefirst(p->bqueue); } /* If internal, stop and reset state. */ if (bp->bio_pflags & GV_BIO_INTERNAL) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); /* Reset flags. */ p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | GV_PLEX_GROWING); return (NULL); } /* Not an internal request: hand the error back through the bio. */ g_io_deliver(bp, err); return (NULL); } return (wp); } /* * Check if the stripe that the work packet wants is already being used by * some other work packet.
*/ int gv_stripe_active(struct gv_plex *p, struct bio *bp) { struct gv_raid5_packet *wp, *owp; int overlap; wp = bp->bio_caller2; /* A lockbase of -1 (set for reads in gv_raid5_request()) means the packet never counts as overlapping. */ if (wp->lockbase == -1) return (0); overlap = 0; TAILQ_FOREACH(owp, &p->packets, list) { /* Only packets queued ahead of ours are compared; stop at our own. */ if (owp == wp) break; if ((wp->lockbase >= owp->lockbase) && (wp->lockbase <= owp->lockbase + owp->length)) { overlap++; break; } if ((wp->lockbase <= owp->lockbase) && (wp->lockbase + wp->length >= owp->lockbase)) { overlap++; break; } } return (overlap); } /* * Build the sub-requests for a parity check/rebuild of one stripe: reads of all data subdisks are queued on p->bqueue, the read of the parity subdisk is held in wp->waiting and the parity write bio in wp->parity. Returns 0, ENXIO if the plex is unusable or the parity subdisk is missing or not up, or ENOMEM if a bio could not be cloned. */ static int gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *parity, *s; struct gv_bioq *bq; struct bio *cbp; int i, psdno; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); /* Find the parity subdisk: the psdno'th entry of the plex's subdisk list. */ parity = NULL; i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == psdno) { parity = s; break; } i++; } /* Parity stripe not found. */ if (parity == NULL) return (ENXIO); if (parity->state != GV_SD_UP) return (ENXIO); wp->length = real_len; wp->data = addr; wp->lockbase = real_off; /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the parity subdisk. */ if (s == parity) continue; /* Skip growing subdisks. */ if (s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Read the parity data. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; wp->waiting = cbp; /* * In case we want to rebuild the parity, create an extra BIO to write * it out. It also acts as buffer for the XOR operations.
*/ cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); if (cbp == NULL) return (ENOMEM); wp->parity = cbp; return (0); } /* Rebuild a degraded RAID5 plex: queue reads of every other subdisk and prepare in wp->parity the bio aimed at the broken subdisk. Returns 0, ENXIO, EINVAL or ENOMEM. */ static int gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *broken, *s; struct gv_bioq *bq; struct bio *cbp; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); /* Find the subdisk to rebuild: the last one in the plex list that is not up. */ broken = NULL; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state != GV_SD_UP) broken = s; } /* Broken stripe not found. */ if (broken == NULL) return (ENXIO); switch (broken->state) { /* Cannot be hit: broken was picked because its state is not GV_SD_UP. */ case GV_SD_UP: return (EINVAL); case GV_SD_STALE: if (!(bp->bio_pflags & GV_BIO_REBUILD)) return (ENXIO); G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); /* Set this bit now, but should be set at end. */ broken->flags |= GV_SD_CANGOUP; break; case GV_SD_REVIVING: break; default: /* All other subdisk states mean it's not accessible. */ return (ENXIO); } wp->length = real_len; wp->data = addr; wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; /* Skip growing subdisks. */ if (s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* The bio aimed at the broken subdisk; it is kept in the packet's parity slot. */ cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); wp->parity = cbp; p->synced = boff; /* Post notification that we're finished. */ return (0); } /* Build a request group to perform (part of) a RAID5 request.
*/ static int gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) { struct g_geom *gp; struct gv_sd *broken, *original, *parity, *s; struct gv_bioq *bq; struct bio *cbp; int i, psdno, sdno, type, grow; off_t real_len, real_off; gp = bp->bio_to->geom; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); /* We are optimistic and assume that this request will be OK. */ #define REQ_TYPE_NORMAL 0 #define REQ_TYPE_DEGRADED 1 #define REQ_TYPE_NOPARITY 2 type = REQ_TYPE_NORMAL; original = parity = broken = NULL; /* XXX: The resize won't crash with rebuild or sync, but we should still * be aware of it. Also this should perhaps be done on rebuild/check as * well? */ /* If we're over, we must use the old. */ if (boff >= p->synced) { grow = 1; /* Or if over the resized offset, we use all drives. */ } else if (boff + bcount <= p->synced) { grow = 0; /* Else, we're in the middle, and must wait a bit. */ } else { bioq_disksort(p->rqueue, bp); *delay = 1; return (0); } gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno, grow); /* Find the right subdisks. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == sdno) original = s; if (i == psdno) parity = s; if (s->state != GV_SD_UP) broken = s; i++; } if ((original == NULL) || (parity == NULL)) return (ENXIO); /* Our data stripe is missing. */ if (original->state != GV_SD_UP) type = REQ_TYPE_DEGRADED; /* If synchronizing request, just write it if disks are stale. */ if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { type = REQ_TYPE_NORMAL; /* Our parity stripe is missing. */ } else if (parity->state != GV_SD_UP) { /* We cannot take another failure if we're already degraded. 
*/ if (type != REQ_TYPE_NORMAL) return (ENXIO); else type = REQ_TYPE_NOPARITY; } wp->length = real_len; wp->data = addr; wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) type = REQ_TYPE_NORMAL; if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { bioq_disksort(p->rqueue, bp); *delay = 1; return (0); } switch (bp->bio_cmd) { case BIO_READ: /* * For a degraded read we need to read in all stripes except * the broken one plus the parity stripe and then recalculate * the desired data. */ if (type == REQ_TYPE_DEGRADED) { bzero(wp->data, wp->length); LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; /* Skip growing if within offset. */ if (grow && s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* A normal read can be fulfilled with the original subdisk. */ } else { cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); } wp->lockbase = -1; break; case BIO_WRITE: /* * A degraded write means we cannot write to the original data * subdisk. Thus we need to read in all valid stripes, * recalculate the parity from the original data, and then * write the parity stripe back out. */ if (type == REQ_TYPE_DEGRADED) { /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken and the parity subdisk. */ if ((s == broken) || (s == parity)) continue; /* Skip growing if within offset. 
*/ if (grow && s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Write the parity data. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); bcopy(addr, cbp->bio_data, wp->length); wp->parity = cbp; /* * When the parity stripe is missing we just write out the data. */ } else if (type == REQ_TYPE_NOPARITY) { cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* * A normal write request goes to the original subdisk, then we * read in all other stripes, recalculate the parity and write * out the parity again. */ } else { /* Read old parity. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Read old data. */ cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Write new data. */ cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); /* * We must not write the new data until the old data * was read, so hold this BIO back until we're ready * for it. */ wp->waiting = cbp; /* The final bio for the parity. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); /* Remember that this is the BIO for the parity data. 
*/ wp->parity = cbp; } break; default: return (EINVAL); } return (0); } /* * Calculate the offsets in the various subdisks for a RAID5 request. Also take * care of new subdisks in an expanded RAID5 array. * XXX: This assumes that the new subdisks are inserted after the others (which * is okay as long as plex_offset is larger). If subdisks are inserted into the * plexlist before, we get problems. */ static int gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, off_t *real_len, int *sdno, int *psdno, int growing) { struct gv_sd *s; int sd, psd, sdcount; off_t len_left, stripeend, stripeoff, stripestart; sdcount = p->sdcount; if (growing) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } } /* The number of the subdisk containing the parity stripe. */ psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % sdcount; KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); /* Offset of the start address from the start of the stripe. */ stripeoff = boff % (p->stripesize * (sdcount - 1)); KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); /* The number of the subdisk where the stripe resides. */ sd = stripeoff / p->stripesize; KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); /* At or past parity subdisk. */ if (sd >= psd) sd++; /* The offset of the stripe on this subdisk. */ stripestart = (boff - stripeoff) / (sdcount - 1); KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); stripeoff %= p->stripesize; /* The offset of the request on this subdisk. */ *real_off = stripestart + stripeoff; stripeend = stripestart + p->stripesize; len_left = stripeend - *real_off; KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); *real_len = (bcount <= len_left) ? 
bcount : len_left; if (sdno != NULL) *sdno = sd; if (psdno != NULL) *psdno = psd; return (0); } static struct bio * gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, caddr_t addr, int use_wp) { struct bio *cbp; cbp = g_clone_bio(bp); if (cbp == NULL) return (NULL); if (addr == NULL) { cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); cbp->bio_cflags |= GV_BIO_MALLOC; } else cbp->bio_data = addr; cbp->bio_offset = wp->lockbase + s->drive_offset; cbp->bio_length = wp->length; cbp->bio_done = gv_done; cbp->bio_caller1 = s; + s->drive_sc->active++; if (use_wp) cbp->bio_caller2 = wp; return (cbp); } Index: stable/12/sys/geom/vinum/geom_vinum_var.h =================================================================== --- stable/12/sys/geom/vinum/geom_vinum_var.h (revision 356575) +++ stable/12/sys/geom/vinum/geom_vinum_var.h (revision 356576) @@ -1,391 +1,393 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. * Parts written by Greg Lehey. * * This software is distributed under the so-called ``Berkeley * License'': * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Nan Yang Computer * Services Limited. * 4. 
Neither the name of the Company nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided ``as is'', and any express or implied * warranties, including, but not limited to, the implied warranties of * merchantability and fitness for a particular purpose are disclaimed. * In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential * damages (including, but not limited to, procurement of substitute * goods or services; loss of use, data, or profits; or business * interruption) however caused and on any theory of liability, whether * in contract, strict liability, or tort (including negligence or * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * * $FreeBSD$ */ #ifndef _GEOM_VINUM_VAR_H_ #define _GEOM_VINUM_VAR_H_ /* * Slice header * * Vinum drives start with this structure: * *\ Sector * |--------------------------------------| * | PDP-11 memorial boot block | 0 * |--------------------------------------| * | Disk label, maybe | 1 * |--------------------------------------| * | Slice definition (vinum_hdr) | 8 * |--------------------------------------| * | | * | Configuration info, first copy | 9 * | | * |--------------------------------------| * | | * | Configuration info, second copy | 9 + size of config * | | * |--------------------------------------| */ /* Sizes and offsets of our information. */ #define GV_HDR_OFFSET 4096 /* Offset of vinum header. */ #define GV_HDR_LEN 512 /* Size of vinum header. */ #define GV_CFG_OFFSET 4608 /* Offset of first config copy. */ #define GV_CFG_LEN 65536 /* Size of config copy. */ /* This is where the actual data starts.
*/ #define GV_DATA_START (GV_CFG_LEN * 2 + GV_CFG_OFFSET) /* #define GV_DATA_START (GV_CFG_LEN * 2 + GV_HDR_LEN) */ #define GV_MAXDRIVENAME 32 /* Maximum length of a device name. */ #define GV_MAXSDNAME 64 /* Maximum length of a subdisk name. */ #define GV_MAXPLEXNAME 64 /* Maximum length of a plex name. */ #define GV_MAXVOLNAME 64 /* Maximum length of a volume name. */ /* Command line flags. */ #define GV_FLAG_R 0x01 #define GV_FLAG_S 0x02 #define GV_FLAG_V 0x04 #define GV_FLAG_VV 0x08 #define GV_FLAG_F 0x10 /* Object types. */ #define GV_TYPE_VOL 1 #define GV_TYPE_PLEX 2 #define GV_TYPE_SD 3 #define GV_TYPE_DRIVE 4 /* State changing flags. */ #define GV_SETSTATE_FORCE 0x1 #define GV_SETSTATE_CONFIG 0x2 /* Subdisk state bitmaps for plexes. */ #define GV_SD_DOWNSTATE 0x01 /* Subdisk is down. */ #define GV_SD_STALESTATE 0x02 /* Subdisk is stale. */ #define GV_SD_INITSTATE 0x04 /* Subdisk is initializing. */ #define GV_SD_UPSTATE 0x08 /* Subdisk is up. */ /* Synchronization/initialization request sizes. */ #define GV_MIN_SYNCSIZE 512 #define GV_MAX_SYNCSIZE MAXPHYS #define GV_DFLT_SYNCSIZE 65536 /* Flags for BIOs, as they are processed within vinum. GV_BIO_MALLOC marks a bio whose bio_data was allocated by vinum, so the completion code g_free()s it. GV_BIO_INTERNAL groups the flags of the requests vinum generates itself (sync, init, rebuild, check, grow). */ #define GV_BIO_GROW 0x01 #define GV_BIO_MALLOC 0x02 #define GV_BIO_ONHOLD 0x04 #define GV_BIO_SYNCREQ 0x08 #define GV_BIO_INIT 0x10 #define GV_BIO_REBUILD 0x20 #define GV_BIO_CHECK 0x40 #define GV_BIO_PARITY 0x80 #define GV_BIO_INTERNAL \ (GV_BIO_SYNCREQ | GV_BIO_INIT | GV_BIO_REBUILD | GV_BIO_CHECK | GV_BIO_GROW) /* Error codes to be used within gvinum. */ #define GV_ERR_SETSTATE (-1) /* Error setting state. */ #define GV_ERR_BADSIZE (-2) /* Object has wrong size. */ #define GV_ERR_INVTYPE (-3) /* Invalid object type. */ #define GV_ERR_CREATE (-4) /* Error creating gvinum object. */ #define GV_ERR_ISBUSY (-5) /* Object is busy. */ #define GV_ERR_ISATTACHED (-6) /* Object is attached to another. */ #define GV_ERR_INVFLAG (-7) /* Invalid flag passed. */ #define GV_ERR_INVSTATE (-8) /* Invalid state.
*/ #define GV_ERR_NOTFOUND (-9) /* Object not found. */ #define GV_ERR_NAMETAKEN (-10) /* Object name is taken. */ #define GV_ERR_NOSPACE (-11) /* No space left on drive/subdisk. */ #define GV_ERR_BADOFFSET (-12) /* Invalid offset specified. */ #define GV_ERR_INVNAME (-13) /* Invalid object name. */ #define GV_ERR_PLEXORG (-14) /* Invalid plex organization. */ /* * hostname is 256 bytes long, but we don't need to shlep multiple copies in * vinum. We use the host name just to identify this system, and 32 bytes * should be ample for that purpose. */ #define GV_HOSTNAME_LEN 32 /* Label identifying a vinum drive; embedded in struct gv_hdr below. */ struct gv_label { char sysname[GV_HOSTNAME_LEN]; /* System name at creation time. */ char name[GV_MAXDRIVENAME]; /* Our name of the drive. */ struct timeval date_of_birth; /* The time it was created ... */ struct timeval last_update; /* ... and the time of last update. */ off_t drive_size; /* Total size incl. headers. */ }; /* The 'header' of each valid vinum drive. */ struct gv_hdr { uint64_t magic; /* The magic values spell ASCII tags: "IN VINO", "NO VINO", "VINUM-1" and "VINUM--", each followed by a NUL byte. */ #define GV_OLD_MAGIC 0x494E2056494E4F00LL #define GV_OLD_NOMAGIC 0x4E4F2056494E4F00LL #define GV_MAGIC 0x56494E554D2D3100LL #define GV_NOMAGIC 0x56494E554D2D2D00LL uint64_t config_length; struct gv_label label; }; /* A single freelist entry of a drive. */ struct gv_freelist { off_t size; /* Size of this free slot. */ off_t offset; /* Offset on the drive. */ LIST_ENTRY(gv_freelist) freelist; }; /* * Since we share structures between userland and kernel, we need this helper * struct instead of struct bio_queue_head and friends. Maybe I find a proper * solution some day.
*/ struct gv_bioq { struct bio *bp; TAILQ_ENTRY(gv_bioq) queue; }; /* Event type codes, as passed to gv_post_event() (e.g. GV_EVENT_SAVE_CONFIG). */ #define GV_EVENT_DRIVE_TASTED 1 #define GV_EVENT_DRIVE_LOST 2 #define GV_EVENT_THREAD_EXIT 3 #define GV_EVENT_CREATE_DRIVE 4 #define GV_EVENT_CREATE_VOLUME 5 #define GV_EVENT_CREATE_PLEX 6 #define GV_EVENT_CREATE_SD 7 #define GV_EVENT_SAVE_CONFIG 8 #define GV_EVENT_RM_VOLUME 9 #define GV_EVENT_RM_PLEX 10 #define GV_EVENT_RM_SD 11 #define GV_EVENT_RM_DRIVE 12 #define GV_EVENT_SET_SD_STATE 13 #define GV_EVENT_SET_DRIVE_STATE 14 #define GV_EVENT_SET_VOL_STATE 15 #define GV_EVENT_SET_PLEX_STATE 16 #define GV_EVENT_RESET_CONFIG 17 #define GV_EVENT_PARITY_REBUILD 18 #define GV_EVENT_PARITY_CHECK 19 #define GV_EVENT_START_PLEX 20 #define GV_EVENT_START_VOLUME 21 #define GV_EVENT_ATTACH_PLEX 22 #define GV_EVENT_ATTACH_SD 23 #define GV_EVENT_DETACH_PLEX 24 #define GV_EVENT_DETACH_SD 25 #define GV_EVENT_RENAME_VOL 26 #define GV_EVENT_RENAME_PLEX 27 #define GV_EVENT_RENAME_SD 28 #define GV_EVENT_RENAME_DRIVE 29 #define GV_EVENT_MOVE_SD 30 #define GV_EVENT_SETUP_OBJECTS 31 #ifdef _KERNEL /* One entry of the event queue (gv_softc.equeue). NOTE(review): type is presumably one of the GV_EVENT_* codes and the meaning of arg1-arg4 depends on it - confirm against gv_post_event(). */ struct gv_event { int type; void *arg1; void *arg2; intmax_t arg3; intmax_t arg4; TAILQ_ENTRY(gv_event) events; }; /* This struct contains the main vinum config. */ struct gv_softc { /* Linked lists of all objects in our setup. */ LIST_HEAD(,gv_drive) drives; /* All drives. */ LIST_HEAD(,gv_plex) plexes; /* All plexes. */ LIST_HEAD(,gv_sd) subdisks; /* All subdisks. */ LIST_HEAD(,gv_volume) volumes; /* All volumes. */ TAILQ_HEAD(,gv_event) equeue; /* Event queue. */ struct mtx equeue_mtx; /* Event queue lock. */ struct mtx bqueue_mtx; /* BIO queue lock. */ struct mtx config_mtx; /* Configuration lock. */ struct bio_queue_head *bqueue_down; /* BIO queue incoming requests. */ struct bio_queue_head *bqueue_up; /* BIO queue for completed requests. */ struct g_geom *geom; /* Pointer to our VINUM geom. */ struct proc *worker; /* Worker process. */ }; #endif /* _KERNEL */ /* softc for a drive.
*/ /* Members and macros on lines marked with a leading plus sign are additions made by this revision of the diff. */ struct gv_drive { char name[GV_MAXDRIVENAME]; /* The name of this drive. */ char device[GV_MAXDRIVENAME]; /* Associated device. */ int state; /* The state of this drive. */ #define GV_DRIVE_DOWN 0 #define GV_DRIVE_UP 1 off_t size; /* Size of this drive. */ off_t avail; /* Available space. */ int sdcount; /* Number of subdisks. */ int flags; #define GV_DRIVE_REFERENCED 0x01 /* The drive does not really exist, but was referenced by a subdisk during taste. */ +#define GV_DRIVE_ORPHANED 0x02 /* The drive was orphaned. */ struct gv_hdr *hdr; /* The drive header. */ struct g_consumer *consumer; /* Consumer attached to this drive. */ + int active; /* Number of active requests. NOTE(review): presumably decremented by gv_drive_done(), which gv_bio_done() calls in this revision - confirm. */ int freelist_entries; /* Count of freelist entries. */ LIST_HEAD(,gv_freelist) freelist; /* List of freelist entries. */ LIST_HEAD(,gv_sd) subdisks; /* Subdisks on this drive. */ LIST_ENTRY(gv_drive) drive; /* Entry in the vinum config. */ struct gv_softc *vinumconf; /* Pointer to the vinum conf. */ }; /* softc for a subdisk. A subdisk refers to its drive and to its plex both by name (drive, plex) and by pointer (drive_sc, plex_sc). */ struct gv_sd { char name[GV_MAXSDNAME]; /* The name of this subdisk. */ off_t size; /* The size of this subdisk. */ off_t drive_offset; /* Offset in the underlying drive. */ off_t plex_offset; /* Offset in the associated plex. */ int state; /* The state of this subdisk. */ #define GV_SD_DOWN 0 #define GV_SD_STALE 1 #define GV_SD_INITIALIZING 2 #define GV_SD_REVIVING 3 #define GV_SD_UP 4 off_t initialized; /* Count of initialized bytes. */ int init_size; /* Initialization read/write size. */ int init_error; /* Flag error on initialization. */ int flags; #define GV_SD_NEWBORN 0x01 /* Subdisk is created by user. */ #define GV_SD_TASTED 0x02 /* Subdisk is created during taste. */ #define GV_SD_CANGOUP 0x04 /* Subdisk can go up immediately. */ #define GV_SD_GROW 0x08 /* Subdisk is added to striped plex. */ char drive[GV_MAXDRIVENAME]; /* Name of underlying drive. */ char plex[GV_MAXPLEXNAME]; /* Name of associated plex. 
*/ struct gv_drive *drive_sc; /* Pointer to underlying drive. */ struct gv_plex *plex_sc; /* Pointer to associated plex. */ LIST_ENTRY(gv_sd) from_drive; /* Subdisk list of underlying drive. */ LIST_ENTRY(gv_sd) in_plex; /* Subdisk list of associated plex. */ LIST_ENTRY(gv_sd) sd; /* Entry in the vinum config. */ struct gv_softc *vinumconf; /* Pointer to the vinum config. */ }; /* softc for a plex. */ struct gv_plex { char name[GV_MAXPLEXNAME]; /* The name of the plex. */ off_t size; /* The size of the plex. */ int state; /* The plex state. The values are ordered: gv_volume_start() compares them against GV_PLEX_DEGRADED with relational operators. */ #define GV_PLEX_DOWN 0 #define GV_PLEX_INITIALIZING 1 #define GV_PLEX_DEGRADED 2 #define GV_PLEX_GROWABLE 3 #define GV_PLEX_UP 4 int org; /* The plex organisation. Note that the value 3 is not assigned. */ #define GV_PLEX_DISORG 0 #define GV_PLEX_CONCAT 1 #define GV_PLEX_STRIPED 2 #define GV_PLEX_RAID5 4 int stripesize; /* The stripe size of the plex. */ char volume[GV_MAXVOLNAME]; /* Name of associated volume. */ struct gv_volume *vol_sc; /* Pointer to associated volume. */ int sddetached; /* Number of detached subdisks. */ int sdcount; /* Number of subdisks in this plex. */ int sddown; /* Number of subdisks that are down. */ int flags; #define GV_PLEX_ADDED 0x01 /* Added to an existing volume. */ #define GV_PLEX_SYNCING 0x02 /* Plex is syncing from another plex; gv_volume_start() delays writes to the volume while this is set. */ #define GV_PLEX_NEWBORN 0x20 /* The plex was just created. */ #define GV_PLEX_REBUILDING 0x40 /* The plex is rebuilding. */ #define GV_PLEX_GROWING 0x80 /* The plex is growing. */ off_t synced; /* Count of synced bytes. */ TAILQ_HEAD(,gv_raid5_packet) packets; /* RAID5 sub-requests. */ LIST_HEAD(,gv_sd) subdisks; /* List of attached subdisks. */ LIST_ENTRY(gv_plex) in_volume; /* Plex list of associated volume. */ LIST_ENTRY(gv_plex) plex; /* Entry in the vinum config. */ #ifdef _KERNEL struct bio_queue_head *bqueue; /* BIO queue. */ struct bio_queue_head *wqueue; /* Waiting BIO queue. */ struct bio_queue_head *rqueue; /* Rebuild waiting BIO queue. 
*/ #else char *bpad, *wpad, *rpad; /* Padding for userland: three pointers in place of the three kernel-only queue pointers above. */ #endif struct gv_softc *vinumconf; /* Pointer to the vinum config. */ }; /* softc for a volume. */ struct gv_volume { char name[GV_MAXVOLNAME]; /* The name of the volume. */ off_t size; /* The size of the volume. */ int plexcount; /* Number of plexes. */ int state; /* The state of the volume. */ #define GV_VOL_DOWN 0 #define GV_VOL_UP 1 int flags; #define GV_VOL_NEWBORN 0x08 /* The volume was just created. */ LIST_HEAD(,gv_plex) plexes; /* List of attached plexes. */ LIST_ENTRY(gv_volume) volume; /* Entry in vinum config. */ struct g_provider *provider; /* Provider of this volume. */ #ifdef _KERNEL struct bio_queue_head *wqueue; /* BIO delayed request queue. */ #else char *wpad; /* Padding for userland: one pointer in place of the kernel-only wqueue pointer. */ #endif struct gv_plex *last_read_plex; /* Plex chosen for the most recent read; gv_volume_start() resumes its round-robin search after it. */ struct gv_softc *vinumconf; /* Pointer to the vinum config. */ }; #endif /* !_GEOM_VINUM_VAR_H */ Index: stable/12/sys/geom/vinum/geom_vinum_volume.c =================================================================== --- stable/12/sys/geom/vinum/geom_vinum_volume.c (revision 356575) +++ stable/12/sys/geom/vinum/geom_vinum_volume.c (revision 356576) @@ -1,166 +1,168 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include /* Drain the delayed-request queue of volume v: take bios off v->wqueue one at a time and hand each one to gv_volume_start() until the queue is empty. Both v and v->vinumconf must be non-NULL (checked with KASSERT). NOTE(review): gv_volume_start() can append a bio to v->wqueue again (a BIO_READ while the queue is still non-empty, or a write while a plex has GV_PLEX_SYNCING set), so this loop only ends once such re-queueing stops - confirm that callers guarantee this. */ void gv_volume_flush(struct gv_volume *v) { struct gv_softc *sc; struct bio *bp; KASSERT(v != NULL, ("NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("NULL sc")); bp = bioq_takefirst(v->wqueue); while (bp != NULL) { gv_volume_start(sc, bp); bp = bioq_takefirst(v->wqueue); } } /* Dispatch bio bp, addressed to the volume stored in bp->bio_to->private, to the plexes of that volume. The bio is failed with ENXIO when there is no volume or the volume is not GV_VOL_UP. A read goes to a single plex; a write or delete goes to every plex whose state is at least GV_PLEX_DEGRADED. Any other bio_cmd value falls through the switch, which has no default case, and is not delivered here. */ void gv_volume_start(struct gv_softc *sc, struct bio *bp) { struct g_geom *gp; struct gv_volume *v; struct gv_plex *p, *lp; int numwrites; gp = sc->geom; /* gp is assigned here but not used below. */ v = bp->bio_to->private; if (v == NULL || v->state != GV_VOL_UP) { g_io_deliver(bp, ENXIO); return; } switch (bp->bio_cmd) { case BIO_READ: /* * Try to find a good plex where we can send the request to, * round-robin-style. The plex either has to be up, or it's a * degraded RAID5 plex. Check if we have delayed requests. Put * this request on the delayed queue if so. This makes sure that * we don't read old values. 
*/ if (bioq_first(v->wqueue) != NULL) { bioq_insert_tail(v->wqueue, bp); break; } lp = v->last_read_plex; if (lp == NULL) lp = LIST_FIRST(&v->plexes); /* NOTE(review): lp stays NULL if v->plexes is empty, and LIST_NEXT() below would then dereference it; presumably a volume in GV_VOL_UP always has a plex - confirm. */ p = LIST_NEXT(lp, in_volume); if (p == NULL) p = LIST_FIRST(&v->plexes); /* Walk the plex list circularly, starting after lp, until a usable plex is found or the walk is back at lp. */ do { if (p == NULL) { p = lp; break; } if ((p->state > GV_PLEX_DEGRADED) || (p->state >= GV_PLEX_DEGRADED && p->org == GV_PLEX_RAID5)) break; p = LIST_NEXT(p, in_volume); if (p == NULL) p = LIST_FIRST(&v->plexes); } while (p != lp); /* The loop can also end back at lp without having checked its state, so the chosen plex is checked again before use. */ if ((p == NULL) || (p->org == GV_PLEX_RAID5 && p->state < GV_PLEX_DEGRADED) || (p->org != GV_PLEX_RAID5 && p->state <= GV_PLEX_DEGRADED)) { g_io_deliver(bp, ENXIO); return; } v->last_read_plex = p; /* Hand it down to the plex logic. */ gv_plex_start(p, bp); break; case BIO_WRITE: case BIO_DELETE: /* Delay write-requests if any plex is synchronizing. */ LIST_FOREACH(p, &v->plexes, in_volume) { if (p->flags & GV_PLEX_SYNCING) { bioq_insert_tail(v->wqueue, bp); return; } } numwrites = 0; /* Give the BIO to each plex of this volume. */ LIST_FOREACH(p, &v->plexes, in_volume) { if (p->state < GV_PLEX_DEGRADED) continue; gv_plex_start(p, bp); numwrites++; } /* No plex took the request: fail it. */ if (numwrites == 0) g_io_deliver(bp, ENXIO); break; } } /* Completion handling for bio bp. The subdisk is read from bp->bio_caller1, then its plex and the volume of that plex are looked up; all three must be non-NULL (checked with KASSERT). The bio is passed to the done routine that matches the plex organisation. The lines marked with a leading plus sign are added by this revision: gv_drive_done() is called on the drive of the subdisk after the switch, for every organisation, including those without a case. The sc argument is not used, and v is only used by its KASSERT. */ void gv_bio_done(struct gv_softc *sc, struct bio *bp) { struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; s = bp->bio_caller1; KASSERT(s != NULL, ("gv_bio_done: NULL s")); p = s->plex_sc; KASSERT(p != NULL, ("gv_bio_done: NULL p")); v = p->vol_sc; KASSERT(v != NULL, ("gv_bio_done: NULL v")); switch (p->org) { case GV_PLEX_CONCAT: case GV_PLEX_STRIPED: gv_plex_normal_done(p, bp); break; case GV_PLEX_RAID5: gv_plex_raid5_done(p, bp); break; } + + gv_drive_done(s->drive_sc); } Index: stable/12 =================================================================== --- stable/12 (revision 356575) +++ stable/12 (revision 356576) Property changes on: stable/12 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r356108