Index: head/sys/geom/geom_io.c =================================================================== --- head/sys/geom/geom_io.c (revision 297954) +++ head/sys/geom/geom_io.c (revision 297955) @@ -1,1007 +1,1007 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int g_io_transient_map_bio(struct bio *bp); static struct g_bioq g_bio_run_down; static struct g_bioq g_bio_run_up; static struct g_bioq g_bio_run_task; /* * Pace is a hint that we've had some trouble recently allocating * bios, so we should back off trying to send I/O down the stack * a bit to let the problem resolve. When pacing, we also turn * off direct dispatch to also reduce memory pressure from I/Os * there, at the expxense of some added latency while the memory * pressures exist. See g_io_schedule_down() for more details * and limitations. */ static volatile u_int pace; static uma_zone_t biozone; /* * The head of the list of classifiers used in g_io_request. * Use g_register_classifier() and g_unregister_classifier() * to add/remove entries to the list. * Classifiers are invoked in registration order. 
*/ static TAILQ_HEAD(g_classifier_tailq, g_classifier_hook) g_classifier_tailq = TAILQ_HEAD_INITIALIZER(g_classifier_tailq); #include static void g_bioq_lock(struct g_bioq *bq) { mtx_lock(&bq->bio_queue_lock); } static void g_bioq_unlock(struct g_bioq *bq) { mtx_unlock(&bq->bio_queue_lock); } #if 0 static void g_bioq_destroy(struct g_bioq *bq) { mtx_destroy(&bq->bio_queue_lock); } #endif static void g_bioq_init(struct g_bioq *bq) { TAILQ_INIT(&bq->bio_queue); mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF); } static struct bio * g_bioq_first(struct g_bioq *bq) { struct bio *bp; bp = TAILQ_FIRST(&bq->bio_queue); if (bp != NULL) { KASSERT((bp->bio_flags & BIO_ONQUEUE), ("Bio not on queue bp=%p target %p", bp, bq)); bp->bio_flags &= ~BIO_ONQUEUE; TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue); bq->bio_queue_length--; } return (bp); } struct bio * g_new_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_new_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return (bp); } struct bio * g_alloc_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_WAITOK | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return (bp); } void g_destroy_bio(struct bio *bp) { #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif uma_zfree(biozone, bp); } struct bio * g_clone_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO); if (bp2 != NULL) { bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. * BIO_UNMAPPED and BIO_VLIST should be inherited, to properly * indicate which way the buffer is passed. * Other bio flags are not suitable for cloning. 
*/ bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; /* Inherit classification info from the parent */ bp2->bio_classifier1 = bp->bio_classifier1; bp2->bio_classifier2 = bp->bio_classifier2; bp->bio_children++; } #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return(bp2); } struct bio * g_duplicate_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST); bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return(bp2); } void g_reset_bio(struct bio *bp) { bzero(bp, sizeof(*bp)); } void g_io_init() { g_bioq_init(&g_bio_run_down); g_bioq_init(&g_bio_run_up); g_bioq_init(&g_bio_run_task); biozone = uma_zcreate("g_bio", sizeof (struct bio), NULL, NULL, NULL, NULL, 0, 0); } int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_getattr(%s)", attr); bp = g_alloc_bio(); bp->bio_cmd = BIO_GETATTR; bp->bio_done = NULL; bp->bio_attribute = attr; bp->bio_length = *len; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "ggetattr"); *len = bp->bio_completed; g_destroy_bio(bp); return (error); } int g_io_flush(struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name); bp = g_alloc_bio(); bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_done = NULL; bp->bio_attribute = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gflush"); g_destroy_bio(bp); return (error); } static int g_io_check(struct bio *bp) { struct g_consumer *cp; struct g_provider *pp; off_t excess; int error; cp = bp->bio_from; pp = bp->bio_to; /* Fail if access counters dont allow the operation */ switch(bp->bio_cmd) { case BIO_READ: case BIO_GETATTR: if (cp->acr == 0) return (EPERM); break; case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: if (cp->acw == 0) return (EPERM); break; default: return (EPERM); } /* if provider is marked for error, don't disturb. */ if (pp->error) return (pp->error); if (cp->flags & G_CF_ORPHAN) return (ENXIO); switch(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: /* Zero sectorsize or mediasize is probably a lack of media. */ if (pp->sectorsize == 0 || pp->mediasize == 0) return (ENXIO); /* Reject I/O not on sector boundary */ if (bp->bio_offset % pp->sectorsize) return (EINVAL); /* Reject I/O not integral sector long */ if (bp->bio_length % pp->sectorsize) return (EINVAL); /* Reject requests before or past the end of media. 
*/ if (bp->bio_offset < 0) return (EIO); if (bp->bio_offset > pp->mediasize) return (EIO); /* Truncate requests to the end of providers media. */ excess = bp->bio_offset + bp->bio_length; if (excess > bp->bio_to->mediasize) { KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE == bp->bio_ma_n, ("excess bio %p too short", bp)); excess -= bp->bio_to->mediasize; bp->bio_length -= excess; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE; } if (excess > 0) CTR3(KTR_GEOM, "g_down truncated bio " "%p provider %s by %d", bp, bp->bio_to->name, excess); } /* Deliver zero length transfers right here. */ if (bp->bio_length == 0) { CTR2(KTR_GEOM, "g_down terminated 0-length " "bp %p provider %s", bp, bp->bio_to->name); return (0); } if ((bp->bio_flags & BIO_UNMAPPED) != 0 && (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { if ((error = g_io_transient_map_bio(bp)) >= 0) return (error); } break; default: break; } return (EJUSTRETURN); } /* * bio classification support. * * g_register_classifier() and g_unregister_classifier() * are used to add/remove a classifier from the list. * The list is protected using the g_bio_run_down lock, * because the classifiers are called in this path. * * g_io_request() passes bio's that are not already classified * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers(). * Classifiers can store their result in the two fields * bio_classifier1 and bio_classifier2. * A classifier that updates one of the fields should * return a non-zero value. * If no classifier updates the field, g_run_classifiers() sets * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls. */ int g_register_classifier(struct g_classifier_hook *hook) { g_bioq_lock(&g_bio_run_down); TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link); g_bioq_unlock(&g_bio_run_down); return (0); } void g_unregister_classifier(struct g_classifier_hook *hook) { struct g_classifier_hook *entry; g_bioq_lock(&g_bio_run_down); TAILQ_FOREACH(entry, &g_classifier_tailq, link) { if (entry == hook) { TAILQ_REMOVE(&g_classifier_tailq, hook, link); break; } } g_bioq_unlock(&g_bio_run_down); } static void g_run_classifiers(struct bio *bp) { struct g_classifier_hook *hook; int classified = 0; TAILQ_FOREACH(hook, &g_classifier_tailq, link) classified |= hook->func(hook->arg, bp); if (!classified) bp->bio_classifier1 = BIO_NOTCLASSIFIED; } void g_io_request(struct bio *bp, struct g_consumer *cp) { struct g_provider *pp; struct mtx *mtxp; int direct, error, first; uint8_t cmd; KASSERT(cp != NULL, ("NULL cp in g_io_request")); KASSERT(bp != NULL, ("NULL bp in g_io_request")); pp = cp->provider; KASSERT(pp != NULL, ("consumer not attached in g_io_request")); #ifdef DIAGNOSTIC KASSERT(bp->bio_driver1 == NULL, ("bio_driver1 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_driver2 == NULL, ("bio_driver2 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_pflags == 0, ("bio_pflags used by the consumer (geom %s)", cp->geom->name)); /* * Remember consumer's private fields, so we can detect if they were * modified by the provider. 
*/ bp->_bio_caller1 = bp->bio_caller1; bp->_bio_caller2 = bp->bio_caller2; bp->_bio_cflags = bp->bio_cflags; #endif cmd = bp->bio_cmd; if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) { KASSERT(bp->bio_data != NULL, - ("NULL bp->data in g_io_request(cmd=%hhu)", bp->bio_cmd)); + ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_DELETE || cmd == BIO_FLUSH) { KASSERT(bp->bio_data == NULL, - ("non-NULL bp->data in g_io_request(cmd=%hhu)", + ("non-NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) { KASSERT(bp->bio_offset % cp->provider->sectorsize == 0, ("wrong offset %jd for sectorsize %u", bp->bio_offset, cp->provider->sectorsize)); KASSERT(bp->bio_length % cp->provider->sectorsize == 0, ("wrong length %jd for sectorsize %u", bp->bio_length, cp->provider->sectorsize)); } g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd); bp->bio_from = cp; bp->bio_to = pp; bp->bio_error = 0; bp->bio_completed = 0; KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&bp->bio_t0); else getbinuptime(&bp->bio_t0); #ifdef GET_STACK_USAGE direct = (cp->flags & G_CF_DIRECT_SEND) != 0 && (pp->flags & G_PF_DIRECT_RECEIVE) != 0 && !g_is_geom_thread(curthread) && ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) && pace == 0; if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) { g_bioq_lock(&g_bio_run_down); g_run_classifiers(bp); g_bioq_unlock(&g_bio_run_down); } /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. */ mtxp = mtx_pool_find(mtxpool_sleep, pp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_start_transaction(pp->stat, &bp->bio_t0); if (g_collectstats & G_STATS_CONSUMERS) devstat_start_transaction(cp->stat, &bp->bio_t0); pp->nstart++; cp->nstart++; mtx_unlock(mtxp); if (direct) { error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p " "provider %s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); return; } bp->bio_to->geom->start(bp); } else { g_bioq_lock(&g_bio_run_down); first = TAILQ_EMPTY(&g_bio_run_down.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_down.bio_queue_length++; g_bioq_unlock(&g_bio_run_down); /* Pass it on down. 
*/ if (first) wakeup(&g_wait_down); } } void g_io_deliver(struct bio *bp, int error) { struct bintime now; struct g_consumer *cp; struct g_provider *pp; struct mtx *mtxp; int direct, first; KASSERT(bp != NULL, ("NULL bp in g_io_deliver")); pp = bp->bio_to; KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver")); cp = bp->bio_from; if (cp == NULL) { bp->bio_error = error; bp->bio_done(bp); return; } KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver")); KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver")); #ifdef DIAGNOSTIC /* * Some classes - GJournal in particular - can modify bio's * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO * flag means it's an expected behaviour for that particular geom. */ if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) { KASSERT(bp->bio_caller1 == bp->_bio_caller1, ("bio_caller1 used by the provider %s", pp->name)); KASSERT(bp->bio_caller2 == bp->_bio_caller2, ("bio_caller2 used by the provider %s", pp->name)); KASSERT(bp->bio_cflags == bp->_bio_cflags, ("bio_cflags used by the provider %s", pp->name)); } #endif KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0")); KASSERT(bp->bio_completed <= bp->bio_length, ("bio_completed can't be greater than bio_length")); g_trace(G_T_BIO, "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); /* * XXX: next two doesn't belong here */ bp->bio_bcount = bp->bio_length; bp->bio_resid = bp->bio_bcount - bp->bio_completed; #ifdef GET_STACK_USAGE direct = (pp->flags & G_PF_DIRECT_SEND) && (cp->flags & G_CF_DIRECT_RECEIVE) && !g_is_geom_thread(curthread); if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. 
*/ if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&now); mtxp = mtx_pool_find(mtxpool_sleep, cp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_end_transaction_bio_bt(pp->stat, bp, &now); if (g_collectstats & G_STATS_CONSUMERS) devstat_end_transaction_bio_bt(cp->stat, bp, &now); cp->nend++; pp->nend++; mtx_unlock(mtxp); if (error != ENOMEM) { bp->bio_error = error; if (direct) { biodone(bp); } else { g_bioq_lock(&g_bio_run_up); first = TAILQ_EMPTY(&g_bio_run_up.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_up.bio_queue_length++; g_bioq_unlock(&g_bio_run_up); if (first) wakeup(&g_wait_up); } return; } if (bootverbose) printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name); bp->bio_children = 0; bp->bio_inbed = 0; bp->bio_driver1 = NULL; bp->bio_driver2 = NULL; bp->bio_pflags = 0; g_io_request(bp, cp); pace = 1; return; } SYSCTL_DECL(_kern_geom); static long transient_maps; SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, &transient_maps, 0, "Total count of the transient mapping requests"); u_int transient_map_retries = 10; SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW, &transient_map_retries, 0, "Max count of retries used before giving up on creating transient map"); int transient_map_hard_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD, &transient_map_hard_failures, 0, "Failures to establish the transient mapping due to retry attempts " "exhausted"); int transient_map_soft_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD, &transient_map_soft_failures, 0, "Count of retried failures to establish the transient mapping"); int inflight_transient_maps; SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, &inflight_transient_maps, 0, "Current count of the active transient maps"); static int g_io_transient_map_bio(struct bio *bp) { vm_offset_t addr; long size; u_int retried; KASSERT(unmapped_buf_allowed, ("unmapped disabled")); size = round_page(bp->bio_ma_offset + bp->bio_length); KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp)); addr = 0; retried = 0; atomic_add_long(&transient_maps, 1); retry: if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) { if (transient_map_retries != 0 && retried >= transient_map_retries) { CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s", bp, bp->bio_to->name); atomic_add_int(&transient_map_hard_failures, 1); return (EDEADLK/* XXXKIB */); } else { /* * Naive attempt to quisce the I/O to get more * in-flight requests completed and defragment * the transient_arena. 
*/ CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", bp, bp->bio_to->name, retried); pause("g_d_tra", hz / 10); retried++; atomic_add_int(&transient_map_soft_failures, 1); goto retry; } } atomic_add_int(&inflight_transient_maps, 1); pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; bp->bio_flags |= BIO_TRANSIENT_MAPPING; bp->bio_flags &= ~BIO_UNMAPPED; return (EJUSTRETURN); } void g_io_schedule_down(struct thread *tp __unused) { struct bio *bp; int error; for(;;) { g_bioq_lock(&g_bio_run_down); bp = g_bioq_first(&g_bio_run_down); if (bp == NULL) { CTR0(KTR_GEOM, "g_down going to sleep"); msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } CTR0(KTR_GEOM, "g_down has work to do"); g_bioq_unlock(&g_bio_run_down); if (pace != 0) { /* * There has been at least one memory allocation * failure since the last I/O completed. Pause 1ms to * give the system a chance to free up memory. We only * do this once because a large number of allocations * can fail in the direct dispatch case and there's no * relationship between the number of these failures and * the length of the outage. If there's still an outage, * we'll pause again and again until it's * resolved. Older versions paused longer and once per * allocation failure. This was OK for a single threaded * g_down, but with direct dispatch would lead to max of * 10 IOPs for minutes at a time when transient memory * issues prevented allocation for a batch of requests * from the upper layers. * * XXX This pacing is really lame. It needs to be solved * by other methods. This is OK only because the worst * case scenario is so rare. In the worst case scenario * all memory is tied up waiting for I/O to complete * which can never happen since we can't allocate bios * for that I/O. */ CTR0(KTR_GEOM, "g_down pacing self"); pause("g_down", min(hz/1000, 1)); pace = 0; } CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp, bp->bio_to->name); error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider " "%s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); continue; } THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); bp->bio_to->geom->start(bp); THREAD_SLEEPING_OK(); } } void bio_taskqueue(struct bio *bp, bio_task_t *func, void *arg) { bp->bio_task = func; bp->bio_task_arg = arg; /* * The taskqueue is actually just a second queue off the "up" * queue, so we use the same lock. 
*/ g_bioq_lock(&g_bio_run_up); KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p target taskq", bp)); bp->bio_flags |= BIO_ONQUEUE; TAILQ_INSERT_TAIL(&g_bio_run_task.bio_queue, bp, bio_queue); g_bio_run_task.bio_queue_length++; wakeup(&g_wait_up); g_bioq_unlock(&g_bio_run_up); } void g_io_schedule_up(struct thread *tp __unused) { struct bio *bp; for(;;) { g_bioq_lock(&g_bio_run_up); bp = g_bioq_first(&g_bio_run_task); if (bp != NULL) { g_bioq_unlock(&g_bio_run_up); THREAD_NO_SLEEPING(); CTR1(KTR_GEOM, "g_up processing task bp %p", bp); bp->bio_task(bp->bio_task_arg); THREAD_SLEEPING_OK(); continue; } bp = g_bioq_first(&g_bio_run_up); if (bp != NULL) { g_bioq_unlock(&g_bio_run_up); THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off " "%jd len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); biodone(bp); THREAD_SLEEPING_OK(); continue; } CTR0(KTR_GEOM, "g_up going to sleep"); msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock, PRIBIO | PDROP, "-", 0); } } void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error) { struct bio *bp; void *ptr; int errorc; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_read_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_READ; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; ptr = g_malloc(length, M_WAITOK); bp->bio_data = ptr; g_io_request(bp, cp); errorc = biowait(bp, "gread"); if (error != NULL) *error = errorc; g_destroy_bio(bp); if (errorc) { g_free(ptr); ptr = NULL; } return (ptr); } int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_write_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_WRITE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "gwrite"); g_destroy_bio(bp); return (error); } int g_delete_data(struct g_consumer *cp, off_t offset, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize, ("g_delete_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_DELETE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gdelete"); g_destroy_bio(bp); return (error); } void g_print_bio(struct bio *bp) { const char *pname, *cmd = NULL; if (bp->bio_to != NULL) pname = bp->bio_to->name; else pname = "[unknown]"; switch (bp->bio_cmd) { case BIO_GETATTR: cmd = "GETATTR"; printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute); return; case BIO_FLUSH: cmd = "FLUSH"; printf("%s[%s]", pname, cmd); return; case BIO_READ: cmd = "READ"; break; case BIO_WRITE: cmd = "WRITE"; break; case BIO_DELETE: cmd = "DELETE"; break; default: cmd = "UNKNOWN"; printf("%s[%s()]", pname, cmd); return; } printf("%s[%s(offset=%jd, length=%jd)]", pname, cmd, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); } Index: head/sys/geom/geom_subr.c =================================================================== --- head/sys/geom/geom_subr.c (revision 297954) +++ head/sys/geom/geom_subr.c (revision 297955) @@ -1,1535 +1,1535 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. 
* * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KDB #include #endif struct class_list_head g_classes = LIST_HEAD_INITIALIZER(g_classes); static struct g_tailq_head geoms = TAILQ_HEAD_INITIALIZER(geoms); char *g_wait_event, *g_wait_up, *g_wait_down, *g_wait_sim; struct g_hh00 { struct g_class *mp; struct g_provider *pp; off_t size; int error; int post; }; /* * This event offers a new class a chance to taste all preexisting providers. */ static void g_load_class(void *arg, int flag) { struct g_hh00 *hh; struct g_class *mp2, *mp; struct g_geom *gp; struct g_provider *pp; g_topology_assert(); if (flag == EV_CANCEL) /* XXX: can't happen ? 
*/ return; if (g_shutdown) return; hh = arg; mp = hh->mp; hh->error = 0; if (hh->post) { g_free(hh); hh = NULL; } g_trace(G_T_TOPOLOGY, "g_load_class(%s)", mp->name); KASSERT(mp->name != NULL && *mp->name != '\0', ("GEOM class has no name")); LIST_FOREACH(mp2, &g_classes, class) { if (mp2 == mp) { printf("The GEOM class %s is already loaded.\n", mp2->name); if (hh != NULL) hh->error = EEXIST; return; } else if (strcmp(mp2->name, mp->name) == 0) { printf("A GEOM class %s is already loaded.\n", mp2->name); if (hh != NULL) hh->error = EEXIST; return; } } LIST_INIT(&mp->geom); LIST_INSERT_HEAD(&g_classes, mp, class); if (mp->init != NULL) mp->init(mp); if (mp->taste == NULL) return; LIST_FOREACH(mp2, &g_classes, class) { if (mp == mp2) continue; LIST_FOREACH(gp, &mp2->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { mp->taste(mp, pp, 0); g_topology_assert(); } } } } static int g_unload_class(struct g_class *mp) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; int error; g_topology_lock(); g_trace(G_T_TOPOLOGY, "g_unload_class(%s)", mp->name); retry: G_VALID_CLASS(mp); LIST_FOREACH(gp, &mp->geom, geom) { /* We refuse to unload if anything is open */ LIST_FOREACH(pp, &gp->provider, provider) if (pp->acr || pp->acw || pp->ace) { g_topology_unlock(); return (EBUSY); } LIST_FOREACH(cp, &gp->consumer, consumer) if (cp->acr || cp->acw || cp->ace) { g_topology_unlock(); return (EBUSY); } /* If the geom is withering, wait for it to finish. */ if (gp->flags & G_GEOM_WITHER) { g_topology_sleep(mp, 1); goto retry; } } /* * We allow unloading if we have no geoms, or a class * method we can use to get rid of them. */ if (!LIST_EMPTY(&mp->geom) && mp->destroy_geom == NULL) { g_topology_unlock(); return (EOPNOTSUPP); } /* Bar new entries */ mp->taste = NULL; mp->config = NULL; LIST_FOREACH(gp, &mp->geom, geom) { error = mp->destroy_geom(NULL, mp, gp); if (error != 0) { g_topology_unlock(); return (error); } } /* Wait for withering to finish. */ for (;;) { gp = LIST_FIRST(&mp->geom); if (gp == NULL) break; KASSERT(gp->flags & G_GEOM_WITHER, ("Non-withering geom in class %s", mp->name)); g_topology_sleep(mp, 1); } G_VALID_CLASS(mp); if (mp->fini != NULL) mp->fini(mp); LIST_REMOVE(mp, class); g_topology_unlock(); return (0); } int g_modevent(module_t mod, int type, void *data) { struct g_hh00 *hh; int error; static int g_ignition; struct g_class *mp; mp = data; if (mp->version != G_VERSION) { printf("GEOM class %s has Wrong version %x\n", mp->name, mp->version); return (EINVAL); } if (!g_ignition) { g_ignition++; g_init(); } error = EOPNOTSUPP; switch (type) { case MOD_LOAD: g_trace(G_T_TOPOLOGY, "g_modevent(%s, LOAD)", mp->name); hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->mp = mp; /* * Once the system is not cold, MOD_LOAD calls will be * from the userland and the g_event thread will be able * to acknowledge their completion. 
*/ if (cold) { hh->post = 1; error = g_post_event(g_load_class, hh, M_WAITOK, NULL); } else { error = g_waitfor_event(g_load_class, hh, M_WAITOK, NULL); if (error == 0) error = hh->error; g_free(hh); } break; case MOD_UNLOAD: g_trace(G_T_TOPOLOGY, "g_modevent(%s, UNLOAD)", mp->name); DROP_GIANT(); error = g_unload_class(mp); PICKUP_GIANT(); if (error == 0) { KASSERT(LIST_EMPTY(&mp->geom), ("Unloaded class (%s) still has geom", mp->name)); } break; } return (error); } static void g_retaste_event(void *arg, int flag) { struct g_class *mp, *mp2; struct g_geom *gp; struct g_hh00 *hh; struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); if (flag == EV_CANCEL) /* XXX: can't happen ? */ return; if (g_shutdown || g_notaste) return; hh = arg; mp = hh->mp; hh->error = 0; if (hh->post) { g_free(hh); hh = NULL; } g_trace(G_T_TOPOLOGY, "g_retaste(%s)", mp->name); LIST_FOREACH(mp2, &g_classes, class) { LIST_FOREACH(gp, &mp2->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (pp->acr || pp->acw || pp->ace) continue; LIST_FOREACH(cp, &pp->consumers, consumers) { if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; } if (cp != NULL) { cp->flags |= G_CF_ORPHAN; g_wither_geom(cp->geom, ENXIO); } mp->taste(mp, pp, 0); g_topology_assert(); } } } } int g_retaste(struct g_class *mp) { struct g_hh00 *hh; int error; if (mp->taste == NULL) return (EINVAL); hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->mp = mp; if (cold) { hh->post = 1; error = g_post_event(g_retaste_event, hh, M_WAITOK, NULL); } else { error = g_waitfor_event(g_retaste_event, hh, M_WAITOK, NULL); if (error == 0) error = hh->error; g_free(hh); } return (error); } struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...) { struct g_geom *gp; va_list ap; struct sbuf *sb; g_topology_assert(); G_VALID_CLASS(mp); sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); gp = g_malloc(sizeof *gp, M_WAITOK | M_ZERO); gp->name = g_malloc(sbuf_len(sb) + 1, M_WAITOK | M_ZERO); gp->class = mp; gp->rank = 1; LIST_INIT(&gp->consumer); LIST_INIT(&gp->provider); LIST_INSERT_HEAD(&mp->geom, gp, geom); TAILQ_INSERT_HEAD(&geoms, gp, geoms); strcpy(gp->name, sbuf_data(sb)); sbuf_delete(sb); /* Fill in defaults from class */ gp->start = mp->start; gp->spoiled = mp->spoiled; gp->attrchanged = mp->attrchanged; gp->providergone = mp->providergone; gp->dumpconf = mp->dumpconf; gp->access = mp->access; gp->orphan = mp->orphan; gp->ioctl = mp->ioctl; gp->resize = mp->resize; return (gp); } void g_destroy_geom(struct g_geom *gp) { g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_destroy_geom(%p(%s))", gp, gp->name); KASSERT(LIST_EMPTY(&gp->consumer), ("g_destroy_geom(%s) with consumer(s) [%p]", gp->name, LIST_FIRST(&gp->consumer))); KASSERT(LIST_EMPTY(&gp->provider), ("g_destroy_geom(%s) with provider(s) [%p]", gp->name, LIST_FIRST(&gp->provider))); g_cancel_event(gp); LIST_REMOVE(gp, geom); TAILQ_REMOVE(&geoms, gp, geoms); g_free(gp->name); g_free(gp); } /* * This function is called (repeatedly) until the geom has withered away. */ void g_wither_geom(struct g_geom *gp, int error) { struct g_provider *pp; g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_wither_geom(%p(%s))", gp, gp->name); if (!(gp->flags & G_GEOM_WITHER)) { gp->flags |= G_GEOM_WITHER; LIST_FOREACH(pp, &gp->provider, provider) if (!(pp->flags & G_PF_ORPHAN)) g_orphan_provider(pp, error); } g_do_wither(); } /* * Convenience function to destroy a particular provider. 
 */
void
g_wither_provider(struct g_provider *pp, int error)
{

        pp->flags |= G_PF_WITHER;
        if (!(pp->flags & G_PF_ORPHAN))
                g_orphan_provider(pp, error);
}

/*
 * This function is called (repeatedly) until the geom has withered away.
 */
void
g_wither_geom_close(struct g_geom *gp, int error)
{
        struct g_consumer *cp;

        g_topology_assert();
        G_VALID_GEOM(gp);
        g_trace(G_T_TOPOLOGY, "g_wither_geom_close(%p(%s))", gp, gp->name);
        LIST_FOREACH(cp, &gp->consumer, consumer)
                if (cp->acr || cp->acw || cp->ace)
                        g_access(cp, -cp->acr, -cp->acw, -cp->ace);
        g_wither_geom(gp, error);
}

/*
 * This function is called (repeatedly) until we can't wash away more
 * withered bits at present.
 */
void
g_wither_washer()
{
        struct g_class *mp;
        struct g_geom *gp, *gp2;
        struct g_provider *pp, *pp2;
        struct g_consumer *cp, *cp2;

        g_topology_assert();
        LIST_FOREACH(mp, &g_classes, class) {
                LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
                        LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) {
                                if (!(pp->flags & G_PF_WITHER))
                                        continue;
                                if (LIST_EMPTY(&pp->consumers))
                                        g_destroy_provider(pp);
                        }
                        if (!(gp->flags & G_GEOM_WITHER))
                                continue;
                        LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) {
                                if (LIST_EMPTY(&pp->consumers))
                                        g_destroy_provider(pp);
                        }
                        LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp2) {
                                if (cp->acr || cp->acw || cp->ace)
                                        continue;
                                if (cp->provider != NULL)
                                        g_detach(cp);
                                g_destroy_consumer(cp);
                        }
                        if (LIST_EMPTY(&gp->provider) &&
                            LIST_EMPTY(&gp->consumer))
                                g_destroy_geom(gp);
                }
        }
}

struct g_consumer *
g_new_consumer(struct g_geom *gp)
{
        struct g_consumer *cp;

        g_topology_assert();
        G_VALID_GEOM(gp);
        KASSERT(!(gp->flags & G_GEOM_WITHER),
            ("g_new_consumer on WITHERing geom(%s) (class %s)",
            gp->name, gp->class->name));
        KASSERT(gp->orphan != NULL,
            ("g_new_consumer on geom(%s) (class %s) without orphan",
            gp->name, gp->class->name));
        cp = g_malloc(sizeof *cp, M_WAITOK | M_ZERO);
        cp->geom = gp;
        cp->stat = devstat_new_entry(cp, -1, 0, DEVSTAT_ALL_SUPPORTED,
            DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
        LIST_INSERT_HEAD(&gp->consumer, cp, consumer);
        return(cp);
}

void
g_destroy_consumer(struct g_consumer *cp)
{
        struct g_geom *gp;

        g_topology_assert();
        G_VALID_CONSUMER(cp);
        g_trace(G_T_TOPOLOGY, "g_destroy_consumer(%p)", cp);
        KASSERT (cp->provider == NULL, ("g_destroy_consumer but attached"));
        KASSERT (cp->acr == 0, ("g_destroy_consumer with acr"));
        KASSERT (cp->acw == 0, ("g_destroy_consumer with acw"));
        KASSERT (cp->ace == 0, ("g_destroy_consumer with ace"));
        g_cancel_event(cp);
        gp = cp->geom;
        LIST_REMOVE(cp, consumer);
        devstat_remove_entry(cp->stat);
        g_free(cp);
        if (gp->flags & G_GEOM_WITHER)
                g_do_wither();
}

static void
g_new_provider_event(void *arg, int flag)
{
        struct g_class *mp;
        struct g_provider *pp;
        struct g_consumer *cp, *next_cp;

        g_topology_assert();
        if (flag == EV_CANCEL)
                return;
        if (g_shutdown)
                return;
        pp = arg;
        G_VALID_PROVIDER(pp);
        KASSERT(!(pp->flags & G_PF_WITHER),
            ("g_new_provider_event but withered"));
        LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, next_cp) {
                if ((cp->flags & G_CF_ORPHAN) == 0 &&
                    cp->geom->attrchanged != NULL)
                        cp->geom->attrchanged(cp, "GEOM::media");
        }
        if (g_notaste)
                return;
        LIST_FOREACH(mp, &g_classes, class) {
                if (mp->taste == NULL)
                        continue;
                LIST_FOREACH(cp, &pp->consumers, consumers)
                        if (cp->geom->class == mp &&
                            (cp->flags & G_CF_ORPHAN) == 0)
                                break;
                if (cp != NULL)
                        continue;
                mp->taste(mp, pp, 0);
                g_topology_assert();
        }
}

struct g_provider *
g_new_providerf(struct g_geom *gp, const char *fmt, ...)
{ struct g_provider *pp; struct sbuf *sb; va_list ap; g_topology_assert(); G_VALID_GEOM(gp); KASSERT(gp->access != NULL, ("new provider on geom(%s) without ->access (class %s)", gp->name, gp->class->name)); KASSERT(gp->start != NULL, ("new provider on geom(%s) without ->start (class %s)", gp->name, gp->class->name)); KASSERT(!(gp->flags & G_GEOM_WITHER), ("new provider on WITHERing geom(%s) (class %s)", gp->name, gp->class->name)); sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); pp = g_malloc(sizeof *pp + sbuf_len(sb) + 1, M_WAITOK | M_ZERO); pp->name = (char *)(pp + 1); strcpy(pp->name, sbuf_data(sb)); sbuf_delete(sb); LIST_INIT(&pp->consumers); pp->error = ENXIO; pp->geom = gp; pp->stat = devstat_new_entry(pp, -1, 0, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); LIST_INSERT_HEAD(&gp->provider, pp, provider); g_post_event(g_new_provider_event, pp, M_WAITOK, pp, gp, NULL); return (pp); } void g_error_provider(struct g_provider *pp, int error) { /* G_VALID_PROVIDER(pp); We may not have g_topology */ pp->error = error; } static void g_resize_provider_event(void *arg, int flag) { struct g_hh00 *hh; struct g_class *mp; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp, *cp2; off_t size; g_topology_assert(); if (g_shutdown) return; hh = arg; pp = hh->pp; size = hh->size; g_free(hh); G_VALID_PROVIDER(pp); g_trace(G_T_TOPOLOGY, "g_resize_provider_event(%p)", pp); LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { gp = cp->geom; if (gp->resize == NULL && size < pp->mediasize) { cp->flags |= G_CF_ORPHAN; cp->geom->orphan(cp); } } pp->mediasize = size; LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { gp = cp->geom; if (gp->resize != NULL) gp->resize(cp); } /* * After resizing, the previously invalid GEOM class metadata * might become valid. This means we should retaste. */ LIST_FOREACH(mp, &g_classes, class) { if (mp->taste == NULL) continue; LIST_FOREACH(cp, &pp->consumers, consumers) if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; if (cp != NULL) continue; mp->taste(mp, pp, 0); g_topology_assert(); } } void g_resize_provider(struct g_provider *pp, off_t size) { struct g_hh00 *hh; G_VALID_PROVIDER(pp); if (size == pp->mediasize) return; hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->pp = pp; hh->size = size; g_post_event(g_resize_provider_event, hh, M_WAITOK, NULL); } #ifndef _PATH_DEV #define _PATH_DEV "/dev/" #endif struct g_provider * g_provider_by_name(char const *arg) { struct g_class *cp; struct g_geom *gp; struct g_provider *pp, *wpp; if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) arg += sizeof(_PATH_DEV) - 1; wpp = NULL; LIST_FOREACH(cp, &g_classes, class) { LIST_FOREACH(gp, &cp->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (strcmp(arg, pp->name) != 0) continue; if ((gp->flags & G_GEOM_WITHER) == 0 && (pp->flags & G_PF_WITHER) == 0) return (pp); else wpp = pp; } } } return (wpp); } void g_destroy_provider(struct g_provider *pp) { struct g_geom *gp; g_topology_assert(); G_VALID_PROVIDER(pp); KASSERT(LIST_EMPTY(&pp->consumers), ("g_destroy_provider but attached")); KASSERT (pp->acr == 0, ("g_destroy_provider with acr")); KASSERT (pp->acw == 0, ("g_destroy_provider with acw")); KASSERT (pp->ace == 0, ("g_destroy_provider with ace")); g_cancel_event(pp); LIST_REMOVE(pp, provider); gp = pp->geom; devstat_remove_entry(pp->stat); /* * If a callback was provided, send notification that the provider * is now gone. 
 */
        if (gp->providergone != NULL)
                gp->providergone(pp);
        g_free(pp);
        if ((gp->flags & G_GEOM_WITHER))
                g_do_wither();
}

/*
 * We keep the "geoms" list sorted by topological order (== increasing
 * numerical rank) at all times.
 * When an attach is done, the attaching geom's rank is invalidated
 * and it is moved to the tail of the list.
 * All geoms later in the sequence have their ranks reevaluated in
 * sequence. If we cannot assign rank to a geom because its
 * prerequisites do not have rank, we move that element to the tail
 * of the sequence with invalid rank as well.
 * At some point we encounter our original geom and if we still fail
 * to assign it a rank, there must be a loop and we fail back to
 * g_attach(), which detaches again and calls redo_rank again
 * to fix up the damage.
 * It would be much simpler code-wise to do it recursively, but we
 * can't risk that on the kernel stack.
 */
static int
redo_rank(struct g_geom *gp)
{
        struct g_consumer *cp;
        struct g_geom *gp1, *gp2;
        int n, m;

        g_topology_assert();
        G_VALID_GEOM(gp);

        /* Invalidate this geom's rank and move it to the tail */
        gp1 = TAILQ_NEXT(gp, geoms);
        if (gp1 != NULL) {
                gp->rank = 0;
                TAILQ_REMOVE(&geoms, gp, geoms);
                TAILQ_INSERT_TAIL(&geoms, gp, geoms);
        } else {
                gp1 = gp;
        }

        /* re-rank the rest of the sequence */
        for (; gp1 != NULL; gp1 = gp2) {
                gp1->rank = 0;
                m = 1;
                LIST_FOREACH(cp, &gp1->consumer, consumer) {
                        if (cp->provider == NULL)
                                continue;
                        n = cp->provider->geom->rank;
                        if (n == 0) {
                                m = 0;
                                break;
                        } else if (n >= m)
                                m = n + 1;
                }
                gp1->rank = m;
                gp2 = TAILQ_NEXT(gp1, geoms);

                /* got a rank, moving on */
                if (m != 0)
                        continue;

                /* no rank to original geom means loop */
                if (gp == gp1)
                        return (ELOOP);

                /* no rank, put it at the end and move on */
                TAILQ_REMOVE(&geoms, gp1, geoms);
                TAILQ_INSERT_TAIL(&geoms, gp1, geoms);
        }
        return (0);
}

int
g_attach(struct g_consumer *cp, struct g_provider *pp)
{
        int error;

        g_topology_assert();
        G_VALID_CONSUMER(cp);
        G_VALID_PROVIDER(pp);
        g_trace(G_T_TOPOLOGY, "g_attach(%p, %p)", cp, pp);
        KASSERT(cp->provider == NULL, ("attach but attached"));
        cp->provider = pp;
        LIST_INSERT_HEAD(&pp->consumers, cp, consumers);
        error = redo_rank(cp->geom);
        if (error) {
                LIST_REMOVE(cp, consumers);
                cp->provider = NULL;
                redo_rank(cp->geom);
        }
        return (error);
}

void
g_detach(struct g_consumer *cp)
{
        struct g_provider *pp;

        g_topology_assert();
        G_VALID_CONSUMER(cp);
        g_trace(G_T_TOPOLOGY, "g_detach(%p)", cp);
        KASSERT(cp->provider != NULL, ("detach but not attached"));
        KASSERT(cp->acr == 0, ("detach but nonzero acr"));
        KASSERT(cp->acw == 0, ("detach but nonzero acw"));
        KASSERT(cp->ace == 0, ("detach but nonzero ace"));
        KASSERT(cp->nstart == cp->nend, ("detach with active requests"));
        pp = cp->provider;
        LIST_REMOVE(cp, consumers);
        cp->provider = NULL;
        if ((cp->geom->flags & G_GEOM_WITHER) ||
            (pp->geom->flags & G_GEOM_WITHER) ||
            (pp->flags & G_PF_WITHER))
                g_do_wither();
        redo_rank(cp->geom);
}

/*
 * g_access()
 *
 * Access-check with delta values. The question asked is "can provider
 * "cp" change the access counters by the relative amounts dc[rwe] ?"
*/ int g_access(struct g_consumer *cp, int dcr, int dcw, int dce) { struct g_provider *pp; int pr,pw,pe; int error; g_topology_assert(); G_VALID_CONSUMER(cp); pp = cp->provider; KASSERT(pp != NULL, ("access but not attached")); G_VALID_PROVIDER(pp); g_trace(G_T_ACCESS, "g_access(%p(%s), %d, %d, %d)", cp, pp->name, dcr, dcw, dce); KASSERT(cp->acr + dcr >= 0, ("access resulting in negative acr")); KASSERT(cp->acw + dcw >= 0, ("access resulting in negative acw")); KASSERT(cp->ace + dce >= 0, ("access resulting in negative ace")); KASSERT(dcr != 0 || dcw != 0 || dce != 0, ("NOP access request")); KASSERT(pp->geom->access != NULL, ("NULL geom->access")); /* * If our class cares about being spoiled, and we have been, we * are probably just ahead of the event telling us that. Fail * now rather than having to unravel this later. */ if (cp->geom->spoiled != NULL && (cp->flags & G_CF_SPOILED) && (dcr > 0 || dcw > 0 || dce > 0)) return (ENXIO); /* * Figure out what counts the provider would have had, if this * consumer had (r0w0e0) at this time. */ pr = pp->acr - cp->acr; pw = pp->acw - cp->acw; pe = pp->ace - cp->ace; g_trace(G_T_ACCESS, "open delta:[r%dw%de%d] old:[r%dw%de%d] provider:[r%dw%de%d] %p(%s)", dcr, dcw, dce, cp->acr, cp->acw, cp->ace, pp->acr, pp->acw, pp->ace, pp, pp->name); /* If foot-shooting is enabled, any open on rank#1 is OK */ if ((g_debugflags & 16) && pp->geom->rank == 1) ; /* If we try exclusive but already write: fail */ else if (dce > 0 && pw > 0) return (EPERM); /* If we try write but already exclusive: fail */ else if (dcw > 0 && pe > 0) return (EPERM); /* If we try to open more but provider is error'ed: fail */ else if ((dcr > 0 || dcw > 0 || dce > 0) && pp->error != 0) return (pp->error); /* Ok then... */ error = pp->geom->access(pp, dcr, dcw, dce); KASSERT(dcr > 0 || dcw > 0 || dce > 0 || error == 0, ("Geom provider %s::%s dcr=%d dcw=%d dce=%d error=%d failed " "closing ->access()", pp->geom->class->name, pp->name, dcr, dcw, dce, error)); if (!error) { /* * If we open first write, spoil any partner consumers. * If we close last write and provider is not errored, * trigger re-taste. 
*/ if (pp->acw == 0 && dcw != 0) g_spoil(pp, cp); else if (pp->acw != 0 && pp->acw == -dcw && pp->error == 0 && !(pp->geom->flags & G_GEOM_WITHER)) g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL); pp->acr += dcr; pp->acw += dcw; pp->ace += dce; cp->acr += dcr; cp->acw += dcw; cp->ace += dce; if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) KASSERT(pp->sectorsize > 0, ("Provider %s lacks sectorsize", pp->name)); if ((cp->geom->flags & G_GEOM_WITHER) && cp->acr == 0 && cp->acw == 0 && cp->ace == 0) g_do_wither(); } return (error); } int g_handleattr_int(struct bio *bp, const char *attribute, int val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val) { return (g_handleattr(bp, attribute, &val, sizeof val)); } int g_handleattr_str(struct bio *bp, const char *attribute, const char *str) { return (g_handleattr(bp, attribute, str, 0)); } int g_handleattr(struct bio *bp, const char *attribute, const void *val, int len) { int error = 0; if (strcmp(bp->bio_attribute, attribute)) return (0); if (len == 0) { bzero(bp->bio_data, bp->bio_length); if (strlcpy(bp->bio_data, val, bp->bio_length) >= bp->bio_length) { printf("%s: %s bio_length %jd len %zu -> EFAULT\n", __func__, bp->bio_to->name, (intmax_t)bp->bio_length, strlen(val)); error = EFAULT; } } else if (bp->bio_length == len) { bcopy(val, bp->bio_data, len); } else { printf("%s: %s bio_length %jd len %d -> EFAULT\n", __func__, bp->bio_to->name, (intmax_t)bp->bio_length, len); error = EFAULT; } if (error == 0) bp->bio_completed = bp->bio_length; g_io_deliver(bp, error); return (1); } int g_std_access(struct g_provider *pp, int dr __unused, int dw __unused, int de __unused) { g_topology_assert(); G_VALID_PROVIDER(pp); return (0); } void g_std_done(struct bio *bp) { struct bio *bp2; bp2 = bp->bio_parent; if (bp2->bio_error == 0) bp2->bio_error = bp->bio_error; bp2->bio_completed += bp->bio_completed; g_destroy_bio(bp); bp2->bio_inbed++; if (bp2->bio_children == bp2->bio_inbed) g_io_deliver(bp2, bp2->bio_error); } /* XXX: maybe this is only g_slice_spoiled */ void g_std_spoiled(struct g_consumer *cp) { struct g_geom *gp; struct g_provider *pp; g_topology_assert(); G_VALID_CONSUMER(cp); g_trace(G_T_TOPOLOGY, "g_std_spoiled(%p)", cp); cp->flags |= G_CF_ORPHAN; g_detach(cp); gp = cp->geom; LIST_FOREACH(pp, &gp->provider, provider) g_orphan_provider(pp, ENXIO); g_destroy_consumer(cp); if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer)) g_destroy_geom(gp); else gp->flags |= G_GEOM_WITHER; } /* * Spoiling happens when a provider is opened for writing, but consumers * which are configured by in-band data are attached (slicers for instance). * Since the write might potentially change the in-band data, such consumers * need to re-evaluate their existence after the writing session closes. * We do this by (offering to) tear them down when the open for write happens * in return for a re-taste when it closes again. * Together with the fact that such consumers grab an 'e' bit whenever they * are open, regardless of mode, this ends up DTRT. 
*/ static void g_spoil_event(void *arg, int flag) { struct g_provider *pp; struct g_consumer *cp, *cp2; g_topology_assert(); if (flag == EV_CANCEL) return; pp = arg; G_VALID_PROVIDER(pp); g_trace(G_T_TOPOLOGY, "%s %p(%s:%s:%s)", __func__, pp, pp->geom->class->name, pp->geom->name, pp->name); for (cp = LIST_FIRST(&pp->consumers); cp != NULL; cp = cp2) { cp2 = LIST_NEXT(cp, consumers); if ((cp->flags & G_CF_SPOILED) == 0) continue; cp->flags &= ~G_CF_SPOILED; if (cp->geom->spoiled == NULL) continue; cp->geom->spoiled(cp); g_topology_assert(); } } void g_spoil(struct g_provider *pp, struct g_consumer *cp) { struct g_consumer *cp2; g_topology_assert(); G_VALID_PROVIDER(pp); G_VALID_CONSUMER(cp); LIST_FOREACH(cp2, &pp->consumers, consumers) { if (cp2 == cp) continue; /* KASSERT(cp2->acr == 0, ("spoiling cp->acr = %d", cp2->acr)); KASSERT(cp2->acw == 0, ("spoiling cp->acw = %d", cp2->acw)); */ KASSERT(cp2->ace == 0, ("spoiling cp->ace = %d", cp2->ace)); cp2->flags |= G_CF_SPOILED; } g_post_event(g_spoil_event, pp, M_WAITOK, pp, NULL); } static void g_media_changed_event(void *arg, int flag) { struct g_provider *pp; int retaste; g_topology_assert(); if (flag == EV_CANCEL) return; pp = arg; G_VALID_PROVIDER(pp); /* * If provider was not open for writing, queue retaste after spoiling. * If it was, retaste will happen automatically on close. */ retaste = (pp->acw == 0 && pp->error == 0 && !(pp->geom->flags & G_GEOM_WITHER)); g_spoil_event(arg, flag); if (retaste) g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL); } int g_media_changed(struct g_provider *pp, int flag) { struct g_consumer *cp; LIST_FOREACH(cp, &pp->consumers, consumers) cp->flags |= G_CF_SPOILED; return (g_post_event(g_media_changed_event, pp, flag, pp, NULL)); } int g_media_gone(struct g_provider *pp, int flag) { struct g_consumer *cp; LIST_FOREACH(cp, &pp->consumers, consumers) cp->flags |= G_CF_SPOILED; return (g_post_event(g_spoil_event, pp, flag, pp, NULL)); } int g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len) { int error, i; i = len; error = g_io_getattr(attr, cp, &i, var); if (error) return (error); if (i != len) return (EINVAL); return (0); } static int g_get_device_prefix_len(const char *name) { int len; if (strncmp(name, "ada", 3) == 0) len = 3; else if (strncmp(name, "ad", 2) == 0) len = 2; else return (0); if (name[len] < '0' || name[len] > '9') return (0); do { len++; } while (name[len] >= '0' && name[len] <= '9'); return (len); } int g_compare_names(const char *namea, const char *nameb) { int deva, devb; if (strcmp(namea, nameb) == 0) return (1); deva = g_get_device_prefix_len(namea); if (deva == 0) return (0); devb = g_get_device_prefix_len(nameb); if (devb == 0) return (0); if (strcmp(namea + deva, nameb + devb) == 0) return (1); return (0); } #if defined(DIAGNOSTIC) || defined(DDB) /* * This function walks the mesh and returns a non-zero integer if it * finds the argument pointer is an object. The return value indicates * which type of object it is believed to be. If topology is not locked, * this function is potentially dangerous, but we don't assert that the * topology lock is held when called from debugger. 
*/ int g_valid_obj(void const *ptr) { struct g_class *mp; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; #ifdef KDB if (kdb_active == 0) #endif g_topology_assert(); LIST_FOREACH(mp, &g_classes, class) { if (ptr == mp) return (1); LIST_FOREACH(gp, &mp->geom, geom) { if (ptr == gp) return (2); LIST_FOREACH(cp, &gp->consumer, consumer) if (ptr == cp) return (3); LIST_FOREACH(pp, &gp->provider, provider) if (ptr == pp) return (4); } } return(0); } #endif #ifdef DDB #define gprintf(...) do { \ db_printf("%*s", indent, ""); \ db_printf(__VA_ARGS__); \ } while (0) #define gprintln(...) do { \ gprintf(__VA_ARGS__); \ db_printf("\n"); \ } while (0) #define ADDFLAG(obj, flag, sflag) do { \ if ((obj)->flags & (flag)) { \ if (comma) \ strlcat(str, ",", size); \ strlcat(str, (sflag), size); \ comma = 1; \ } \ } while (0) static char * provider_flags_to_string(struct g_provider *pp, char *str, size_t size) { int comma = 0; bzero(str, size); if (pp->flags == 0) { strlcpy(str, "NONE", size); return (str); } ADDFLAG(pp, G_PF_WITHER, "G_PF_WITHER"); ADDFLAG(pp, G_PF_ORPHAN, "G_PF_ORPHAN"); return (str); } static char * geom_flags_to_string(struct g_geom *gp, char *str, size_t size) { int comma = 0; bzero(str, size); if (gp->flags == 0) { strlcpy(str, "NONE", size); return (str); } ADDFLAG(gp, G_GEOM_WITHER, "G_GEOM_WITHER"); return (str); } static void db_show_geom_consumer(int indent, struct g_consumer *cp) { if (indent == 0) { gprintln("consumer: %p", cp); gprintln(" class: %s (%p)", cp->geom->class->name, cp->geom->class); gprintln(" geom: %s (%p)", cp->geom->name, cp->geom); if (cp->provider == NULL) gprintln(" provider: none"); else { gprintln(" provider: %s (%p)", cp->provider->name, cp->provider); } gprintln(" access: r%dw%de%d", cp->acr, cp->acw, cp->ace); gprintln(" flags: 0x%04x", cp->flags); gprintln(" nstart: %u", cp->nstart); gprintln(" nend: %u", cp->nend); } else { gprintf("consumer: %p (%s), access=r%dw%de%d", cp, cp->provider != NULL ? 
cp->provider->name : "none", cp->acr, cp->acw, cp->ace); if (cp->flags) db_printf(", flags=0x%04x", cp->flags); db_printf("\n"); } } static void db_show_geom_provider(int indent, struct g_provider *pp) { struct g_consumer *cp; char flags[64]; if (indent == 0) { gprintln("provider: %s (%p)", pp->name, pp); gprintln(" class: %s (%p)", pp->geom->class->name, pp->geom->class); gprintln(" geom: %s (%p)", pp->geom->name, pp->geom); gprintln(" mediasize: %jd", (intmax_t)pp->mediasize); gprintln(" sectorsize: %u", pp->sectorsize); gprintln(" stripesize: %u", pp->stripesize); gprintln(" stripeoffset: %u", pp->stripeoffset); gprintln(" access: r%dw%de%d", pp->acr, pp->acw, pp->ace); gprintln(" flags: %s (0x%04x)", provider_flags_to_string(pp, flags, sizeof(flags)), pp->flags); gprintln(" error: %d", pp->error); gprintln(" nstart: %u", pp->nstart); gprintln(" nend: %u", pp->nend); if (LIST_EMPTY(&pp->consumers)) gprintln(" consumers: none"); } else { gprintf("provider: %s (%p), access=r%dw%de%d", pp->name, pp, pp->acr, pp->acw, pp->ace); if (pp->flags != 0) { db_printf(", flags=%s (0x%04x)", provider_flags_to_string(pp, flags, sizeof(flags)), pp->flags); } db_printf("\n"); } if (!LIST_EMPTY(&pp->consumers)) { LIST_FOREACH(cp, &pp->consumers, consumers) { db_show_geom_consumer(indent + 2, cp); if (db_pager_quit) break; } } } static void db_show_geom_geom(int indent, struct g_geom *gp) { struct g_provider *pp; struct g_consumer *cp; char flags[64]; if (indent == 0) { gprintln("geom: %s (%p)", gp->name, gp); gprintln(" class: %s (%p)", gp->class->name, gp->class); gprintln(" flags: %s (0x%04x)", geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags); gprintln(" rank: %d", gp->rank); if (LIST_EMPTY(&gp->provider)) gprintln(" providers: none"); if (LIST_EMPTY(&gp->consumer)) gprintln(" consumers: none"); } else { gprintf("geom: %s (%p), rank=%d", gp->name, gp, gp->rank); if (gp->flags != 0) { db_printf(", flags=%s (0x%04x)", geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags); } db_printf("\n"); } if (!LIST_EMPTY(&gp->provider)) { LIST_FOREACH(pp, &gp->provider, provider) { db_show_geom_provider(indent + 2, pp); if (db_pager_quit) break; } } if (!LIST_EMPTY(&gp->consumer)) { LIST_FOREACH(cp, &gp->consumer, consumer) { db_show_geom_consumer(indent + 2, cp); if (db_pager_quit) break; } } } static void db_show_geom_class(struct g_class *mp) { struct g_geom *gp; db_printf("class: %s (%p)\n", mp->name, mp); LIST_FOREACH(gp, &mp->geom, geom) { db_show_geom_geom(2, gp); if (db_pager_quit) break; } } /* * Print the GEOM topology or the given object. */ DB_SHOW_COMMAND(geom, db_show_geom) { struct g_class *mp; if (!have_addr) { /* No address given, print the entire topology. 
*/ LIST_FOREACH(mp, &g_classes, class) { db_show_geom_class(mp); db_printf("\n"); if (db_pager_quit) break; } } else { switch (g_valid_obj((void *)addr)) { case 1: db_show_geom_class((struct g_class *)addr); break; case 2: db_show_geom_geom(0, (struct g_geom *)addr); break; case 3: db_show_geom_consumer(0, (struct g_consumer *)addr); break; case 4: db_show_geom_provider(0, (struct g_provider *)addr); break; default: db_printf("Not a GEOM object.\n"); break; } } } static void db_print_bio_cmd(struct bio *bp) { db_printf(" cmd: "); switch (bp->bio_cmd) { case BIO_READ: db_printf("BIO_READ"); break; case BIO_WRITE: db_printf("BIO_WRITE"); break; case BIO_DELETE: db_printf("BIO_DELETE"); break; case BIO_GETATTR: db_printf("BIO_GETATTR"); break; case BIO_FLUSH: db_printf("BIO_FLUSH"); break; case BIO_CMD0: db_printf("BIO_CMD0"); break; case BIO_CMD1: db_printf("BIO_CMD1"); break; case BIO_CMD2: db_printf("BIO_CMD2"); break; default: db_printf("UNKNOWN"); break; } db_printf("\n"); } static void db_print_bio_flags(struct bio *bp) { int comma; comma = 0; db_printf(" flags: "); if (bp->bio_flags & BIO_ERROR) { db_printf("BIO_ERROR"); comma = 1; } if (bp->bio_flags & BIO_DONE) { db_printf("%sBIO_DONE", (comma ? ", " : "")); comma = 1; } if (bp->bio_flags & BIO_ONQUEUE) db_printf("%sBIO_ONQUEUE", (comma ? ", " : "")); db_printf("\n"); } /* * Print useful information in a BIO */ DB_SHOW_COMMAND(bio, db_show_bio) { struct bio *bp; if (have_addr) { bp = (struct bio *)addr; db_printf("BIO %p\n", bp); db_print_bio_cmd(bp); db_print_bio_flags(bp); - db_printf(" cflags: 0x%hhx\n", bp->bio_cflags); - db_printf(" pflags: 0x%hhx\n", bp->bio_pflags); + db_printf(" cflags: 0x%hx\n", bp->bio_cflags); + db_printf(" pflags: 0x%hx\n", bp->bio_pflags); db_printf(" offset: %jd\n", (intmax_t)bp->bio_offset); db_printf(" length: %jd\n", (intmax_t)bp->bio_length); db_printf(" bcount: %ld\n", bp->bio_bcount); db_printf(" resid: %ld\n", bp->bio_resid); db_printf(" completed: %jd\n", (intmax_t)bp->bio_completed); db_printf(" children: %u\n", bp->bio_children); db_printf(" inbed: %u\n", bp->bio_inbed); db_printf(" error: %d\n", bp->bio_error); db_printf(" parent: %p\n", bp->bio_parent); db_printf(" driver1: %p\n", bp->bio_driver1); db_printf(" driver2: %p\n", bp->bio_driver2); db_printf(" caller1: %p\n", bp->bio_caller1); db_printf(" caller2: %p\n", bp->bio_caller2); db_printf(" bio_from: %p\n", bp->bio_from); db_printf(" bio_to: %p\n", bp->bio_to); } } #undef gprintf #undef gprintln #undef ADDFLAG #endif /* DDB */ Index: head/sys/geom/mirror/g_mirror.c =================================================================== --- head/sys/geom/mirror/g_mirror.c (revision 297954) +++ head/sys/geom/mirror/g_mirror.c (revision 297955) @@ -1,3353 +1,3353 @@ /*- * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
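The geom_io.c hunk above changes the DDB "show bio" output for bio_cflags and bio_pflags from "0x%hhx" to "0x%hx". In printf-style conversions, "hh" expects an unsigned char and "h" an unsigned short, so the new format is the one you would want once these flag fields are 16 bits wide rather than 8; the widening itself is not visible in the hunks shown here, so treat that as an inference. A minimal userland sketch of the two length modifiers, with plain printf() standing in for db_printf() and hypothetical locals standing in for the bio fields:

#include <stdio.h>

/*
 * Illustrative only: printf() stands in for the kernel's db_printf(), and
 * the two locals are hypothetical stand-ins for bio_cflags/bio_pflags.
 */
int
main(void)
{
	unsigned char  flags8  = 0xa5;		/* 8-bit field: "hh" modifier */
	unsigned short flags16 = 0x01a5;	/* 16-bit field: "h" modifier */

	printf(" cflags: 0x%hhx\n", flags8);	/* prints " cflags: 0xa5" */
	printf(" cflags: 0x%hx\n", flags16);	/* prints " cflags: 0x1a5" */
	return (0);
}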
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_mirror, "GEOM mirroring support"); static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0, "GEOM_MIRROR stuff"); u_int g_mirror_debug = 0; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0, "Debug level"); static u_int g_mirror_timeout = 4; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout, 0, "Time to wait on all mirror components"); static u_int g_mirror_idletime = 5; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_mirror_idletime, 0, "Mark components as clean when idling"); static u_int g_mirror_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_mirror_syncreqs = 2; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests."); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_mirror_post_sync = NULL; static int g_mirror_shutdown = 0; static int g_mirror_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_mirror_taste; static g_resize_t g_mirror_resize; static void g_mirror_init(struct g_class *mp); static void g_mirror_fini(struct g_class *mp); struct g_class g_mirror_class = { .name = G_MIRROR_CLASS_NAME, .version = G_VERSION, .ctlreq = g_mirror_config, .taste = g_mirror_taste, .destroy_geom = g_mirror_destroy_geom, .init = g_mirror_init, .fini = g_mirror_fini, .resize = g_mirror_resize }; static void g_mirror_destroy_provider(struct g_mirror_softc *sc); static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state); static void g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force); static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type); static void g_mirror_register_request(struct bio *bp); static void g_mirror_sync_release(struct g_mirror_softc *sc); static const char * g_mirror_disk_state2str(int state) { switch (state) { case G_MIRROR_DISK_STATE_NONE: return ("NONE"); case G_MIRROR_DISK_STATE_NEW: 
return ("NEW"); case G_MIRROR_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_MIRROR_DISK_STATE_STALE: return ("STALE"); case G_MIRROR_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_MIRROR_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); case G_MIRROR_DISK_STATE_DESTROY: return ("DESTROY"); default: return ("INVALID"); } } static const char * g_mirror_device_state2str(int state) { switch (state) { case G_MIRROR_DEVICE_STATE_STARTING: return ("STARTING"); case G_MIRROR_DEVICE_STATE_RUNNING: return ("RUNNING"); default: return ("INVALID"); } } static const char * g_mirror_get_diskname(struct g_mirror_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } /* * --- Events handling functions --- * Events in geom_mirror are used to maintain disks and device status * from one thread to simplify locking. */ static void g_mirror_event_free(struct g_mirror_event *ep) { free(ep, M_MIRROR); } int g_mirror_event_send(void *arg, int state, int flags) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct g_mirror_event *ep; int error; ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK); G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_MIRROR_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_mirror_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_mirror_event * g_mirror_event_get(struct g_mirror_softc *sc) { struct g_mirror_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_mirror_event_cancel(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_mirror_ndisks(struct g_mirror_softc *sc, int state) { struct g_mirror_disk *disk; u_int n = 0; sx_assert(&sc->sc_lock, SX_LOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (state == -1 || disk->d_state == state) n++; } return (n); } /* * Find a disk in mirror by its disk ID. 
*/ static struct g_mirror_disk * g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id) { struct g_mirror_disk *disk; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_id == id) return (disk); } return (NULL); } static u_int g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_mirror_nrequests(sc, cp) > 0) { G_MIRROR_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_mirror_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_mirror_is_busy(sc, cp)) return; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL); return; } G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); cp->flags |= G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } g_topology_unlock(); disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk)); return (0); } static void g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_mirror_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. 
*/ static struct g_mirror_disk * g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md, int *errorp) { struct g_mirror_disk *disk; int i, error; disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO); if (disk == NULL) { error = ENOMEM; goto fail; } disk->d_softc = sc; error = g_mirror_connect_disk(disk, pp); if (error != 0) goto fail; disk->d_id = md->md_did; disk->d_state = G_MIRROR_DISK_STATE_NONE; disk->d_priority = md->md_priority; disk->d_flags = md->md_dflags; error = g_getattr("GEOM::candelete", disk->d_consumer, &i); if (error == 0 && i != 0) disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE; if (md->md_provider[0] != '\0') disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); fail: if (errorp != NULL) *errorp = error; if (disk != NULL) free(disk, M_MIRROR); return (NULL); } static void g_mirror_destroy_disk(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); LIST_REMOVE(disk, d_next); g_mirror_event_cancel(disk); if (sc->sc_hint == disk) sc->sc_hint = NULL; switch (disk->d_state) { case G_MIRROR_DISK_STATE_SYNCHRONIZING: g_mirror_sync_stop(disk, 1); /* FALLTHROUGH */ case G_MIRROR_DISK_STATE_NEW: case G_MIRROR_DISK_STATE_STALE: case G_MIRROR_DISK_STATE_ACTIVE: g_topology_lock(); g_mirror_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); free(disk, M_MIRROR); break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } } static void g_mirror_destroy_device(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_mirror_event *ep; struct g_geom *gp; struct g_consumer *cp, *tmpcp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL; disk = LIST_FIRST(&sc->sc_disks)) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); g_mirror_destroy_disk(disk); } while ((ep = g_mirror_event_get(sc)) != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) g_mirror_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); g_topology_lock(); LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) { g_mirror_disconnect_consumer(sc, cp); } g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); g_topology_unlock(); mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_done_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); } static void g_mirror_orphan(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } /* * Function should return the next active disk on the list. * It is possible that it will be the same disk as given. 
* If there are no active disks on list, NULL is returned. */ static __inline struct g_mirror_disk * g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { struct g_mirror_disk *dp; for (dp = LIST_NEXT(disk, d_next); dp != disk; dp = LIST_NEXT(dp, d_next)) { if (dp == NULL) dp = LIST_FIRST(&sc->sc_disks); if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) return (NULL); return (dp); } static struct g_mirror_disk * g_mirror_get_disk(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; if (sc->sc_hint == NULL) { sc->sc_hint = LIST_FIRST(&sc->sc_disks); if (sc->sc_hint == NULL) return (NULL); } disk = sc->sc_hint; if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) { disk = g_mirror_find_next(sc, disk); if (disk == NULL) return (NULL); } sc->sc_hint = g_mirror_find_next(sc, disk); return (disk); } static int g_mirror_write_metadata(struct g_mirror_disk *disk, struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO); if (md != NULL && (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) { /* * Handle the case, when the size of parent provider reduced. */ if (offset < md->md_mediasize) error = ENOSPC; else mirror_metadata_encode(md, sector); } if (error == 0) error = g_write_data(cp, offset, sector, length); free(sector, M_MIRROR); if (error != 0) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } else { G_MIRROR_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_mirror_get_diskname(disk), sc->sc_name, error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } return (error); } static int g_mirror_clear_metadata(struct g_mirror_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); error = g_mirror_write_metadata(disk, NULL); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s cleared.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } return (error); } void g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk, struct g_mirror_metadata *md) { strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic)); md->md_version = G_MIRROR_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_mid = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_slice = sc->sc_slice; md->md_balance = sc->sc_balance; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK); bzero(md->md_provider, sizeof(md->md_provider)); if (disk == NULL) 
{ md->md_did = arc4random(); md->md_priority = 0; md->md_syncid = 0; md->md_dflags = 0; md->md_sync_offset = 0; md->md_provsize = 0; } else { md->md_did = disk->d_id; md->md_priority = disk->d_priority; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = disk->d_sync.ds_offset_done; else md->md_sync_offset = 0; if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) { strlcpy(md->md_provider, disk->d_consumer->provider->name, sizeof(md->md_provider)); } md->md_provsize = disk->d_consumer->provider->mediasize; } } void g_mirror_update_metadata(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_mirror_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) g_mirror_fill_metadata(sc, disk, &md); error = g_mirror_write_metadata(disk, &md); if (error == 0) { G_MIRROR_DEBUG(2, "Metadata on %s updated.", g_mirror_get_diskname(disk)); } else { G_MIRROR_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_mirror_get_diskname(disk), error); } } static void g_mirror_bump_syncid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_mirror_update_metadata(disk); } } } static void g_mirror_bump_genid(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_mirror_update_metadata(disk); } } } static int g_mirror_idle(struct g_mirror_softc *sc, int acw) { struct g_mirror_disk *disk; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write); if (!g_mirror_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } return (0); } static void g_mirror_unidle(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; LIST_FOREACH(disk, &sc->sc_disks, d_next) { 
if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; g_mirror_update_metadata(disk); } } static void g_mirror_flush_done(struct bio *bp) { struct g_mirror_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; mtx_lock(&sc->sc_done_mtx); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += bp->bio_completed; pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { mtx_unlock(&sc->sc_done_mtx); g_io_deliver(pbp, pbp->bio_error); } else mtx_unlock(&sc->sc_done_mtx); g_destroy_bio(bp); } static void g_mirror_done(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_regular_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *pbp; g_topology_assert_not(); pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; bp->bio_from->index--; if (bp->bio_cmd == BIO_WRITE) sc->sc_writes--; disk = bp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); } pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (bp->bio_error == 0 && pbp->bio_error == 0) { G_MIRROR_LOGREQ(3, bp, "Request delivered."); g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { G_MIRROR_LOGREQ(3, pbp, "Request delivered."); pbp->bio_completed = pbp->bio_length; if (pbp->bio_cmd == BIO_WRITE || pbp->bio_cmd == BIO_DELETE) { bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. */ g_mirror_sync_release(sc); } g_io_deliver(pbp, pbp->bio_error); } return; } else if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; if (disk != NULL) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN; G_MIRROR_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); } else { G_MIRROR_LOGREQ(1, bp, "Request failed (error=%d).", bp->bio_error); } if (g_mirror_disconnect_on_failure && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) { sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); } } switch (pbp->bio_cmd) { case BIO_DELETE: case BIO_WRITE: pbp->bio_inbed--; pbp->bio_children--; break; } } g_destroy_bio(bp); switch (pbp->bio_cmd) { case BIO_READ: if (pbp->bio_inbed < pbp->bio_children) break; if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1) g_io_deliver(pbp, pbp->bio_error); else { pbp->bio_error = 0; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, pbp); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } break; case BIO_DELETE: case BIO_WRITE: if (pbp->bio_children == 0) { /* * All requests failed. */ } else if (pbp->bio_inbed < pbp->bio_children) { /* Do nothing. */ break; } else if (pbp->bio_children == pbp->bio_inbed) { /* Some requests succeeded. */ pbp->bio_error = 0; pbp->bio_completed = pbp->bio_length; } bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. 
*/ g_mirror_sync_release(sc); g_io_deliver(pbp, pbp->bio_error); break; default: KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd)); break; } } static void g_mirror_sync_done(struct bio *bp) { struct g_mirror_softc *sc; G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); } static void g_mirror_candelete(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; int *val; sc = bp->bio_to->geom->softc; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) break; } val = (int *)bp->bio_data; *val = (disk != NULL); g_io_deliver(bp, 0); } static void g_mirror_kernel_dump(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; struct bio *cbp; struct g_kerneldump *gkd; /* * We configure dumping to the first component, because this component * will be used for reading with 'prefer' balance algorithm. * If the component with the higest priority is currently disconnected * we will not be able to read the dump after the reboot if it will be * connected and synchronized later. Can we do something better? */ sc = bp->bio_to->geom->softc; disk = LIST_FIRST(&sc->sc_disks); gkd = (struct g_kerneldump *)bp->bio_data; if (gkd->length > bp->bio_to->mediasize) gkd->length = bp->bio_to->mediasize; cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_std_done; g_io_request(cbp, disk->d_consumer); G_MIRROR_DEBUG(1, "Kernel dump will go to %s.", g_mirror_get_diskname(disk)); } static void g_mirror_flush(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_flush_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); g_io_request(cbp, disk->d_consumer); } } static void g_mirror_start(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_mirror_start() should not be called at all. 
*/ KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Provider's error should be set (error=%d)(mirror=%s).", bp->bio_to->error, bp->bio_to->name)); G_MIRROR_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_mirror_flush(sc, bp); return; case BIO_GETATTR: if (!strcmp(bp->bio_attribute, "GEOM::candelete")) { g_mirror_candelete(bp); return; } else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) { g_mirror_kernel_dump(bp); return; } /* FALLTHROUGH */ default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with a in-progress * synchronization request. */ static int g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; int i; if (sc->sc_sync.ds_ndisks == 0) return (0); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING) continue; for (i = 0; i < g_mirror_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; if (rend > sstart && rstart < send) return (1); } } return (0); } /* * Return TRUE if the given sync request is colliding with a in-progress regular * request. */ static int g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_sync.ds_ndisks == 0) return (0); sstart = sbp->bio_offset; send = sbp->bio_offset + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Puts request onto delayed queue. */ static void g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying request."); bioq_insert_head(&sc->sc_regular_delayed, bp); } /* * Puts synchronization request onto delayed queue. */ static void g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp) { G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request."); bioq_insert_tail(&sc->sc_sync_delayed, bp); } /* * Releases delayed regular requests which don't collide anymore with sync * requests. */ static void g_mirror_regular_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { if (g_mirror_sync_collision(sc, bp)) continue; bioq_remove(&sc->sc_regular_delayed, bp); G_MIRROR_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); #if 0 /* * wakeup() is not needed, because this function is called from * the worker thread. */ wakeup(&sc->sc_queue); #endif mtx_unlock(&sc->sc_queue_mtx); } } /* * Releases delayed sync requests which don't collide anymore with regular * requests. 
*/ static void g_mirror_sync_release(struct g_mirror_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { if (g_mirror_regular_collision(sc, bp)) continue; bioq_remove(&sc->sc_sync_delayed, bp); G_MIRROR_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Handle synchronization requests. * Every synchronization request is two-steps process: first, READ request is * send to active provider and then WRITE request (with read data) to the provider * beeing synchronized. When WRITE is finished, new synchronization request is * send. */ static void g_mirror_sync_request(struct bio *bp) { struct g_mirror_softc *sc; struct g_mirror_disk *disk; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_mirror_kill_consumer(sc, bp->bio_from); g_topology_unlock(); free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); sx_xlock(&sc->sc_lock); return; } /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request half-finished."); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { struct g_mirror_disk_sync *sync; off_t offset; void *data; int i; if (bp->bio_error != 0) { G_MIRROR_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); sc->sc_bump_id |= G_MIRROR_BUMP_GENID; g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED, G_MIRROR_EVENT_DONTWAIT); return; } G_MIRROR_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; if (sync->ds_offset >= sc->sc_mediasize || sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; if (sync->ds_bios != NULL) { i = (int)(uintptr_t)bp->bio_caller1; sync->ds_bios[i] = NULL; } free(bp->bio_data, M_MIRROR); g_destroy_bio(bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { return; } /* Disk up-to-date, activate it. */ g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE, G_MIRROR_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ data = bp->bio_data; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_offset = sync->ds_offset; bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); sync->ds_offset += bp->bio_length; bp->bio_done = g_mirror_sync_done; bp->bio_data = data; bp->bio_from = sync->ds_consumer; bp->bio_to = sc->sc_provider; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Release delayed requests if possible. 
*/ g_mirror_regular_release(sc); /* Find the smallest offset */ offset = sc->sc_mediasize; for (i = 0; i < g_mirror_syncreqs; i++) { bp = sync->ds_bios[i]; if (bp->bio_offset < offset) offset = bp->bio_offset; } if (sync->ds_offset_done + (MAXPHYS * 100) < offset) { /* Update offset_done on every 100 blocks. */ sync->ds_offset_done = offset; g_mirror_update_metadata(disk); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static void g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE) break; } if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } static void g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; disk = g_mirror_get_disk(sc); if (disk == NULL) { if (bp->bio_error == 0) bp->bio_error = ENXIO; g_io_deliver(bp, bp->bio_error); return; } cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } #define TRACK_SIZE (1 * 1024 * 1024) #define LOAD_SCALE 256 #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static void g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp) { struct g_mirror_disk *disk, *dp; struct g_consumer *cp; struct bio *cbp; int prio, best; /* Find a disk with the smallest load. */ disk = NULL; best = INT_MAX; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; prio = dp->load; /* If disk head is precisely in position - highly prefer it. */ if (dp->d_last_offset == bp->bio_offset) prio -= 2 * LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE) prio -= 1 * LOAD_SCALE; if (prio <= best) { disk = dp; best = prio; } } KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } /* * Fill in the component buf structure. */ cp = disk->d_consumer; cbp->bio_done = g_mirror_done; cbp->bio_to = cp->provider; G_MIRROR_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; /* Remember last head position */ disk->d_last_offset = bp->bio_offset + bp->bio_length; /* Update loads. 
*/ LIST_FOREACH(dp, &sc->sc_disks, d_next) { dp->load = (dp->d_consumer->index * LOAD_SCALE + dp->load * 7) / 8; } g_io_request(cbp, cp); } static void g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_mirror_disk *disk; struct g_consumer *cp; struct bio *cbp; off_t left, mod, offset, slice; u_char *data; u_int ndisks; if (bp->bio_length <= sc->sc_slice) { g_mirror_request_round_robin(sc, bp); return; } ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE); slice = bp->bio_length / ndisks; mod = slice % sc->sc_provider->sectorsize; if (mod != 0) slice += sc->sc_provider->sectorsize - mod; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. */ left = bp->bio_length; offset = bp->bio_offset; data = bp->bio_data; bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; cbp->bio_offset = offset; cbp->bio_data = data; cbp->bio_length = MIN(left, slice); left -= cbp->bio_length; if (left == 0) break; offset += cbp->bio_length; data += cbp->bio_length; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); disk->d_consumer->index++; g_io_request(cbp, disk->d_consumer); } } static void g_mirror_register_request(struct bio *bp) { struct g_mirror_softc *sc; sc = bp->bio_to->geom->softc; switch (bp->bio_cmd) { case BIO_READ: switch (sc->sc_balance) { case G_MIRROR_BALANCE_LOAD: g_mirror_request_load(sc, bp); break; case G_MIRROR_BALANCE_PREFER: g_mirror_request_prefer(sc, bp); break; case G_MIRROR_BALANCE_ROUND_ROBIN: g_mirror_request_round_robin(sc, bp); break; case G_MIRROR_BALANCE_SPLIT: g_mirror_request_split(sc, bp); break; } return; case BIO_WRITE: case BIO_DELETE: { struct g_mirror_disk *disk; struct g_mirror_disk_sync *sync; struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; /* * Delay the request if it is colliding with a synchronization * request. */ if (g_mirror_sync_collision(sc, bp)) { g_mirror_regular_delay(sc, bp); return; } if (sc->sc_idle) g_mirror_unidle(sc); else sc->sc_last_write = time_uptime; /* * Allocate all bios before sending any request, so we can * return ENOMEM in nice and clean way. 
*/ bioq_init(&queue); LIST_FOREACH(disk, &sc->sc_disks, d_next) { sync = &disk->d_sync; switch (disk->d_state) { case G_MIRROR_DISK_STATE_ACTIVE: break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: if (bp->bio_offset >= sync->ds_offset) continue; break; default: continue; } if (bp->bio_cmd == BIO_DELETE && (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_mirror_done; cp = disk->d_consumer; cbp->bio_caller1 = cp; cbp->bio_to = cp->provider; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); } if (bioq_first(&queue) == NULL) { g_io_deliver(bp, EOPNOTSUPP); return; } while ((cbp = bioq_takefirst(&queue)) != NULL) { G_MIRROR_LOGREQ(3, cbp, "Sending request."); cp = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. */ bioq_insert_tail(&sc->sc_inflight, bp); /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; g_mirror_bump_syncid(sc); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_mirror_can_destroy(struct g_mirror_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0) return (0); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_mirror_is_busy(sc, cp)) return (0); } G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_mirror_try_destroy(struct g_mirror_softc *sc) { if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_mirror_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WAIT) != 0) { g_topology_unlock(); G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_mirror_destroy_device(sc); free(sc, M_MIRROR); } return (1); } /* * Worker thread. */ static void g_mirror_worker(void *arg) { struct g_mirror_softc *sc; struct g_mirror_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_MIRROR_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_mirror_event_get(sc); if (ep != NULL) { g_mirror_event_remove(sc, ep); if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) { /* Update only device status. 
*/ G_MIRROR_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_mirror_update_device(sc, 1); } else { /* Update disk status. */ G_MIRROR_DEBUG(3, "Running event for disk %s.", g_mirror_get_diskname(ep->e_disk)); ep->e_error = g_mirror_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_mirror_update_device(sc, 0); } if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_mirror_event_free(ep); } else { ep->e_flags |= G_MIRROR_EVENT_DONE; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_mirror_idle(sc, -1); /* * Now I/O requests. */ /* Get first request from the queue. */ mtx_lock(&sc->sc_queue_mtx); bp = bioq_takefirst(&sc->sc_queue); if (bp == NULL) { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_mirror_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_MIRROR_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); } sx_xunlock(&sc->sc_lock); /* * XXX: We can miss an event here, because an event * can be added without sx-device-lock and without * mtx-queue-lock. Maybe I should just stop using * dedicated mutex for events synchronization and * stick with the queue lock? * The event will hang here until next I/O request * or next event is received. */ MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__); continue; } mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) { g_mirror_sync_request(bp); /* READ */ } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0) g_mirror_regular_request(bp); else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) g_mirror_sync_request(bp); /* WRITE */ else { KASSERT(0, - ("Invalid request cflags=0x%hhx to=%s.", + ("Invalid request cflags=0x%hx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else { g_mirror_register_request(bp); } G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.", g_mirror_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; } } static void g_mirror_sync_start(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; struct g_consumer *cp; struct bio *bp; int error, i; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Disk %s is not marked for synchronization.", g_mirror_get_diskname(disk))); 
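The sync/regular collision logic shown earlier (g_mirror_sync_collision() and g_mirror_regular_collision(), together with the _delay() and _release() helpers) reduces to a single half-open interval test over bio_offset/bio_length pairs: two ranges overlap exactly when each one starts before the other ends, i.e. "rend > sstart && rstart < send". A minimal standalone sketch of that test, using illustrative names that are not part of g_mirror:

#include <sys/types.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Same overlap test the collision checks apply to [rstart, rend) versus
 * [sstart, send); the function name here is illustrative, not from g_mirror.
 */
static bool
ranges_collide(off_t rstart, off_t rend, off_t sstart, off_t send)
{
	return (rend > sstart && rstart < send);
}

int
main(void)
{
	/* A 64 KB write at 128 KB overlaps a 128 KB sync request at 160 KB. */
	printf("%d\n", ranges_collide(131072, 131072 + 65536,
	    163840, 163840 + 131072));			/* prints 1 */
	/* Disjoint ranges do not collide. */
	printf("%d\n", ranges_collide(0, 4096, 8192, 12288));	/* prints 0 */
	return (0);
}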
KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Device not in RUNNING state (%s, %u).", sc->sc_name, sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_mirror_get_diskname(disk)); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY; KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_mirror_get_diskname(disk))); disk->d_sync.ds_consumer = cp; disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; /* * Allocate memory for synchronization bios and initialize them. */ disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs, M_MIRROR, M_WAITOK); for (i = 0; i < g_mirror_syncreqs; i++) { bp = g_alloc_bio(); disk->d_sync.ds_bios[i] = bp; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK); bp->bio_cflags = 0; bp->bio_offset = disk->d_sync.ds_offset; bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); disk->d_sync.ds_offset += bp->bio_length; bp->bio_done = g_mirror_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)i; } /* Increase the number of disks in SYNCHRONIZING state. */ sc->sc_sync.ds_ndisks++; /* Set the number of in-flight synchronization requests. */ disk->d_sync.ds_inflight = g_mirror_syncreqs; /* * Fire off first synchronization requests. */ for (i = 0; i < g_mirror_syncreqs; i++) { bp = disk->d_sync.ds_bios[i]; G_MIRROR_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_mirror_regular_collision(sc, bp)) g_mirror_sync_delay(sc, bp); else g_io_request(bp, disk->d_sync.ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type) { struct g_mirror_softc *sc; struct g_consumer *cp; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_mirror_get_diskname(disk)); } else /* if (type == 1) */ { G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_mirror_get_diskname(disk)); } free(disk->d_sync.ds_bios, M_MIRROR); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; sc->sc_sync.ds_ndisks--; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. 
*/ g_topology_lock(); g_mirror_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_mirror_launch_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct g_provider *pp, *dp; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name); pp->flags |= G_PF_DIRECT_RECEIVE; pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; /* Splitting of unmapped BIO's could work but isn't implemented now */ if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT) pp->flags |= G_PF_ACCEPT_UNMAPPED; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer && disk->d_consumer->provider) { dp = disk->d_consumer->provider; if (dp->stripesize > pp->stripesize) { pp->stripesize = dp->stripesize; pp->stripeoffset = dp->stripeoffset; } /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_MIRROR_DEBUG(0, "Cancelling unmapped " "because of %s.", dp->name); pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } } sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_start(disk); } } static void g_mirror_destroy_provider(struct g_mirror_softc *sc) { struct g_mirror_disk *disk; struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) g_io_deliver(bp, ENXIO); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, sc->sc_provider->name); sc->sc_provider->flags |= G_PF_WITHER; g_orphan_provider(sc->sc_provider, ENXIO); g_topology_unlock(); sc->sc_provider = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) g_mirror_sync_stop(disk, 1); } } static void g_mirror_go(void *arg) { struct g_mirror_softc *sc; sc = arg; G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_mirror_event_send(sc, 0, G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE); } static u_int g_mirror_determine_state(struct g_mirror_disk *disk) { struct g_mirror_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { /* Disk does not need synchronization. */ state = G_MIRROR_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. 
*/ disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) { state = G_MIRROR_DISK_STATE_SYNCHRONIZING; } else { state = G_MIRROR_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that mirror was started on stale disks * and more fresh disk just arrive. * If there were writes, mirror is broken, sorry. * I think the best choice here is don't touch * this disk and inform the user loudly. */ G_MIRROR_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! It will not be connected to the " "running device.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); state = G_MIRROR_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_MIRROR_DEBUG(3, "State for %s disk: %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force) { struct g_mirror_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_MIRROR_DEVICE_STATE_STARTING: { struct g_mirror_disk *pdisk, *tdisk; u_int dirty, ndisks, genid, syncid; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * if we have any disks and 'force' is true. */ ndisks = g_mirror_ndisks(sc, -1); if (sc->sc_ndisks == ndisks || (force && ndisks > 0)) { ; } else if (ndisks == 0) { /* * Disks went down in starting phase, so destroy * device. */ callout_drain(&sc->sc_callout); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } else { return; } /* * Activate all disks with the biggest syncid. */ if (force) { /* * If 'force' is true, we have been called due to * timeout, so don't bother canceling timeout. */ ndisks = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) { ndisks++; } } if (ndisks == 0) { /* No valid disks found, destroy device. */ sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; return; } } else { /* Cancel timeout. */ callout_drain(&sc->sc_callout); } /* * Find the biggest genid. */ genid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_genid < genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", g_mirror_get_diskname(disk), sc->sc_name); g_mirror_destroy_disk(disk); } } /* * Find the biggest syncid. */ syncid = 0; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid > syncid) syncid = disk->d_sync.ds_syncid; } /* * Here we need to look for dirty disks and if all disks * with the biggest syncid are dirty, we have to choose * one with the biggest priority and rebuild the rest. */ /* * Find the number of dirty disks with the biggest syncid. * Find the number of disks with the biggest syncid. 
* While here, find a disk with the biggest priority. */ dirty = ndisks = 0; pdisk = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) { dirty++; if (pdisk == NULL || pdisk->d_priority < disk->d_priority) { pdisk = disk; } } } if (dirty == 0) { /* No dirty disks at all, great. */ } else if (dirty == ndisks) { /* * Force synchronization for all dirty disks except one * with the biggest priority. */ KASSERT(pdisk != NULL, ("pdisk == NULL")); G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a " "master disk for synchronization.", g_mirror_get_diskname(pdisk), sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } KASSERT((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0, ("Disk %s isn't marked as dirty.", g_mirror_get_diskname(disk))); /* Skip the disk with the biggest priority. */ if (disk == pdisk) continue; disk->d_sync.ds_syncid = 0; } } else if (dirty < ndisks) { /* * Force synchronization for all dirty disks. * We have some non-dirty disks. */ LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_sync.ds_syncid != syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_sync.ds_syncid = 0; } } /* Reset hint. */ sc->sc_hint = NULL; sc->sc_syncid = syncid; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } state = G_MIRROR_DEVICE_STATE_RUNNING; G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_device_state2str(state)); sc->sc_state = state; LIST_FOREACH(disk, &sc->sc_disks, d_next) { state = g_mirror_determine_state(disk); g_mirror_event_send(disk, state, G_MIRROR_EVENT_DONTWAIT); if (state == G_MIRROR_DISK_STATE_STALE) sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } break; } case G_MIRROR_DEVICE_STATE_RUNNING: if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * No active disks or no disks at all, * so destroy device. */ if (sc->sc_provider != NULL) g_mirror_destroy_provider(sc); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; break; } else if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0 && g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) { /* * We have active disks, launch provider if it doesn't * exist. */ if (sc->sc_provider == NULL) g_mirror_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } } /* * Genid should be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID; g_mirror_bump_genid(sc); } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. 
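When every component carrying the largest syncid is also dirty, the STARTING case above elects the one with the highest priority as the synchronization master and forces the others to rebuild from it. A condensed sketch of that selection over a hypothetical flat array; components already marked SYNCHRONIZING are assumed to have been filtered out, and all sketch_* names are illustrative.

    struct sketch_disk {
            unsigned int    syncid;
            int             dirty;
            unsigned int    priority;
    };

    static int
    sketch_pick_master(const struct sketch_disk *d, int n, unsigned int syncid)
    {
            int i, ndisks = 0, dirty = 0, master = -1;

            for (i = 0; i < n; i++) {
                    if (d[i].syncid != syncid)      // only the freshest components
                            continue;
                    ndisks++;
                    if (!d[i].dirty)
                            continue;
                    dirty++;
                    if (master == -1 || d[master].priority < d[i].priority)
                            master = i;
            }
            // A master is elected only when every freshest component is dirty;
            // otherwise just the dirty ones are forced to resynchronize.
            return (dirty != 0 && dirty == ndisks ? master : -1);
    }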
*/ #define DISK_STATE_CHANGED() G_MIRROR_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_mirror_get_diskname(disk), \ g_mirror_disk_state2str(disk->d_state), \ g_mirror_disk_state2str(state), sc->sc_name) static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state) { struct g_mirror_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state), g_mirror_disk_state2str(state)); switch (state) { case G_MIRROR_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; if (LIST_EMPTY(&sc->sc_disks)) LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next); else { struct g_mirror_disk *dp; LIST_FOREACH(dp, &sc->sc_disks, d_next) { if (disk->d_priority >= dp->d_priority) { LIST_INSERT_BEFORE(dp, disk, d_next); dp = NULL; break; } if (LIST_NEXT(dp, d_next) == NULL) break; } if (dp != NULL) LIST_INSERT_AFTER(dp, disk, d_next); } G_MIRROR_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_mirror_get_diskname(disk)); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); state = g_mirror_determine_state(disk); if (state != G_MIRROR_DISK_STATE_NONE) goto again; break; case G_MIRROR_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC; g_mirror_sync_stop(disk, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_mirror_update_idle(sc, disk); g_mirror_update_metadata(disk); G_MIRROR_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * STALE state is only possible if device is marked * NOAUTOSYNC. 
*/ KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; g_mirror_update_metadata(disk); G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_mirror_get_diskname(disk)); break; case G_MIRROR_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. Disk which needs synchronization was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_MIRROR_DISK_STATE_NEW) disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_mirror_sync_start(disk); g_mirror_update_metadata(disk); } break; case G_MIRROR_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but disk disappear. * 2. Disk was active and disapppear. * 3. Disk disappear during synchronization process. */ if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE || disk->d_state == G_MIRROR_DISK_STATE_STALE || disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state. */ if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_mirror_device_state2str(sc->sc_state), g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); break; case G_MIRROR_DISK_STATE_DESTROY: { int error; error = g_mirror_clear_metadata(disk); if (error != 0) return (error); DISK_STATE_CHANGED(); G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, g_mirror_get_diskname(disk)); g_mirror_destroy_disk(disk); sc->sc_ndisks--; LIST_FOREACH(disk, &sc->sc_disks, d_next) { g_mirror_update_metadata(disk); } break; } default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. 
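The on-disk metadata occupies the very last sector of the underlying provider, so the g_read_data() call that follows is issued at offset mediasize - sectorsize for exactly one sector. The same arithmetic expressed as a userspace pread(2) sketch; the helper name and error handling are illustrative only.

    #include <sys/types.h>
    #include <stdlib.h>
    #include <unistd.h>

    static void *
    sketch_read_last_sector(int fd, off_t mediasize, size_t sectorsize)
    {
            void *buf;

            buf = malloc(sectorsize);
            if (buf == NULL)
                    return (NULL);
            // The metadata block is exactly the final sector of the device.
            if (pread(fd, buf, sectorsize, mediasize - (off_t)sectorsize) !=
                (ssize_t)sectorsize) {
                    free(buf);
                    return (NULL);
            }
            return (buf);
    }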
*/ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = mirror_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0) return (EINVAL); if (md->md_version > G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } static int g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { if (g_mirror_id2disk(sc, md->md_did) != NULL) { G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.", pp->name, md->md_did); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if (md->md_slice != sc->sc_slice) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_slice", pp->name, sc->sc_name); return (EINVAL); } if (md->md_balance != sc->sc_balance) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_balance", pp->name, sc->sc_name); return (EINVAL); } #if 0 if (md->md_mediasize != sc->sc_mediasize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } #endif if (sc->sc_mediasize > pp->mediasize) { G_MIRROR_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_MIRROR_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_MIRROR_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) { G_MIRROR_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp, struct g_mirror_metadata *md) { struct g_mirror_disk *disk; int error; g_topology_assert_not(); G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name); error = g_mirror_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING && md->md_genid < sc->sc_genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_mirror_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW, G_MIRROR_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_MIRROR_VERSION) { G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_MIRROR_VERSION); g_mirror_update_metadata(disk); } return (0); } static void g_mirror_destroy_delayed(void *arg, int flag) { struct g_mirror_softc *sc; int error; if (flag == EV_CANCEL) { G_MIRROR_DEBUG(1, "Destroying canceled."); return; } sc = 
arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0, ("DESTROYING flag not set on %s.", sc->sc_name)); G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).", sc->sc_name, error); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_mirror_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_mirror_softc *sc; int dcr, dcw, dce, error = 0; g_topology_assert(); G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->geom->softc; if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0) return (0); KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 || LIST_EMPTY(&sc->sc_disks)) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } if (dcw == 0) g_mirror_idle(sc, dcw); if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0) { if (acr > 0 || acw > 0 || ace > 0) { error = ENXIO; goto end; } if (dcr == 0 && dcw == 0 && dce == 0) { g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL); } } end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md) { struct g_mirror_softc *sc; struct g_geom *gp; int error, timeout; g_topology_assert(); G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_mid); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. */ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO); gp->start = g_mirror_start; gp->orphan = g_mirror_orphan; gp->access = g_mirror_access; gp->dumpconf = g_mirror_dumpconf; sc->sc_id = md->md_mid; sc->sc_slice = md->md_slice; sc->sc_balance = md->md_balance; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; sx_init(&sc->sc_lock, "gmirror:lock"); bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF); bioq_init(&sc->sc_regular_delayed); bioq_init(&sc->sc_inflight); bioq_init(&sc->sc_sync_delayed); LIST_INIT(&sc->sc_disks); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, 1); mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF); sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; /* * Synchronization geom. 
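The access handler above works on deltas: the requested changes are added to the provider's current read/write/exclusive counts, and a device queued for destruction (DESTROYING) is only torn down once all three resulting counts would reach zero. A small sketch of that last-close test; the parameter names are illustrative.

    static int
    sketch_is_last_close(int acr, int acw, int ace, int dr, int dw, int de,
        int destroying)
    {
            int ncr = acr + dr, ncw = acw + dw, nce = ace + de;

            if (!destroying)
                    return (0);
            if (dr > 0 || dw > 0 || de > 0)   // new opens are refused instead
                    return (0);
            return (ncr == 0 && ncw == 0 && nce == 0);
    }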
*/ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_mirror_orphan; sc->sc_sync.ds_geom = gp; sc->sc_sync.ds_ndisks = 0; error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0, "g_mirror %s", md->md_name); if (error != 0) { G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); g_destroy_geom(sc->sc_sync.ds_geom); mtx_destroy(&sc->sc_done_mtx); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc, M_MIRROR); return (NULL); } G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GMIRROR"); G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. */ timeout = g_mirror_timeout * hz; callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc); return (sc->sc_geom); } int g_mirror_destroy(struct g_mirror_softc *sc, int how) { struct g_mirror_disk *disk; struct g_provider *pp; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { switch (how) { case G_MIRROR_DESTROY_SOFT: G_MIRROR_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); case G_MIRROR_DESTROY_DELAYED: G_MIRROR_DEBUG(1, "Device %s will be destroyed on last close.", pp->name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { g_mirror_sync_stop(disk, 1); } } sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROYING; return (EBUSY); case G_MIRROR_DESTROY_HARD: G_MIRROR_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } } g_topology_lock(); if (sc->sc_geom->softc == NULL) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; g_topology_unlock(); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_MIRROR_DEVICE_FLAG_WAIT; G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); mtx_unlock(&sc->sc_queue_mtx); G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5); G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_mirror_destroy_device(sc); free(sc, M_MIRROR); return (0); } static void g_mirror_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_mirror_metadata md; struct g_mirror_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_MIRROR_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "mirror:taste"); /* * This orphan function should be never called. 
*/ gp->orphan = g_mirror_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_mirror_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) { G_MIRROR_DEBUG(0, "Device %s: provider %s marked as inactive, skipping.", md.md_name, pp->name); return (NULL); } if (g_mirror_debug >= 2) mirror_metadata_dump(&md); /* * Let's check if device already exists. */ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_mid != sc->sc_id) { G_MIRROR_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_mirror_create(mp, &md); if (gp == NULL) { G_MIRROR_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING; error = g_mirror_add_disk(sc, pp, &md); if (error != 0) { G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (LIST_EMPTY(&sc->sc_disks)) { g_cancel_event(sc); g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING; if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) { g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD); g_topology_lock(); return (NULL); } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static void g_mirror_resize(struct g_consumer *cp) { struct g_mirror_disk *disk; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name); disk = cp->private; if (disk == NULL) return; g_topology_unlock(); g_mirror_update_metadata(disk); g_topology_lock(); } static int g_mirror_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_mirror_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_mirror_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { struct g_mirror_disk *disk; disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_id); if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_printf(sb, "0%%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / sc->sc_provider->mediasize)); } sbuf_printf(sb, "\n"); if (disk->d_sync.ds_offset > 0) { sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE"); ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, disk->d_priority); sbuf_printf(sb, "%s%s\n", indent, g_mirror_disk_state2str(disk->d_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_slice); sbuf_printf(sb, "%s%s\n", indent, balance_name(sc->sc_balance)); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s", indent); if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) sbuf_printf(sb, "%s", "STARTING"); else if (sc->sc_ndisks == g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE)) sbuf_printf(sb, "%s", "COMPLETE"); else sbuf_printf(sb, "%s", "DEGRADED"); sbuf_printf(sb, "\n"); sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_mirror_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_mirror_softc *sc; int error; mp = arg; DROP_GIANT(); g_topology_lock(); g_mirror_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_mirror_idle(sc, -1); g_cancel_event(sc); error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); PICKUP_GIANT(); } static void g_mirror_init(struct g_class *mp) { g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_mirror_post_sync == NULL) G_MIRROR_DEBUG(0, "Warning! 
Cannot register shutdown event."); } static void g_mirror_fini(struct g_class *mp) { if (g_mirror_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync); } DECLARE_GEOM_CLASS(g_mirror_class, g_mirror); Index: head/sys/geom/raid3/g_raid3.c =================================================================== --- head/sys/geom/raid3/g_raid3.c (revision 297954) +++ head/sys/geom/raid3/g_raid3.c (revision 297955) @@ -1,3586 +1,3586 @@ /*- * Copyright (c) 2004-2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_raid3, "GEOM RAID-3 functionality"); static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data"); SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff"); u_int g_raid3_debug = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid3_debug, 0, "Debug level"); static u_int g_raid3_timeout = 4; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_raid3_timeout, 0, "Time to wait on all raid3 components"); static u_int g_raid3_idletime = 5; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RWTUN, &g_raid3_idletime, 0, "Mark components as clean when idling"); static u_int g_raid3_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_raid3_syncreqs = 2; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN, &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests."); static u_int g_raid3_use_malloc = 0; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN, &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9)."); static u_int g_raid3_n64k = 50; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RDTUN, &g_raid3_n64k, 0, "Maximum number of 64kB allocations"); static u_int g_raid3_n16k = 200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RDTUN, &g_raid3_n16k, 0, "Maximum number of 16kB allocations"); static u_int g_raid3_n4k = 1200; SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RDTUN, &g_raid3_n4k, 0, "Maximum number of 4kB allocations"); static SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0, "GEOM_RAID3 statistics"); static u_int g_raid3_parity_mismatch = 0; SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD, &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode"); #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) static eventhandler_tag g_raid3_post_sync = NULL; static int g_raid3_shutdown = 0; static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid3_taste; static void g_raid3_init(struct g_class *mp); static void g_raid3_fini(struct g_class *mp); struct g_class g_raid3_class = { .name = G_RAID3_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid3_config, .taste = g_raid3_taste, .destroy_geom = g_raid3_destroy_geom, .init = g_raid3_init, .fini = g_raid3_fini }; static void g_raid3_destroy_provider(struct g_raid3_softc *sc); static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state); static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force); static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type); static int g_raid3_register_request(struct bio *pbp); static void g_raid3_sync_release(struct g_raid3_softc *sc); static const char * g_raid3_disk_state2str(int state) { switch (state) { case G_RAID3_DISK_STATE_NODISK: return ("NODISK"); case 
G_RAID3_DISK_STATE_NONE: return ("NONE"); case G_RAID3_DISK_STATE_NEW: return ("NEW"); case G_RAID3_DISK_STATE_ACTIVE: return ("ACTIVE"); case G_RAID3_DISK_STATE_STALE: return ("STALE"); case G_RAID3_DISK_STATE_SYNCHRONIZING: return ("SYNCHRONIZING"); case G_RAID3_DISK_STATE_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } static const char * g_raid3_device_state2str(int state) { switch (state) { case G_RAID3_DEVICE_STATE_STARTING: return ("STARTING"); case G_RAID3_DEVICE_STATE_DEGRADED: return ("DEGRADED"); case G_RAID3_DEVICE_STATE_COMPLETE: return ("COMPLETE"); default: return ("INVALID"); } } const char * g_raid3_get_diskname(struct g_raid3_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_name); } static void * g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags) { void *ptr; enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) ptr = malloc(size, M_RAID3, flags); else { ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone, &sc->sc_zones[zone], flags); sc->sc_zones[zone].sz_requested++; if (ptr == NULL) sc->sc_zones[zone].sz_failed++; } return (ptr); } static void g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size) { enum g_raid3_zones zone; if (g_raid3_use_malloc || (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES) free(ptr, M_RAID3); else { uma_zfree_arg(sc->sc_zones[zone].sz_zone, ptr, &sc->sc_zones[zone]); } } static int g_raid3_uma_ctor(void *mem, int size, void *arg, int flags) { struct g_raid3_zone *sz = arg; if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max) return (ENOMEM); sz->sz_inuse++; return (0); } static void g_raid3_uma_dtor(void *mem, int size, void *arg) { struct g_raid3_zone *sz = arg; sz->sz_inuse--; } #define g_raid3_xor(src, dst, size) \ _g_raid3_xor((uint64_t *)(src), \ (uint64_t *)(dst), (size_t)size) static void _g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size) { KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); for (; size > 0; size -= 128) { *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); *dst++ ^= (*src++); } } static int g_raid3_is_zero(struct bio *bp) { static const uint64_t zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; u_char *addr; ssize_t size; size = bp->bio_length; addr = (u_char *)bp->bio_data; for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) { if (bcmp(addr, zeros, sizeof(zeros)) != 0) return (0); } return (1); } /* * --- Events handling functions --- * Events in geom_raid3 are used to maintain disks and device status * from one thread to simplify locking. 
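The functions that follow funnel every state change through one queue so that only the worker thread ever touches disk and device state. The same handoff pictured with userspace primitives; every sketch_* name, and the use of pthreads in place of the kernel mutex and wakeup(9), is an assumption for illustration.

    #include <sys/queue.h>
    #include <pthread.h>

    struct sketch_event {
            int                             state;
            TAILQ_ENTRY(sketch_event)       next;
    };

    static TAILQ_HEAD(, sketch_event) sketch_events =
        TAILQ_HEAD_INITIALIZER(sketch_events);
    static pthread_mutex_t sketch_events_mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t sketch_events_cv = PTHREAD_COND_INITIALIZER;

    static void
    sketch_event_send(struct sketch_event *ep)
    {
            // Queue the event under the event lock, then wake the single
            // worker thread; only that thread advances the state machine.
            pthread_mutex_lock(&sketch_events_mtx);
            TAILQ_INSERT_TAIL(&sketch_events, ep, next);
            pthread_mutex_unlock(&sketch_events_mtx);
            pthread_cond_signal(&sketch_events_cv);
    }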
*/ static void g_raid3_event_free(struct g_raid3_event *ep) { free(ep, M_RAID3); } int g_raid3_event_send(void *arg, int state, int flags) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_raid3_event *ep; int error; ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); if ((flags & G_RAID3_EVENT_DEVICE) != 0) { disk = NULL; sc = arg; } else { disk = arg; sc = disk->d_softc; } ep->e_disk = disk; ep->e_state = state; ep->e_flags = flags; ep->e_error = 0; mtx_lock(&sc->sc_events_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { mtx_lock(&sc->sc_events_mtx); MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", hz * 5); } error = ep->e_error; g_raid3_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static struct g_raid3_event * g_raid3_event_get(struct g_raid3_softc *sc) { struct g_raid3_event *ep; mtx_lock(&sc->sc_events_mtx); ep = TAILQ_FIRST(&sc->sc_events); mtx_unlock(&sc->sc_events_mtx); return (ep); } static void g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep) { mtx_lock(&sc->sc_events_mtx); TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_events_mtx); } static void g_raid3_event_cancel(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_event *ep, *tmpep; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_events_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) continue; if (ep->e_disk != disk) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_events_mtx); } /* * Return the number of disks in the given state. * If state is equal to -1, count all connected disks. 
*/ u_int g_raid3_ndisks(struct g_raid3_softc *sc, int state) { struct g_raid3_disk *disk; u_int n, ndisks; sx_assert(&sc->sc_lock, SX_LOCKED); for (n = ndisks = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (state == -1 || disk->d_state == state) ndisks++; } return (ndisks); } static u_int g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } static int g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_RAID3_DEBUG(2, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_raid3_nrequests(sc, cp) > 0) { G_RAID3_DEBUG(2, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_raid3_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } static void g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert(); cp->private = NULL; if (g_raid3_is_busy(sc, cp)) return; G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, -cp->acw, -cp->ace, 0); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. */ g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL); return; } G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); } static int g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) { struct g_consumer *cp; int error; g_topology_assert_not(); KASSERT(disk->d_consumer == NULL, ("Disk already connected (device %s).", disk->d_softc->sc_name)); g_topology_lock(); cp = g_new_consumer(disk->d_softc->sc_geom); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); g_topology_unlock(); return (error); } error = g_access(cp, 1, 1, 1); g_topology_unlock(); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).", pp->name, error); return (error); } disk->d_consumer = cp; disk->d_consumer->private = disk; disk->d_consumer->index = 0; G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); return (0); } static void g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) { g_topology_assert(); if (cp == NULL) return; if (cp->provider != NULL) g_raid3_kill_consumer(sc, cp); else g_destroy_consumer(cp); } /* * Initialize disk. This means allocate memory, create consumer, attach it * to the provider and open access (r1w1e1) to it. 
*/ static struct g_raid3_disk * g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md, int *errorp) { struct g_raid3_disk *disk; int error; disk = &sc->sc_disks[md->md_no]; error = g_raid3_connect_disk(disk, pp); if (error != 0) { if (errorp != NULL) *errorp = error; return (NULL); } disk->d_state = G_RAID3_DISK_STATE_NONE; disk->d_flags = md->md_dflags; if (md->md_provider[0] != '\0') disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; disk->d_sync.ds_consumer = NULL; disk->d_sync.ds_offset = md->md_sync_offset; disk->d_sync.ds_offset_done = md->md_sync_offset; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; if (errorp != NULL) *errorp = 0; return (disk); } static void g_raid3_destroy_disk(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); if (disk->d_state == G_RAID3_DISK_STATE_NODISK) return; g_raid3_event_cancel(disk); switch (disk->d_state) { case G_RAID3_DISK_STATE_SYNCHRONIZING: if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); /* FALLTHROUGH */ case G_RAID3_DISK_STATE_NEW: case G_RAID3_DISK_STATE_STALE: case G_RAID3_DISK_STATE_ACTIVE: g_topology_lock(); g_raid3_disconnect_consumer(sc, disk->d_consumer); g_topology_unlock(); disk->d_consumer = NULL; break; default: KASSERT(0 == 1, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } disk->d_state = G_RAID3_DISK_STATE_NODISK; } static void g_raid3_destroy_device(struct g_raid3_softc *sc) { struct g_raid3_event *ep; struct g_raid3_disk *disk; struct g_geom *gp; struct g_consumer *cp; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); gp = sc->sc_geom; if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state != G_RAID3_DISK_STATE_NODISK) { disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); g_raid3_destroy_disk(disk); } } while ((ep = g_raid3_event_get(sc)) != NULL) { g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) g_raid3_event_free(ep); else { ep->e_error = ECANCELED; ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } } callout_drain(&sc->sc_callout); cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer); g_topology_lock(); if (cp != NULL) g_raid3_disconnect_consumer(sc, cp); g_wither_geom(sc->sc_sync.ds_geom, ENXIO); G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); g_topology_unlock(); if (!g_raid3_use_malloc) { uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); } mtx_destroy(&sc->sc_queue_mtx); mtx_destroy(&sc->sc_events_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); } static void g_raid3_orphan(struct g_consumer *cp) { struct g_raid3_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } static int g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_consumer *cp; off_t offset, length; u_char *sector; int error = 0; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); cp = 
disk->d_consumer; KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); length = cp->provider->sectorsize; offset = cp->provider->mediasize - length; sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO); if (md != NULL) raid3_metadata_encode(md, sector); error = g_write_data(cp, offset, sector, length); free(sector, M_RAID3); if (error != 0) { if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { G_RAID3_DEBUG(0, "Cannot write metadata on %s " "(device=%s, error=%d).", g_raid3_get_diskname(disk), sc->sc_name, error); disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; } else { G_RAID3_DEBUG(1, "Cannot write metadata on %s " "(device=%s, error=%d).", g_raid3_get_diskname(disk), sc->sc_name, error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } return (error); } int g_raid3_clear_metadata(struct g_raid3_disk *disk) { int error; g_topology_assert_not(); sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); error = g_raid3_write_metadata(disk, NULL); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s cleared.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot clear metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } return (error); } void g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_provider *pp; sc = disk->d_softc; strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic)); md->md_version = G_RAID3_VERSION; strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); md->md_id = sc->sc_id; md->md_all = sc->sc_ndisks; md->md_genid = sc->sc_genid; md->md_mediasize = sc->sc_mediasize; md->md_sectorsize = sc->sc_sectorsize; md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK); md->md_no = disk->d_no; md->md_syncid = disk->d_sync.ds_syncid; md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK); if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) md->md_sync_offset = 0; else { md->md_sync_offset = disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1); } if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL) pp = disk->d_consumer->provider; else pp = NULL; if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL) strlcpy(md->md_provider, pp->name, sizeof(md->md_provider)); else bzero(md->md_provider, sizeof(md->md_provider)); if (pp != NULL) md->md_provsize = pp->mediasize; else md->md_provsize = 0; } void g_raid3_update_metadata(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; struct g_raid3_metadata md; int error; g_topology_assert_not(); sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_LOCKED); g_raid3_fill_metadata(disk, &md); error = g_raid3_write_metadata(disk, &md); if (error == 0) { G_RAID3_DEBUG(2, "Metadata on %s updated.", g_raid3_get_diskname(disk)); } else { G_RAID3_DEBUG(0, "Cannot update metadata on disk %s (error=%d).", g_raid3_get_diskname(disk), error); } } static void g_raid3_bump_syncid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_syncid++; G_RAID3_DEBUG(1, "Device 
%s: syncid bumped to %u.", sc->sc_name, sc->sc_syncid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_sync.ds_syncid = sc->sc_syncid; g_raid3_update_metadata(disk); } } } static void g_raid3_bump_genid(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, ("%s called with no active disks (device=%s).", __func__, sc->sc_name)); sc->sc_genid++; G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, sc->sc_genid); for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_genid = sc->sc_genid; g_raid3_update_metadata(disk); } } } static int g_raid3_idle(struct g_raid3_softc *sc, int acw) { struct g_raid3_disk *disk; u_int i; int timeout; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_provider == NULL) return (0); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return (0); if (sc->sc_idle) return (0); if (sc->sc_writes > 0) return (0); if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write); if (!g_raid3_shutdown && timeout > 0) return (timeout); } sc->sc_idle = 1; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } return (0); } static void g_raid3_unidle(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; u_int i; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return; sc->sc_idle = 0; sc->sc_last_write = time_uptime; for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; g_raid3_update_metadata(disk); } } /* * Treat bio_driver1 field in parent bio as list head and field bio_caller1 * in child bio as pointer to the next element on the list. 
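The two spare pointer fields turn the parent bio and its clones into a tiny intrusive singly linked list, which the macros right below walk. A stripped-down sketch of the same idea with a hypothetical structure; it pushes at the head for brevity, whereas the code below appends new clones at the tail.

    struct sketch_bio {
            void    *driver1;       // in the parent: head of the child list
            void    *caller1;       // in a child: pointer to the next child
    };

    static void
    sketch_link_child(struct sketch_bio *parent, struct sketch_bio *child)
    {
            child->caller1 = parent->driver1;
            parent->driver1 = child;
    }

    static struct sketch_bio *
    sketch_first(struct sketch_bio *parent)
    {
            return ((struct sketch_bio *)parent->driver1);
    }

    static struct sketch_bio *
    sketch_next(struct sketch_bio *child)
    {
            return ((struct sketch_bio *)child->caller1);
    }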
*/ #define G_RAID3_HEAD_BIO(pbp) (pbp)->bio_driver1 #define G_RAID3_NEXT_BIO(cbp) (cbp)->bio_caller1 #define G_RAID3_FOREACH_BIO(pbp, bp) \ for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL; \ (bp) = G_RAID3_NEXT_BIO(bp)) #define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp) \ for ((bp) = G_RAID3_HEAD_BIO(pbp); \ (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1); \ (bp) = (tmpbp)) static void g_raid3_init_bio(struct bio *pbp) { G_RAID3_HEAD_BIO(pbp) = NULL; } static void g_raid3_remove_bio(struct bio *cbp) { struct bio *pbp, *bp; pbp = cbp->bio_parent; if (G_RAID3_HEAD_BIO(pbp) == cbp) G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == cbp) { G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); break; } } } G_RAID3_NEXT_BIO(cbp) = NULL; } static void g_raid3_replace_bio(struct bio *sbp, struct bio *dbp) { struct bio *pbp, *bp; g_raid3_remove_bio(sbp); pbp = dbp->bio_parent; G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp); if (G_RAID3_HEAD_BIO(pbp) == dbp) G_RAID3_HEAD_BIO(pbp) = sbp; else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == dbp) { G_RAID3_NEXT_BIO(bp) = sbp; break; } } } G_RAID3_NEXT_BIO(dbp) = NULL; } static void g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp) { struct bio *bp, *pbp; size_t size; pbp = cbp->bio_parent; pbp->bio_children--; KASSERT(cbp->bio_data != NULL, ("NULL bio_data")); size = pbp->bio_length / (sc->sc_ndisks - 1); g_raid3_free(sc, cbp->bio_data, size); if (G_RAID3_HEAD_BIO(pbp) == cbp) { G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); G_RAID3_NEXT_BIO(cbp) = NULL; g_destroy_bio(cbp); } else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == cbp) break; } if (bp != NULL) { KASSERT(G_RAID3_NEXT_BIO(bp) != NULL, ("NULL bp->bio_driver1")); G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); G_RAID3_NEXT_BIO(cbp) = NULL; } g_destroy_bio(cbp); } } static struct bio * g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp) { struct bio *bp, *cbp; size_t size; int memflag; cbp = g_clone_bio(pbp); if (cbp == NULL) return (NULL); size = pbp->bio_length / (sc->sc_ndisks - 1); if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) memflag = M_WAITOK; else memflag = M_NOWAIT; cbp->bio_data = g_raid3_alloc(sc, size, memflag); if (cbp->bio_data == NULL) { pbp->bio_children--; g_destroy_bio(cbp); return (NULL); } G_RAID3_NEXT_BIO(cbp) = NULL; if (G_RAID3_HEAD_BIO(pbp) == NULL) G_RAID3_HEAD_BIO(pbp) = cbp; else { G_RAID3_FOREACH_BIO(pbp, bp) { if (G_RAID3_NEXT_BIO(bp) == NULL) { G_RAID3_NEXT_BIO(bp) = cbp; break; } } } return (cbp); } static void g_raid3_scatter(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *bp, *cbp, *tmpbp; off_t atom, cadd, padd, left; int first; sc = pbp->bio_to->geom->softc; bp = NULL; if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { /* * Find bio for which we should calculate data. */ G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { bp = cbp; break; } } KASSERT(bp != NULL, ("NULL parity bio.")); } atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); cadd = padd = 0; for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { G_RAID3_FOREACH_BIO(pbp, cbp) { if (cbp == bp) continue; bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom); padd += atom; } cadd += atom; } if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { /* * Calculate parity. 
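The copy loop in g_raid3_scatter() above cuts every sector of the parent's buffer into (ndisks - 1) columns of atom bytes, one column per data component. A userspace sketch of just that loop; col[] stands for the per-component buffers, length is assumed to be a multiple of sectorsize as in the driver, and all names are illustrative.

    #include <stddef.h>
    #include <string.h>

    static void
    sketch_scatter(const char *src, size_t length, char **col, int ndata,
        size_t sectorsize)
    {
            size_t atom = sectorsize / ndata;   // bytes per component, per sector
            size_t padd = 0, cadd = 0, left;
            int n;

            for (left = length; left > 0; left -= sectorsize) {
                    for (n = 0; n < ndata; n++) {
                            // Column n of this sector goes to component n.
                            memcpy(col[n] + cadd, src + padd, atom);
                            padd += atom;
                    }
                    cadd += atom;
            }
    }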
*/ first = 1; G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { if (cbp == bp) continue; if (first) { bcopy(cbp->bio_data, bp->bio_data, bp->bio_length); first = 0; } else { g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_length); } if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0) g_raid3_destroy_bio(sc, cbp); } } G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { struct g_consumer *cp; disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; sc->sc_writes++; g_io_request(cbp, cp); } } static void g_raid3_gather(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *xbp, *fbp, *cbp; off_t atom, cadd, padd, left; sc = pbp->bio_to->geom->softc; /* * Find bio for which we have to calculate data. * While going through this path, check if all requests * succeeded, if not, deny whole request. * If we're in COMPLETE mode, we allow one request to fail, * so if we find one, we're sending it to the parity consumer. * If there are more failed requests, we deny whole request. */ xbp = fbp = NULL; G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { KASSERT(xbp == NULL, ("More than one parity bio.")); xbp = cbp; } if (cbp->bio_error == 0) continue; /* * Found failed request. */ if (fbp == NULL) { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) { /* * We are already in degraded mode, so we can't * accept any failures. */ if (pbp->bio_error == 0) pbp->bio_error = cbp->bio_error; } else { fbp = cbp; } } else { /* * Next failed request, that's too many. */ if (pbp->bio_error == 0) pbp->bio_error = fbp->bio_error; } disk = cbp->bio_caller2; if (disk == NULL) continue; if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).", cbp->bio_error); } else { G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).", cbp->bio_error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } } if (pbp->bio_error != 0) goto finish; if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY; if (xbp != fbp) g_raid3_replace_bio(xbp, fbp); g_raid3_destroy_bio(sc, fbp); } else if (fbp != NULL) { struct g_consumer *cp; /* * One request failed, so send the same request to * the parity consumer. */ disk = pbp->bio_driver2; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { pbp->bio_error = fbp->bio_error; goto finish; } pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_inbed--; fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR); if (disk->d_no == sc->sc_ndisks - 1) fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; fbp->bio_error = 0; fbp->bio_completed = 0; fbp->bio_children = 0; fbp->bio_inbed = 0; cp = disk->d_consumer; fbp->bio_caller2 = disk; fbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, fbp, "Sending request (recover)."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(fbp, cp); return; } if (xbp != NULL) { /* * Calculate parity. 
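Both the write path above and the recovery loop that follows rest on the same XOR identity: the parity column is the XOR of all data columns, so any single missing column can be rebuilt by XORing the parity with the surviving columns. A plain byte-wise version of the helper; the kernel routine above does the same thing 64 bits at a time, 128 bytes per iteration.

    #include <stddef.h>

    static void
    sketch_xor(const unsigned char *src, unsigned char *dst, size_t size)
    {
            while (size-- > 0)
                    *dst++ ^= *src++;
    }

    // Building parity: zero the parity column, then sketch_xor() every data
    // column into it.  Rebuilding data column k: copy the parity column, then
    // sketch_xor() every surviving data column into that copy.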
*/ G_RAID3_FOREACH_BIO(pbp, cbp) { if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) continue; g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_length); } xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY; if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { if (!g_raid3_is_zero(xbp)) { g_raid3_parity_mismatch++; pbp->bio_error = EIO; goto finish; } g_raid3_destroy_bio(sc, xbp); } } atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); cadd = padd = 0; for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { G_RAID3_FOREACH_BIO(pbp, cbp) { bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom); pbp->bio_completed += atom; padd += atom; } cadd += atom; } finish: if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else { if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) G_RAID3_LOGREQ(1, pbp, "Verification error."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); } pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK; while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); g_io_deliver(pbp, pbp->bio_error); } static void g_raid3_done(struct bio *bp) { struct g_raid3_softc *sc; sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR; G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); } static void g_raid3_regular_request(struct bio *cbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct bio *pbp; g_topology_assert_not(); pbp = cbp->bio_parent; sc = pbp->bio_to->geom->softc; cbp->bio_from->index--; if (cbp->bio_cmd == BIO_WRITE) sc->sc_writes--; disk = cbp->bio_from->private; if (disk == NULL) { g_topology_lock(); g_raid3_kill_consumer(sc, cbp->bio_from); g_topology_unlock(); } G_RAID3_LOGREQ(3, cbp, "Request finished."); pbp->bio_inbed++; KASSERT(pbp->bio_inbed <= pbp->bio_children, ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, pbp->bio_children)); if (pbp->bio_inbed != pbp->bio_children) return; switch (pbp->bio_cmd) { case BIO_READ: g_raid3_gather(pbp); break; case BIO_WRITE: case BIO_DELETE: { int error = 0; pbp->bio_completed = pbp->bio_length; while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) { if (cbp->bio_error == 0) { g_raid3_destroy_bio(sc, cbp); continue; } if (error == 0) error = cbp->bio_error; else if (pbp->bio_error == 0) { /* * Next failed request, that's too many. */ pbp->bio_error = error; } disk = cbp->bio_caller2; if (disk == NULL) { g_raid3_destroy_bio(sc, cbp); continue; } if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).", cbp->bio_error); } else { G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).", cbp->bio_error); } if (g_raid3_disconnect_on_failure && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); } g_raid3_destroy_bio(sc, cbp); } if (pbp->bio_error == 0) G_RAID3_LOGREQ(3, pbp, "Request finished."); else G_RAID3_LOGREQ(0, pbp, "Request failed."); pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED; pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY; bioq_remove(&sc->sc_inflight, pbp); /* Release delayed sync requests if possible. 
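 * The finished write has just been removed from sc_inflight, so
 * synchronization requests that were delayed because they overlapped
 * it may now be resubmitted.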
*/ g_raid3_sync_release(sc); g_io_deliver(pbp, pbp->bio_error); break; } } } static void g_raid3_sync_done(struct bio *bp) { struct g_raid3_softc *sc; G_RAID3_LOGREQ(3, bp, "Synchronization request delivered."); sc = bp->bio_from->geom->softc; bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC; mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); } static void g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp; u_int i; bioq_init(&queue); for (i = 0; i < sc->sc_ndisks; i++) { disk = &sc->sc_disks[i]; if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) continue; cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_std_done; cbp->bio_caller1 = disk; cbp->bio_to = disk->d_consumer->provider; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); G_RAID3_LOGREQ(3, cbp, "Sending request."); disk = cbp->bio_caller1; cbp->bio_caller1 = NULL; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); g_io_request(cbp, disk->d_consumer); } } static void g_raid3_start(struct bio *bp) { struct g_raid3_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_raid3_start() should not be called at all. */ KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE), ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_RAID3_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_FLUSH: g_raid3_flush(sc, bp); return; case BIO_GETATTR: default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); wakeup(sc); } /* * Return TRUE if the given request is colliding with a in-progress * synchronization request. */ static int g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp) { struct g_raid3_disk *disk; struct bio *sbp; off_t rstart, rend, sstart, send; int i; disk = sc->sc_syncdisk; if (disk == NULL) return (0); rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; for (i = 0; i < g_raid3_syncreqs; i++) { sbp = disk->d_sync.ds_bios[i]; if (sbp == NULL) continue; sstart = sbp->bio_offset; send = sbp->bio_length; if (sbp->bio_cmd == BIO_WRITE) { sstart *= sc->sc_ndisks - 1; send *= sc->sc_ndisks - 1; } send += sstart; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Return TRUE if the given sync request is colliding with a in-progress regular * request. 
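 * The check is a plain interval-overlap test of the synchronization
 * request against every regular request still on the sc_inflight
 * queue, with both ranges expressed as provider byte offsets.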
*/ static int g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp) { off_t rstart, rend, sstart, send; struct bio *bp; if (sc->sc_syncdisk == NULL) return (0); sstart = sbp->bio_offset; send = sstart + sbp->bio_length; TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { rstart = bp->bio_offset; rend = bp->bio_offset + bp->bio_length; if (rend > sstart && rstart < send) return (1); } return (0); } /* * Puts request onto delayed queue. */ static void g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp) { G_RAID3_LOGREQ(2, bp, "Delaying request."); bioq_insert_head(&sc->sc_regular_delayed, bp); } /* * Puts synchronization request onto delayed queue. */ static void g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp) { G_RAID3_LOGREQ(2, bp, "Delaying synchronization request."); bioq_insert_tail(&sc->sc_sync_delayed, bp); } /* * Releases delayed regular requests which don't collide anymore with sync * requests. */ static void g_raid3_regular_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { if (g_raid3_sync_collision(sc, bp)) continue; bioq_remove(&sc->sc_regular_delayed, bp); G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); #if 0 /* * wakeup() is not needed, because this function is called from * the worker thread. */ wakeup(&sc->sc_queue); #endif mtx_unlock(&sc->sc_queue_mtx); } } /* * Releases delayed sync requests which don't collide anymore with regular * requests. */ static void g_raid3_sync_release(struct g_raid3_softc *sc) { struct bio *bp, *bp2; TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { if (g_raid3_regular_collision(sc, bp)) continue; bioq_remove(&sc->sc_sync_delayed, bp); G_RAID3_LOGREQ(2, bp, "Releasing delayed synchronization request."); g_io_request(bp, bp->bio_from); } } /* * Handle synchronization requests. * Every synchronization request is two-steps process: first, READ request is * send to active provider and then WRITE request (with read data) to the provider * beeing synchronized. When WRITE is finished, new synchronization request is * send. */ static void g_raid3_sync_request(struct bio *bp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; bp->bio_from->index--; sc = bp->bio_from->geom->softc; disk = bp->bio_from->private; if (disk == NULL) { sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_raid3_kill_consumer(sc, bp->bio_from); g_topology_unlock(); free(bp->bio_data, M_RAID3); g_destroy_bio(bp); sx_xlock(&sc->sc_lock); return; } /* * Synchronization request. */ switch (bp->bio_cmd) { case BIO_READ: { struct g_consumer *cp; u_char *dst, *src; off_t left; u_int atom; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); dst = src = bp->bio_data; if (disk->d_no == sc->sc_ndisks - 1) { u_int n; /* Parity component. */ for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += atom; for (n = 1; n < sc->sc_ndisks - 1; n++) { g_raid3_xor(src, dst, atom); src += atom; } dst += atom; } } else { /* Regular component. 
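 * The data was read from the raid3 provider itself, so it is already
 * striped: each sectorsize-byte stripe carries one atom-sized column
 * per data disk (atom = sectorsize / (ndisks - 1)).  Extract this
 * component's column, which starts atom * d_no bytes into every
 * stripe, and pack the columns back to back for the following write.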
*/ src += atom * disk->d_no; for (left = bp->bio_length; left > 0; left -= sc->sc_sectorsize) { bcopy(src, dst, atom); src += sc->sc_sectorsize; dst += atom; } } bp->bio_driver1 = bp->bio_driver2 = NULL; bp->bio_pflags = 0; bp->bio_offset /= sc->sc_ndisks - 1; bp->bio_length /= sc->sc_ndisks - 1; bp->bio_cmd = BIO_WRITE; bp->bio_cflags = 0; bp->bio_children = bp->bio_inbed = 0; cp = disk->d_consumer; KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(bp, cp); return; } case BIO_WRITE: { struct g_raid3_disk_sync *sync; off_t boffset, moffset; void *data; int i; if (bp->bio_error != 0) { G_RAID3_LOGREQ(0, bp, "Synchronization request failed (error=%d).", bp->bio_error); g_destroy_bio(bp); sc->sc_bump_id |= G_RAID3_BUMP_GENID; g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, G_RAID3_EVENT_DONTWAIT); return; } G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); sync = &disk->d_sync; if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) || sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { /* Don't send more synchronization requests. */ sync->ds_inflight--; if (sync->ds_bios != NULL) { i = (int)(uintptr_t)bp->bio_caller1; sync->ds_bios[i] = NULL; } free(bp->bio_data, M_RAID3); g_destroy_bio(bp); if (sync->ds_inflight > 0) return; if (sync->ds_consumer == NULL || (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { return; } /* * Disk up-to-date, activate it. */ g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE, G_RAID3_EVENT_DONTWAIT); return; } /* Send next synchronization request. */ data = bp->bio_data; g_reset_bio(bp); bp->bio_cmd = BIO_READ; bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_done = g_raid3_sync_done; bp->bio_data = data; bp->bio_from = sync->ds_consumer; bp->bio_to = sc->sc_provider; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); sync->ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. */ if (g_raid3_regular_collision(sc, bp)) g_raid3_sync_delay(sc, bp); else g_io_request(bp, sync->ds_consumer); /* Release delayed requests if possible. */ g_raid3_regular_release(sc); /* Find the smallest offset. */ moffset = sc->sc_mediasize; for (i = 0; i < g_raid3_syncreqs; i++) { bp = sync->ds_bios[i]; boffset = bp->bio_offset; if (bp->bio_cmd == BIO_WRITE) boffset *= sc->sc_ndisks - 1; if (boffset < moffset) moffset = boffset; } if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) { /* Update offset_done on every 100 blocks. 
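 * That is, once the slowest in-flight request has advanced
 * 100 * MAXPHYS bytes past the recorded ds_offset_done, store the new
 * value in the metadata so an interrupted rebuild can resume from
 * there instead of from the beginning.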
*/ sync->ds_offset_done = moffset; g_raid3_update_metadata(disk); } return; } default: KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", bp->bio_cmd, sc->sc_name)); break; } } static int g_raid3_register_request(struct bio *pbp) { struct g_raid3_softc *sc; struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *cbp, *tmpbp; off_t offset, length; u_int n, ndisks; int round_robin, verify; ndisks = 0; sc = pbp->bio_to->geom->softc; if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 && sc->sc_syncdisk == NULL) { g_io_deliver(pbp, EIO); return (0); } g_raid3_init_bio(pbp); length = pbp->bio_length / (sc->sc_ndisks - 1); offset = pbp->bio_offset / (sc->sc_ndisks - 1); round_robin = verify = 0; switch (pbp->bio_cmd) { case BIO_READ: if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY; verify = 1; ndisks = sc->sc_ndisks; } else { verify = 0; ndisks = sc->sc_ndisks - 1; } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 && sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { round_robin = 1; } else { round_robin = 0; } KASSERT(!round_robin || !verify, ("ROUND-ROBIN and VERIFY are mutually exclusive.")); pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1]; break; case BIO_WRITE: case BIO_DELETE: /* * Delay the request if it is colliding with a synchronization * request. */ if (g_raid3_sync_collision(sc, pbp)) { g_raid3_regular_delay(sc, pbp); return (0); } if (sc->sc_idle) g_raid3_unidle(sc); else sc->sc_last_write = time_uptime; ndisks = sc->sc_ndisks; break; } for (n = 0; n < ndisks; n++) { disk = &sc->sc_disks[n]; cbp = g_raid3_clone_bio(sc, pbp); if (cbp == NULL) { while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) g_raid3_destroy_bio(sc, cbp); /* * To prevent deadlock, we must run back up * with the ENOMEM for failed requests of any * of our consumers. Our own sync requests * can stick around, as they are finite. */ if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) { g_io_deliver(pbp, ENOMEM); return (0); } return (ENOMEM); } cbp->bio_offset = offset; cbp->bio_length = length; cbp->bio_done = g_raid3_done; switch (pbp->bio_cmd) { case BIO_READ: if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { /* * Replace invalid component with the parity * component. */ disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; } else if (round_robin && disk->d_no == sc->sc_round_robin) { /* * In round-robin mode skip one data component * and use parity component when reading. */ pbp->bio_driver2 = disk; disk = &sc->sc_disks[sc->sc_ndisks - 1]; cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; sc->sc_round_robin++; round_robin = 0; } else if (verify && disk->d_no == sc->sc_ndisks - 1) { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } break; case BIO_WRITE: case BIO_DELETE: if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { if (n == ndisks - 1) { /* * Active parity component, mark it as such. */ cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; } } else { pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; if (n == ndisks - 1) { /* * Parity component is not connected, * so destroy its request. 
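 * The write then continues in degraded mode: the NOPARITY flag set
 * below tells g_raid3_scatter() to skip the parity calculation
 * entirely.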
*/ pbp->bio_pflags |= G_RAID3_BIO_PFLAG_NOPARITY; g_raid3_destroy_bio(sc, cbp); cbp = NULL; } else { cbp->bio_cflags |= G_RAID3_BIO_CFLAG_NODISK; disk = NULL; } } break; } if (cbp != NULL) cbp->bio_caller2 = disk; } switch (pbp->bio_cmd) { case BIO_READ: if (round_robin) { /* * If we are in round-robin mode and 'round_robin' is * still 1, it means, that we skipped parity component * for this read and must reset sc_round_robin field. */ sc->sc_round_robin = 0; } G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { disk = cbp->bio_caller2; cp = disk->d_consumer; cbp->bio_to = cp->provider; G_RAID3_LOGREQ(3, cbp, "Sending request."); KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr, cp->acw, cp->ace)); cp->index++; g_io_request(cbp, cp); } break; case BIO_WRITE: case BIO_DELETE: /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. */ bioq_insert_tail(&sc->sc_inflight, pbp); /* * Bump syncid on first write. */ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; g_raid3_bump_syncid(sc); } g_raid3_scatter(pbp); break; } return (0); } static int g_raid3_can_destroy(struct g_raid3_softc *sc) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); gp = sc->sc_geom; if (gp->softc == NULL) return (1); LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } gp = sc->sc_sync.ds_geom; LIST_FOREACH(cp, &gp->consumer, consumer) { if (g_raid3_is_busy(sc, cp)) return (0); } G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.", sc->sc_name); return (1); } static int g_raid3_try_destroy(struct g_raid3_softc *sc) { g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } g_topology_lock(); if (!g_raid3_can_destroy(sc)) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) { g_topology_unlock(); G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, &sc->sc_worker); /* Unlock sc_lock here, as it can be destroyed after wakeup. */ sx_xunlock(&sc->sc_lock); wakeup(&sc->sc_worker); sc->sc_worker = NULL; } else { g_topology_unlock(); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); } return (1); } /* * Worker thread. */ static void g_raid3_worker(void *arg) { struct g_raid3_softc *sc; struct g_raid3_event *ep; struct bio *bp; int timeout; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { G_RAID3_DEBUG(5, "%s: Let's see...", __func__); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = g_raid3_event_get(sc); if (ep != NULL) { g_raid3_event_remove(sc, ep); if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) { /* Update only device status. */ G_RAID3_DEBUG(3, "Running event for device %s.", sc->sc_name); ep->e_error = 0; g_raid3_update_device(sc, 1); } else { /* Update disk status. 
*/ G_RAID3_DEBUG(3, "Running event for disk %s.", g_raid3_get_diskname(ep->e_disk)); ep->e_error = g_raid3_update_disk(ep->e_disk, ep->e_state); if (ep->e_error == 0) g_raid3_update_device(sc, 0); } if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_raid3_event_free(ep); } else { ep->e_flags |= G_RAID3_EVENT_DONE; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); mtx_lock(&sc->sc_events_mtx); wakeup(ep); mtx_unlock(&sc->sc_events_mtx); } if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { if (g_raid3_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_RAID3_DEBUG(1, "Thread exiting."); kproc_exit(0); } } G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__); continue; } /* * Check if we can mark array as CLEAN and if we can't take * how much seconds should we wait. */ timeout = g_raid3_idle(sc, -1); /* * Now I/O requests. */ /* Get first request from the queue. */ mtx_lock(&sc->sc_queue_mtx); bp = bioq_first(&sc->sc_queue); if (bp == NULL) { if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { mtx_unlock(&sc->sc_queue_mtx); if (g_raid3_try_destroy(sc)) { curthread->td_pflags &= ~TDP_GEOM; G_RAID3_DEBUG(1, "Thread exiting."); kproc_exit(0); } mtx_lock(&sc->sc_queue_mtx); } sx_xunlock(&sc->sc_lock); /* * XXX: We can miss an event here, because an event * can be added without sx-device-lock and without * mtx-queue-lock. Maybe I should just stop using * dedicated mutex for events synchronization and * stick with the queue lock? * The event will hang here until next I/O request * or next event is received. */ MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1", timeout * hz); sx_xlock(&sc->sc_lock); G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__); continue; } process: bioq_remove(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from->geom == sc->sc_sync.ds_geom && (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) { g_raid3_sync_request(bp); /* READ */ } else if (bp->bio_to != sc->sc_provider) { if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) g_raid3_regular_request(bp); else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) g_raid3_sync_request(bp); /* WRITE */ else { KASSERT(0, - ("Invalid request cflags=0x%hhx to=%s.", + ("Invalid request cflags=0x%hx to=%s.", bp->bio_cflags, bp->bio_to->name)); } } else if (g_raid3_register_request(bp) != 0) { mtx_lock(&sc->sc_queue_mtx); bioq_insert_head(&sc->sc_queue, bp); /* * We are short in memory, let see if there are finished * request we can free. */ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) goto process; } /* * No finished regular request, so at least keep * synchronization running. 
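 * Synchronization requests reuse the buffers preallocated in
 * g_raid3_sync_start(), so they can still make progress while
 * g_raid3_clone_bio() keeps failing for regular I/O.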
*/ TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) goto process; } sx_xunlock(&sc->sc_lock); MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:lowmem", hz / 10); sx_xlock(&sc->sc_lock); } G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__); } } static void g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk) { sx_assert(&sc->sc_lock, SX_LOCKED); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0) return; if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; } else if (sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) { G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", g_raid3_get_diskname(disk), sc->sc_name); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; } } static void g_raid3_sync_start(struct g_raid3_softc *sc) { struct g_raid3_disk *disk; struct g_consumer *cp; struct bio *bp; int error; u_int n; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Device not in DEGRADED state (%s, %u).", sc->sc_name, sc->sc_state)); KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).", sc->sc_name, sc->sc_state)); disk = NULL; for (n = 0; n < sc->sc_ndisks; n++) { if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) continue; disk = &sc->sc_disks[n]; break; } if (disk == NULL) return; sx_xunlock(&sc->sc_lock); g_topology_lock(); cp = g_new_consumer(sc->sc_sync.ds_geom); error = g_attach(cp, sc->sc_provider); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", sc->sc_name, error)); error = g_access(cp, 1, 0, 0); KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); g_topology_unlock(); sx_xlock(&sc->sc_lock); G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, g_raid3_get_diskname(disk)); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0) disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; KASSERT(disk->d_sync.ds_consumer == NULL, ("Sync consumer already exists (device=%s, disk=%s).", sc->sc_name, g_raid3_get_diskname(disk))); disk->d_sync.ds_consumer = cp; disk->d_sync.ds_consumer->private = disk; disk->d_sync.ds_consumer->index = 0; sc->sc_syncdisk = disk; /* * Allocate memory for synchronization bios and initialize them. */ disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs, M_RAID3, M_WAITOK); for (n = 0; n < g_raid3_syncreqs; n++) { bp = g_alloc_bio(); disk->d_sync.ds_bios[n] = bp; bp->bio_parent = NULL; bp->bio_cmd = BIO_READ; bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK); bp->bio_cflags = 0; bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1); bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1); bp->bio_done = g_raid3_sync_done; bp->bio_from = disk->d_sync.ds_consumer; bp->bio_to = sc->sc_provider; bp->bio_caller1 = (void *)(uintptr_t)n; } /* Set the number of in-flight synchronization requests. */ disk->d_sync.ds_inflight = g_raid3_syncreqs; /* * Fire off first synchronization requests. */ for (n = 0; n < g_raid3_syncreqs; n++) { bp = disk->d_sync.ds_bios[n]; G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); disk->d_sync.ds_consumer->index++; /* * Delay the request if it is colliding with a regular request. 
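 * A delayed request is queued on sc_sync_delayed and resubmitted from
 * g_raid3_sync_release() once the conflicting regular I/O completes.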
*/ if (g_raid3_regular_collision(sc, bp)) g_raid3_sync_delay(sc, bp); else g_io_request(bp, disk->d_sync.ds_consumer); } } /* * Stop synchronization process. * type: 0 - synchronization finished * 1 - synchronization stopped */ static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type) { struct g_raid3_disk *disk; struct g_consumer *cp; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_LOCKED); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, ("Device not in DEGRADED state (%s, %u).", sc->sc_name, sc->sc_state)); disk = sc->sc_syncdisk; sc->sc_syncdisk = NULL; KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name)); KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); if (disk->d_sync.ds_consumer == NULL) return; if (type == 0) { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.", sc->sc_name, g_raid3_get_diskname(disk)); } else /* if (type == 1) */ { G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.", sc->sc_name, g_raid3_get_diskname(disk)); } free(disk->d_sync.ds_bios, M_RAID3); disk->d_sync.ds_bios = NULL; cp = disk->d_sync.ds_consumer; disk->d_sync.ds_consumer = NULL; disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ g_topology_lock(); g_raid3_kill_consumer(sc, cp); g_topology_unlock(); sx_xlock(&sc->sc_lock); } static void g_raid3_launch_provider(struct g_raid3_softc *sc) { struct g_provider *pp; struct g_raid3_disk *disk; int n; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name); pp->mediasize = sc->sc_mediasize; pp->sectorsize = sc->sc_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_consumer && disk->d_consumer->provider && disk->d_consumer->provider->stripesize > pp->stripesize) { pp->stripesize = disk->d_consumer->provider->stripesize; pp->stripeoffset = disk->d_consumer->provider->stripeoffset; } } pp->stripesize *= sc->sc_ndisks - 1; pp->stripeoffset *= sc->sc_ndisks - 1; sc->sc_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name, g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks); if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED) g_raid3_sync_start(sc); } static void g_raid3_destroy_provider(struct g_raid3_softc *sc) { struct bio *bp; g_topology_assert_not(); KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", sc->sc_name)); g_topology_lock(); g_error_provider(sc->sc_provider, ENXIO); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_first(&sc->sc_queue)) != NULL) { bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, sc->sc_provider->name); sc->sc_provider->flags |= G_PF_WITHER; g_orphan_provider(sc->sc_provider, ENXIO); g_topology_unlock(); sc->sc_provider = NULL; if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); } static void g_raid3_go(void *arg) { struct g_raid3_softc *sc; sc = arg; G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); g_raid3_event_send(sc, 0, G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE); } static u_int g_raid3_determine_state(struct g_raid3_disk *disk) { struct g_raid3_softc *sc; u_int state; sc = disk->d_softc; if (sc->sc_syncid == disk->d_sync.ds_syncid) { if ((disk->d_flags & 
G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) { /* Disk does not need synchronization. */ state = G_RAID3_DISK_STATE_ACTIVE; } else { if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { /* * We can start synchronization from * the stored offset. */ state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { /* * Reset all synchronization data for this disk, * because if it even was synchronized, it was * synchronized to disks with different syncid. */ disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; disk->d_sync.ds_syncid = sc->sc_syncid; if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { state = G_RAID3_DISK_STATE_SYNCHRONIZING; } else { state = G_RAID3_DISK_STATE_STALE; } } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { /* * Not good, NOT GOOD! * It means that device was started on stale disks * and more fresh disk just arrive. * If there were writes, device is broken, sorry. * I think the best choice here is don't touch * this disk and inform the user loudly. */ G_RAID3_DEBUG(0, "Device %s was started before the freshest " "disk (%s) arrives!! It will not be connected to the " "running device.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); state = G_RAID3_DISK_STATE_NONE; /* Return immediately, because disk was destroyed. */ return (state); } G_RAID3_DEBUG(3, "State for %s disk: %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(state)); return (state); } /* * Update device state. */ static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force) { struct g_raid3_disk *disk; u_int state; sx_assert(&sc->sc_lock, SX_XLOCKED); switch (sc->sc_state) { case G_RAID3_DEVICE_STATE_STARTING: { u_int n, ndirty, ndisks, genid, syncid; KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* * Are we ready? We are, if all disks are connected or * one disk is missing and 'force' is true. */ if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) { if (!force) callout_drain(&sc->sc_callout); } else { if (force) { /* * Timeout expired, so destroy device. */ sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } return; } /* * Find the biggest genid. */ genid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid > genid) genid = disk->d_genid; } sc->sc_genid = genid; /* * Remove all disks without the biggest genid. */ for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if (disk->d_genid < genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", g_raid3_get_diskname(disk), sc->sc_name); g_raid3_destroy_disk(disk); } } /* * There must be at least 'sc->sc_ndisks - 1' components * with the same syncid and without SYNCHRONIZING flag. */ /* * Find the biggest syncid, number of valid components and * number of dirty components. 
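 * Only components carrying the newest syncid and not flagged as
 * SYNCHRONIZING count as valid; the check below requires at least
 * sc_ndisks - 1 of them before the array may start.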
*/ ndirty = ndisks = syncid = 0; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) ndirty++; if (disk->d_sync.ds_syncid > syncid) { syncid = disk->d_sync.ds_syncid; ndisks = 0; } else if (disk->d_sync.ds_syncid < syncid) { continue; } if ((disk->d_flags & G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) { continue; } ndisks++; } /* * Do we have enough valid components? */ if (ndisks + 1 < sc->sc_ndisks) { G_RAID3_DEBUG(0, "Device %s is broken, too few valid components.", sc->sc_name); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } /* * If there is one DIRTY component and all disks are present, * mark it for synchronization. If there is more than one DIRTY * component, mark parity component for synchronization. */ if (ndisks == sc->sc_ndisks && ndirty == 1) { for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { continue; } disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } } else if (ndisks == sc->sc_ndisks && ndirty > 1) { disk = &sc->sc_disks[sc->sc_ndisks - 1]; disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; } sc->sc_syncid = syncid; if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } if (ndisks == sc->sc_ndisks) state = G_RAID3_DEVICE_STATE_COMPLETE; else /* if (ndisks == sc->sc_ndisks - 1) */ state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; for (n = 0; n < sc->sc_ndisks; n++) { disk = &sc->sc_disks[n]; if (disk->d_state == G_RAID3_DISK_STATE_NODISK) continue; state = g_raid3_determine_state(disk); g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT); if (state == G_RAID3_DISK_STATE_STALE) sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; } break; } case G_RAID3_DEVICE_STATE_DEGRADED: /* * Genid need to be bumped immediately, so do it here. */ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { if (sc->sc_provider != NULL) g_raid3_destroy_provider(sc); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; return; } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks) { state = G_RAID3_DEVICE_STATE_COMPLETE; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; case G_RAID3_DEVICE_STATE_COMPLETE: /* * Genid need to be bumped immediately, so do it here. 
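 * (Typically a component has just been disconnected after an I/O
 * error; raising the generation id on the surviving components keeps
 * the stale one from being silently accepted back later.)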
*/ if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; g_raid3_bump_genid(sc); } if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) return; KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >= sc->sc_ndisks - 1, ("Too few ACTIVE components in COMPLETE state (device %s).", sc->sc_name)); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == sc->sc_ndisks - 1) { state = G_RAID3_DEVICE_STATE_DEGRADED; G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_device_state2str(state)); sc->sc_state = state; } if (sc->sc_provider == NULL) g_raid3_launch_provider(sc); if (sc->sc_rootmount != NULL) { G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } break; default: KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state))); break; } } /* * Update disk state and device state if needed. */ #define DISK_STATE_CHANGED() G_RAID3_DEBUG(1, \ "Disk %s state changed from %s to %s (device %s).", \ g_raid3_get_diskname(disk), \ g_raid3_disk_state2str(disk->d_state), \ g_raid3_disk_state2str(state), sc->sc_name) static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state) { struct g_raid3_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); again: G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state), g_raid3_disk_state2str(state)); switch (state) { case G_RAID3_DISK_STATE_NEW: /* * Possible scenarios: * 1. New disk arrive. */ /* Previous state should be NONE. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_state = state; G_RAID3_DEBUG(1, "Device %s: provider %s detected.", sc->sc_name, g_raid3_get_diskname(disk)); if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) break; KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); state = g_raid3_determine_state(disk); if (state != G_RAID3_DISK_STATE_NONE) goto again; break; case G_RAID3_DISK_STATE_ACTIVE: /* * Possible scenarios: * 1. New disk does not need synchronization. * 2. Synchronization process finished successfully. */ KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* Previous state should be NEW or SYNCHRONIZING. 
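 * (NEW when the disk did not need synchronization at all,
 * SYNCHRONIZING when a rebuild has just finished.)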
*/ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING; disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC; g_raid3_sync_stop(sc, 0); } disk->d_state = state; disk->d_sync.ds_offset = 0; disk->d_sync.ds_offset_done = 0; g_raid3_update_idle(sc, disk); g_raid3_update_metadata(disk); G_RAID3_DEBUG(1, "Device %s: provider %s activated.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_STALE: /* * Possible scenarios: * 1. Stale disk was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * STALE state is only possible if device is marked * NOAUTOSYNC. */ KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; g_raid3_update_metadata(disk); G_RAID3_DEBUG(0, "Device %s: provider %s is stale.", sc->sc_name, g_raid3_get_diskname(disk)); break; case G_RAID3_DISK_STATE_SYNCHRONIZING: /* * Possible scenarios: * 1. Disk which needs synchronization was connected. */ /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); DISK_STATE_CHANGED(); if (disk->d_state == G_RAID3_DISK_STATE_NEW) disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; disk->d_state = state; if (sc->sc_provider != NULL) { g_raid3_sync_start(sc); g_raid3_update_metadata(disk); } break; case G_RAID3_DISK_STATE_DISCONNECTED: /* * Possible scenarios: * 1. Device wasn't running yet, but disk disappear. * 2. Disk was active and disapppear. * 3. Disk disappear during synchronization process. */ if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { /* * Previous state should be ACTIVE, STALE or * SYNCHRONIZING. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE || disk->d_state == G_RAID3_DISK_STATE_STALE || disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); } else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) { /* Previous state should be NEW. */ KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); /* * Reset bumping syncid if disk disappeared in STARTING * state. 
*/ if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; #ifdef INVARIANTS } else { KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, g_raid3_device_state2str(sc->sc_state), g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state))); #endif } DISK_STATE_CHANGED(); G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.", sc->sc_name, g_raid3_get_diskname(disk)); g_raid3_destroy_disk(disk); break; default: KASSERT(1 == 0, ("Unknown state (%u).", state)); break; } return (0); } #undef DISK_STATE_CHANGED int g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata are stored on last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = raid3_metadata_decode(buf, md); g_free(buf); if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0) return (EINVAL); if (md->md_version > G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } if (error != 0) { G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } if (md->md_sectorsize > MAXPHYS) { G_RAID3_DEBUG(0, "The blocksize is too big."); return (EINVAL); } return (0); } static int g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { if (md->md_no >= sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.", pp->name, md->md_no); return (EINVAL); } if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) { G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.", pp->name, md->md_no); return (EEXIST); } if (md->md_all != sc->sc_ndisks) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_all", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % md->md_sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != " "0) on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if (md->md_mediasize != sc->sc_mediasize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_mediasize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { G_RAID3_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if (md->md_sectorsize != sc->sc_sectorsize) { G_RAID3_DEBUG(1, "Invalid '%s' field on disk %s (device %s), skipping.", "md_sectorsize", pp->name, sc->sc_name); return (EINVAL); } if ((sc->sc_sectorsize % pp->sectorsize) != 0) { G_RAID3_DEBUG(1, "Invalid sector size of disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) { 
G_RAID3_DEBUG(1, "Invalid device flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { /* * VERIFY and ROUND-ROBIN options are mutally exclusive. */ G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on " "disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) { G_RAID3_DEBUG(1, "Invalid disk flags on disk %s (device %s), skipping.", pp->name, sc->sc_name); return (EINVAL); } return (0); } int g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, struct g_raid3_metadata *md) { struct g_raid3_disk *disk; int error; g_topology_assert_not(); G_RAID3_DEBUG(2, "Adding disk %s.", pp->name); error = g_raid3_check_metadata(sc, pp, md); if (error != 0) return (error); if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING && md->md_genid < sc->sc_genid) { G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } disk = g_raid3_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW, G_RAID3_EVENT_WAIT); if (error != 0) return (error); if (md->md_version < G_RAID3_VERSION) { G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", pp->name, md->md_version, G_RAID3_VERSION); g_raid3_update_metadata(disk); } return (0); } static void g_raid3_destroy_delayed(void *arg, int flag) { struct g_raid3_softc *sc; int error; if (flag == EV_CANCEL) { G_RAID3_DEBUG(1, "Destroying canceled."); return; } sc = arg; g_topology_unlock(); sx_xlock(&sc->sc_lock); KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0, ("DESTROY flag set on %s.", sc->sc_name)); KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0, ("DESTROYING flag not set on %s.", sc->sc_name)); G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name); error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT); if (error != 0) { G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name); sx_xunlock(&sc->sc_lock); } g_topology_lock(); } static int g_raid3_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_raid3_softc *sc; int dcr, dcw, dce, error = 0; g_topology_assert(); G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); sc = pp->geom->softc; if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0) return (0); KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); dcr = pp->acr + acr; dcw = pp->acw + acw; dce = pp->ace + ace; g_topology_unlock(); sx_xlock(&sc->sc_lock); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 || g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { if (acr > 0 || acw > 0 || ace > 0) error = ENXIO; goto end; } if (dcw == 0) g_raid3_idle(sc, dcw); if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) { if (acr > 0 || acw > 0 || ace > 0) { error = ENXIO; goto end; } if (dcr == 0 && dcw == 0 && dce == 0) { g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK, sc, NULL); } } end: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static struct g_geom * g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md) { struct g_raid3_softc *sc; struct g_geom *gp; int error, timeout; u_int n; g_topology_assert(); G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* One disk is minimum. */ if (md->md_all < 1) return (NULL); /* * Action geom. 
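 * The action geom owns the provider-facing side of the device: it
 * accepts regular I/O through g_raid3_start() and hands it to the
 * worker thread.  A separate "%s.sync" geom, created below, only owns
 * the consumers used for synchronization.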
*/ gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO); sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3, M_WAITOK | M_ZERO); gp->start = g_raid3_start; gp->orphan = g_raid3_orphan; gp->access = g_raid3_access; gp->dumpconf = g_raid3_dumpconf; sc->sc_id = md->md_id; sc->sc_mediasize = md->md_mediasize; sc->sc_sectorsize = md->md_sectorsize; sc->sc_ndisks = md->md_all; sc->sc_round_robin = 0; sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; sc->sc_writes = 0; for (n = 0; n < sc->sc_ndisks; n++) { sc->sc_disks[n].d_softc = sc; sc->sc_disks[n].d_no = n; sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK; } sx_init(&sc->sc_lock, "graid3:lock"); bioq_init(&sc->sc_queue); mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF); bioq_init(&sc->sc_regular_delayed); bioq_init(&sc->sc_inflight); bioq_init(&sc->sc_sync_delayed); TAILQ_INIT(&sc->sc_events); mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF); callout_init(&sc->sc_callout, 1); sc->sc_state = G_RAID3_DEVICE_STATE_STARTING; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; /* * Synchronization geom. */ gp = g_new_geomf(mp, "%s.sync", md->md_name); gp->softc = sc; gp->orphan = g_raid3_orphan; sc->sc_sync.ds_geom = gp; if (!g_raid3_use_malloc) { sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k", 65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k; sc->sc_zones[G_RAID3_ZONE_64K].sz_requested = sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0; sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k", 16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k; sc->sc_zones[G_RAID3_ZONE_16K].sz_requested = sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0; sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k", 4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0; sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k; sc->sc_zones[G_RAID3_ZONE_4K].sz_requested = sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0; } error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0, "g_raid3 %s", md->md_name); if (error != 0) { G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.", sc->sc_name); if (!g_raid3_use_malloc) { uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); } g_destroy_geom(sc->sc_sync.ds_geom); mtx_destroy(&sc->sc_events_mtx); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (NULL); } G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).", sc->sc_name, sc->sc_ndisks, sc->sc_id); sc->sc_rootmount = root_mount_hold("GRAID3"); G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); /* * Run timeout. 
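 * If the remaining components do not show up within g_raid3_timeout
 * seconds, g_raid3_go() forces a device update so the array either
 * starts degraded or is torn down (see the STARTING case in
 * g_raid3_update_device()).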
*/ timeout = atomic_load_acq_int(&g_raid3_timeout); callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc); return (sc->sc_geom); } int g_raid3_destroy(struct g_raid3_softc *sc, int how) { struct g_provider *pp; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { switch (how) { case G_RAID3_DESTROY_SOFT: G_RAID3_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); case G_RAID3_DESTROY_DELAYED: G_RAID3_DEBUG(1, "Device %s will be destroyed on last close.", pp->name); if (sc->sc_syncdisk != NULL) g_raid3_sync_stop(sc, 1); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING; return (EBUSY); case G_RAID3_DESTROY_HARD: G_RAID3_DEBUG(1, "Device %s is still open, so it " "can't be definitely removed.", pp->name); break; } } g_topology_lock(); if (sc->sc_geom->softc == NULL) { g_topology_unlock(); return (0); } sc->sc_geom->softc = NULL; sc->sc_sync.ds_geom->softc = NULL; g_topology_unlock(); sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT; G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); sx_xunlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); wakeup(sc); wakeup(&sc->sc_queue); mtx_unlock(&sc->sc_queue_mtx); G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); while (sc->sc_worker != NULL) tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5); G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); sx_xlock(&sc->sc_lock); g_raid3_destroy_device(sc); free(sc->sc_disks, M_RAID3); free(sc, M_RAID3); return (0); } static void g_raid3_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_raid3_metadata md; struct g_raid3_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); G_RAID3_DEBUG(2, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "raid3:taste"); /* This orphan function should be never called. */ gp->orphan = g_raid3_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_raid3_read_metadata(cp, &md); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_raid3_debug >= 2) raid3_metadata_dump(&md); /* * Let's check if device already exists. 
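 * A geom with the same name but a different metadata id means the
 * tasted component belongs to some other array, so it is rejected
 * instead of being attached to the running device.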
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_sync.ds_geom == gp) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) { G_RAID3_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } break; } if (gp == NULL) { gp = g_raid3_create(mp, &md); if (gp == NULL) { G_RAID3_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; } G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); g_topology_unlock(); sx_xlock(&sc->sc_lock); error = g_raid3_add_disk(sc, pp, &md); if (error != 0) { G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) == sc->sc_ndisks) { g_cancel_event(sc); g_raid3_destroy(sc, G_RAID3_DESTROY_HARD); g_topology_lock(); return (NULL); } gp = NULL; } sx_xunlock(&sc->sc_lock); g_topology_lock(); return (gp); } static int g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_raid3_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_raid3_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { struct g_raid3_disk *disk; disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s", indent); if (disk->d_no == sc->sc_ndisks - 1) sbuf_printf(sb, "PARITY"); else sbuf_printf(sb, "DATA"); sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, (u_int)disk->d_no); if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { sbuf_printf(sb, "%s", indent); if (disk->d_sync.ds_offset == 0) sbuf_printf(sb, "0%%"); else { sbuf_printf(sb, "%u%%", (u_int)((disk->d_sync.ds_offset * 100) / (sc->sc_mediasize / (sc->sc_ndisks - 1)))); } sbuf_printf(sb, "\n"); if (disk->d_sync.ds_offset > 0) { sbuf_printf(sb, "%s%jd" "\n", indent, (intmax_t)disk->d_sync.ds_offset); } } sbuf_printf(sb, "%s%u\n", indent, disk->d_sync.ds_syncid); sbuf_printf(sb, "%s%u\n", indent, disk->d_genid); sbuf_printf(sb, "%s", indent); if (disk->d_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((disk->d_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY"); ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED"); ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING, "SYNCHRONIZING"); ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%s\n", indent, g_raid3_disk_state2str(disk->d_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); if (!g_raid3_use_malloc) { sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_failed); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_16K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, 
sc->sc_zones[G_RAID3_ZONE_16K].sz_failed); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_requested); sbuf_printf(sb, "%s%u\n", indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_failed); } sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s%u\n", indent, sc->sc_syncid); sbuf_printf(sb, "%s%u\n", indent, sc->sc_genid); sbuf_printf(sb, "%s", indent); if (sc->sc_flags == 0) sbuf_printf(sb, "NONE"); else { int first = 1; #define ADD_FLAG(flag, name) do { \ if ((sc->sc_flags & (flag)) != 0) { \ if (!first) \ sbuf_printf(sb, ", "); \ else \ first = 0; \ sbuf_printf(sb, name); \ } \ } while (0) ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC"); ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN, "ROUND-ROBIN"); ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY"); #undef ADD_FLAG } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%u\n", indent, sc->sc_ndisks); sbuf_printf(sb, "%s%s\n", indent, g_raid3_device_state2str(sc->sc_state)); sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_raid3_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_raid3_softc *sc; int error; mp = arg; DROP_GIANT(); g_topology_lock(); g_raid3_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; /* Skip synchronization geom. */ if (gp == sc->sc_sync.ds_geom) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); g_raid3_idle(sc, -1); g_cancel_event(sc); error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); PICKUP_GIANT(); } static void g_raid3_init(struct g_class *mp) { g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_raid3_post_sync == NULL) G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event."); } static void g_raid3_fini(struct g_class *mp) { if (g_raid3_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync); } DECLARE_GEOM_CLASS(g_raid3_class, g_raid3); Index: head/sys/sys/bio.h =================================================================== --- head/sys/sys/bio.h (revision 297954) +++ head/sys/sys/bio.h (revision 297955) @@ -1,159 +1,159 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
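[Editor's note] Both flag dumps above use the local ADD_FLAG macro to turn a bit mask into a comma-separated list of names, or NONE when no bit is set. A user-space sketch of the same idea; the names are taken from the code above but the bit values here are examples only.

#include <stddef.h>
#include <stdio.h>

static void
print_flag_names(unsigned flags)
{
	static const struct {
		unsigned	bit;
		const char	*name;
	} tab[] = {
		{ 0x01, "NOFAILSYNC" },		/* example bit values */
		{ 0x02, "NOAUTOSYNC" },
		{ 0x04, "ROUND-ROBIN" },
		{ 0x08, "VERIFY" },
	};
	int first = 1;
	size_t i;

	if (flags == 0) {
		printf("NONE\n");
		return;
	}
	for (i = 0; i < sizeof(tab) / sizeof(tab[0]); i++) {
		if ((flags & tab[i].bit) == 0)
			continue;
		printf("%s%s", first ? "" : ", ", tab[i].name);
		first = 0;
	}
	printf("\n");
}

int
main(void)
{
	print_flag_names(0x05);		/* NOFAILSYNC, ROUND-ROBIN */
	return (0);
}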
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _SYS_BIO_H_ #define _SYS_BIO_H_ #include /* bio_cmd */ #define BIO_READ 0x01 /* Read I/O data */ #define BIO_WRITE 0x02 /* Write I/O data */ #define BIO_DELETE 0x04 /* TRIM or free blocks, i.e. mark as unused */ #define BIO_GETATTR 0x08 /* Get GEOM attributes of object */ #define BIO_FLUSH 0x10 /* Commit outstanding I/O now */ #define BIO_CMD0 0x20 /* Available for local hacks */ #define BIO_CMD1 0x40 /* Available for local hacks */ #define BIO_CMD2 0x80 /* Available for local hacks */ /* bio_flags */ #define BIO_ERROR 0x01 /* An error occurred processing this bio. */ #define BIO_DONE 0x02 /* This bio is finished. */ #define BIO_ONQUEUE 0x04 /* This bio is in a queue & not yet taken. */ /* * This bio must be executed after all previous bios in the queue have been * executed, and before any successive bios can be executed. */ #define BIO_ORDERED 0x08 #define BIO_UNMAPPED 0x10 #define BIO_TRANSIENT_MAPPING 0x20 #define BIO_VLIST 0x40 #ifdef _KERNEL struct disk; struct bio; struct vm_map; /* Empty classifier tag, to prevent further classification. */ #define BIO_NOTCLASSIFIED (void *)(~0UL) typedef void bio_task_t(void *); /* * The bio structure describes an I/O operation in the kernel. */ struct bio { - uint8_t bio_cmd; /* I/O operation. */ - uint8_t bio_flags; /* General flags. */ - uint8_t bio_cflags; /* Private use by the consumer. */ - uint8_t bio_pflags; /* Private use by the provider. */ + uint16_t bio_cmd; /* I/O operation. */ + uint16_t bio_flags; /* General flags. */ + uint16_t bio_cflags; /* Private use by the consumer. */ + uint16_t bio_pflags; /* Private use by the provider. */ struct cdev *bio_dev; /* Device to do I/O on. */ struct disk *bio_disk; /* Valid below geom_disk.c only */ off_t bio_offset; /* Offset into file. */ long bio_bcount; /* Valid bytes in buffer. */ caddr_t bio_data; /* Memory, superblocks, indirect etc. */ struct vm_page **bio_ma; /* Or unmapped. */ int bio_ma_offset; /* Offset in the first page of bio_ma. */ int bio_ma_n; /* Number of pages in bio_ma. */ int bio_error; /* Errno for BIO_ERROR. */ long bio_resid; /* Remaining I/O in bytes. */ void (*bio_done)(struct bio *); void *bio_driver1; /* Private use by the provider. */ void *bio_driver2; /* Private use by the provider. */ void *bio_caller1; /* Private use by the consumer. */ void *bio_caller2; /* Private use by the consumer. */ TAILQ_ENTRY(bio) bio_queue; /* Disksort queue. 
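[Editor's note] The only functional change in this bio.h hunk is widening bio_cmd, bio_flags, bio_cflags and bio_pflags from uint8_t to uint16_t. Judging from the defines above, BIO_CMD2 (0x80) already occupied the last command bit an 8-bit field could hold, and BIO_VLIST (0x40) left a single spare flag bit. The snippet below is not from the source; it only shows the headroom the wider type provides.

#include <stdint.h>
#include <stdio.h>

#define	BIO_CMD2	0x80	/* last command bit that fits in 8 bits */

int
main(void)
{
	uint8_t  narrow = (uint8_t)(BIO_CMD2 << 1);	/* wraps to 0 */
	uint16_t wide = BIO_CMD2 << 1;			/* 0x100: still usable */

	printf("8-bit: 0x%x, 16-bit: 0x%x\n", (unsigned)narrow, (unsigned)wide);
	return (0);
}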
*/ const char *bio_attribute; /* Attribute for BIO_[GS]ETATTR */ struct g_consumer *bio_from; /* GEOM linkage */ struct g_provider *bio_to; /* GEOM linkage */ off_t bio_length; /* Like bio_bcount */ off_t bio_completed; /* Inverse of bio_resid */ u_int bio_children; /* Number of spawned bios */ u_int bio_inbed; /* Children safely home by now */ struct bio *bio_parent; /* Pointer to parent */ struct bintime bio_t0; /* Time request started */ bio_task_t *bio_task; /* Task_queue handler */ void *bio_task_arg; /* Argument to above */ void *bio_classifier1; /* Classifier tag. */ void *bio_classifier2; /* Classifier tag. */ #ifdef DIAGNOSTIC void *_bio_caller1; void *_bio_caller2; uint8_t _bio_cflags; #endif /* XXX: these go away when bio chaining is introduced */ daddr_t bio_pblkno; /* physical block number */ }; struct uio; struct devstat; struct bio_queue_head { TAILQ_HEAD(bio_queue, bio) queue; off_t last_offset; struct bio *insert_point; }; extern struct vm_map *bio_transient_map; extern int bio_transient_maxcnt; void biodone(struct bio *bp); void biofinish(struct bio *bp, struct devstat *stat, int error); int biowait(struct bio *bp, const char *wchan); void bioq_disksort(struct bio_queue_head *ap, struct bio *bp); struct bio *bioq_first(struct bio_queue_head *head); struct bio *bioq_takefirst(struct bio_queue_head *head); void bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error); void bioq_init(struct bio_queue_head *head); void bioq_insert_head(struct bio_queue_head *head, struct bio *bp); void bioq_insert_tail(struct bio_queue_head *head, struct bio *bp); void bioq_remove(struct bio_queue_head *head, struct bio *bp); void bio_taskqueue(struct bio *bp, bio_task_t *fund, void *arg); int physio(struct cdev *dev, struct uio *uio, int ioflag); #define physread physio #define physwrite physio #endif /* _KERNEL */ #endif /* !_SYS_BIO_H_ */ Index: head/sys/sys/buf.h =================================================================== --- head/sys/sys/buf.h (revision 297954) +++ head/sys/sys/buf.h (revision 297955) @@ -1,552 +1,552 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
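[Editor's note] struct bio_queue_head above is a TAILQ of bios, and the bioq_* helpers declared next to it implement the usual insert-at-tail / take-from-head pattern a disk driver uses between its strategy routine and its worker. Below is a user-space analogue built directly on <sys/queue.h>, with an invented request type and locking omitted for brevity.

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct req {
	long			offset;
	TAILQ_ENTRY(req)	link;
};
TAILQ_HEAD(reqq, req);

int
main(void)
{
	struct reqq q = TAILQ_HEAD_INITIALIZER(q);
	struct req *r;
	long i;

	for (i = 0; i < 3; i++) {
		r = malloc(sizeof(*r));
		if (r == NULL)
			return (1);
		r->offset = i * 512;
		TAILQ_INSERT_TAIL(&q, r, link);		/* cf. bioq_insert_tail() */
	}
	while ((r = TAILQ_FIRST(&q)) != NULL) {		/* cf. bioq_takefirst() */
		TAILQ_REMOVE(&q, r, link);
		printf("servicing offset %ld\n", r->offset);
		free(r);
	}
	return (0);
}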
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include #include #include struct bio; struct buf; struct bufobj; struct mount; struct vnode; struct uio; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start)(struct buf *); void (*io_complete)(struct buf *); void (*io_deallocate)(struct buf *); int (*io_countdeps)(struct buf *, int); } bioops; struct vm_object; typedef unsigned char b_xflags_t; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. Number of bytes remaining in I/O. After an I/O operation * completes, b_resid is usually 0 indicating 100% success. * * All fields are protected by the buffer lock except those marked: * V - Protected by owning bufobj lock * Q - Protected by the buf queue lock * D - Protected by an dependency implementation specific lock */ struct buf { struct bufobj *b_bufobj; long b_bcount; void *b_caller1; caddr_t b_data; int b_error; - uint8_t b_iocmd; /* BIO_* bio_cmd from bio.h */ - uint8_t b_ioflags; /* BIO_* bio_flags from bio.h */ + uint16_t b_iocmd; /* BIO_* bio_cmd from bio.h */ + uint16_t b_ioflags; /* BIO_* bio_flags from bio.h */ off_t b_iooffset; long b_resid; void (*b_iodone)(struct buf *); daddr_t b_blkno; /* Underlying physical block number. */ off_t b_offset; /* Offset into file. */ TAILQ_ENTRY(buf) b_bobufs; /* (V) Buffer's associated vnode. */ uint32_t b_vflags; /* (V) BV_* flags */ unsigned short b_qindex; /* (Q) buffer queue index */ uint32_t b_flags; /* B_* flags. */ b_xflags_t b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ long b_bufsize; /* Allocated buffer size. */ int b_runningbufspace; /* when I/O is running, pipelining */ int b_kvasize; /* size of kva for buffer */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ caddr_t b_kvabase; /* base kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. 
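[Editor's note] b_iocmd and b_ioflags carry BIO_* values taken from struct bio (as their comments say), which is why this hunk widens them to uint16_t together with the bio.h change. The fragment below is a hypothetical compile-time check, not something present in the headers, that would catch the two definitions drifting apart; the mini_* types stand in for the real structs.

#include <stdint.h>

struct mini_bio {
	uint16_t	bio_cmd;
	uint16_t	bio_flags;
};

struct mini_buf {
	uint16_t	b_iocmd;	/* mirrors bio_cmd */
	uint16_t	b_ioflags;	/* mirrors bio_flags */
};

_Static_assert(sizeof(((struct mini_bio *)0)->bio_cmd) ==
    sizeof(((struct mini_buf *)0)->b_iocmd),
    "bio_cmd and b_iocmd must stay the same width");

int
main(void)
{
	return (0);
}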
*/ union { TAILQ_ENTRY(buf) b_freelist; /* (Q) */ struct { void (*b_pgiodone)(void *, vm_page_t *, int, int); int b_pgbefore; int b_pgafter; }; }; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* (D) List of filesystem dependencies. */ void *b_fsprivate1; void *b_fsprivate2; void *b_fsprivate3; int b_pin_count; }; #define b_object b_bufobj->bo_object /* * These flags are kept in b_flags. * * Notes: * * B_ASYNC VOP calls on bp's are usually async whether or not * B_ASYNC is set, but some subsystems, such as NFS, like * to know what is best for the caller so they can * optimize the I/O. * * B_PAGING Indicates that bp is being used by the paging system or * some paging system and that the bp is not linked into * the b_vp's clean/dirty linked lists or ref counts. * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. * The situation where B_DELWRI is set and B_CACHE is * clear MUST be committed to disk by getblk() so * B_DELWRI can also be cleared. See the comments for * getblk() in kern/vfs_bio.c. If B_CACHE is clear, * the caller is expected to clear BIO_ERROR and B_INVAL, * set BIO_READ, and initiate an I/O. * * The 'entire buffer' is defined to be the range from * 0 through b_bcount. * * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * * B_CLUSTEROK This flag is typically set for B_DELWRI buffers * by filesystems that allow clustering when the buffer * is fully dirty and indicates that it may be clustered * with other adjacent dirty buffers. Note the clustering * may not be used with the stage 1 data write under NFS * but may be used for the commit rpc portion. * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * * B_DIRECT Hint that we should attempt to completely free * the pages underlying the buffer. B_DIRECT is * sticky until the buffer is released and typically * only has an effect when B_RELBUF is also set. * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_PERSISTENT 0x00000100 /* Perm. ref'ed while EXT2FS mounted. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_NOREUSE 0x00000800 /* Contents not reused once released. */ #define B_00001000 0x00001000 /* Available flag. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_BARRIER 0x00004000 /* Write this and all preceeding first. */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_00040000 0x00040000 /* Available flag. */ #define B_00080000 0x00080000 /* Available flag. 
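[Editor's note] One invariant documented above is worth spelling out: a buffer with B_DELWRI set and B_CACHE clear holds delayed-write data that is no longer fully valid in memory and must be committed to disk before the buffer can be reused. A tiny sketch of that predicate, using the flag values defined in this header and an invented helper name:

#include <stdint.h>
#include <stdio.h>

#define	B_CACHE		0x00000020	/* buffer contents entirely valid */
#define	B_DELWRI	0x00000080	/* delayed write pending */

static int
must_flush_before_reuse(uint32_t b_flags)
{

	return ((b_flags & B_DELWRI) != 0 && (b_flags & B_CACHE) == 0);
}

int
main(void)
{
	printf("%d\n", must_flush_before_reuse(B_DELWRI));		/* 1 */
	printf("%d\n", must_flush_before_reuse(B_DELWRI | B_CACHE));	/* 0 */
	return (0);
}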
*/ #define B_00100000 0x00100000 /* Available flag. */ #define B_DIRTY 0x00200000 /* Needs writing later (in EXT2FS). */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_FS_FLAG1 0x00800000 /* Available flag for FS use. */ #define B_NOCOPY 0x01000000 /* Don't copy-on-write this buf. */ #define B_INFREECNT 0x02000000 /* buf is counted in numfreebufs */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_MANAGED 0x08000000 /* Managed by FS. */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_REMFREE 0x80000000 /* Delayed bremfree */ #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \ "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26dirty\25b20" \ "\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \ "\15b12\14noreuse\13eintr\12done\11persist\10delwri" \ "\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age" /* * These flags are kept in b_xflags. */ #define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ #define BX_VNCLEAN 0x00000002 /* On vnode clean list */ #define BX_BKGRDWRITE 0x00000010 /* Do writes in background */ #define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */ #define BX_ALTDATA 0x00000040 /* Holds extended data */ #define PRINT_BUF_XFLAGS "\20\7altdata\6bkgrdmarker\5bkgrdwrite\2clean\1dirty" #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ /* * These flags are kept in b_vflags. */ #define BV_SCANNED 0x00000001 /* VOP_FSYNC funcs mark written bufs */ #define BV_BKGRDINPROG 0x00000002 /* Background write in progress */ #define BV_BKGRDWAIT 0x00000004 /* Background write waiting */ #define BV_BKGRDERR 0x00000008 /* Error from background write */ #define PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned" #ifdef _KERNEL /* * Buffer locking */ extern const char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curthread */ #include /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) /* * * Get a lock sleeping non-interruptably until it becomes available. */ #define BUF_LOCK(bp, locktype, interlock) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype), (interlock), \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE) /* * Get a lock sleeping with specified interruptably and timeout. */ #define BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK, \ (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo), \ LOCK_FILE, LOCK_LINE) /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ #define BUF_UNLOCK(bp) do { \ KASSERT(((bp)->b_flags & B_REMFREE) == 0, \ ("BUF_UNLOCK %p while B_REMFREE is still set.", (bp))); \ \ (void)_lockmgr_args(&(bp)->b_lock, LK_RELEASE, NULL, \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE); \ } while (0) /* * Check if a buffer lock is recursed. */ #define BUF_LOCKRECURSED(bp) \ lockmgr_recursed(&(bp)->b_lock) /* * Check if a buffer lock is currently held. */ #define BUF_ISLOCKED(bp) \ lockstatus(&(bp)->b_lock) /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ lockdestroy(&(bp)->b_lock) /* * Print informations on a buffer lock. */ #define BUF_LOCKPRINTINFO(bp) \ lockmgr_printinfo(&(bp)->b_lock) /* * Buffer lock assertions. 
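[Editor's note] PRINT_BUF_FLAGS above is a kernel "%b"-style bit-name string: the first byte selects the print base (\20 is hexadecimal), and each following entry is a 1-based bit number (a control character) followed by that bit's name. The decoder below is a user-space sketch of how such a string is consumed, written for illustration rather than copied from the kernel's printf.

#include <stdio.h>

static void
print_bits(unsigned v, const char *bits)
{
	int any = 0;
	char c;

	printf(*bits == 8 ? "%o" : "%x", v);	/* \10 octal, \20 hex */
	bits++;
	while ((c = *bits++) != '\0') {
		if (v & (1u << (c - 1))) {
			putchar(any ? ',' : '<');
			any = 1;
			while (*bits > ' ')	/* copy the name */
				putchar(*bits++);
		} else {
			while (*bits > ' ')	/* skip the name */
				bits++;
		}
	}
	if (any)
		putchar('>');
	putchar('\n');
}

int
main(void)
{
	/* Trimmed version of PRINT_BUF_FLAGS: bit 8 = delwri, bit 3 = async. */
	print_bits(0x84, "\20\10delwri\3async");	/* 84<delwri,async> */
	return (0);
}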
*/ #if defined(INVARIANTS) && defined(INVARIANT_SUPPORT) #define BUF_ASSERT_LOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_LOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_SLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_SLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_XLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_UNLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_HELD(bp) #define BUF_ASSERT_UNHELD(bp) #else #define BUF_ASSERT_LOCKED(bp) #define BUF_ASSERT_SLOCKED(bp) #define BUF_ASSERT_XLOCKED(bp) #define BUF_ASSERT_UNLOCKED(bp) #define BUF_ASSERT_HELD(bp) #define BUF_ASSERT_UNHELD(bp) #endif #ifdef _SYS_PROC_H_ /* Avoid #include pollution */ /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ #define BUF_KERNPROC(bp) \ _lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE) #endif /* * Find out if the lock has waiters or not. */ #define BUF_LOCKWAITERS(bp) \ lockmgr_waiters(&(bp)->b_lock) #endif /* _KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. */ long bs_bufsize; /* Saved b_bufsize. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; #ifdef _KERNEL static __inline int bwrite(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL, ("bwrite: no bop_write bp=%p", bp)); return (BO_WRITE(bp->b_bufobj, bp)); } static __inline void bstrategy(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bstrategy: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL, ("bstrategy: no bop_strategy bp=%p", bp)); BO_STRATEGY(bp->b_bufobj, bp); } static __inline void buf_start(struct buf *bp) { if (bioops.io_start) (*bioops.io_start)(bp); } static __inline void buf_complete(struct buf *bp) { if (bioops.io_complete) (*bioops.io_complete)(bp); } static __inline void buf_deallocate(struct buf *bp) { if (bioops.io_deallocate) (*bioops.io_deallocate)(bp); } static __inline int buf_countdeps(struct buf *bp, int i) { if (bioops.io_countdeps) return ((*bioops.io_countdeps)(bp, i)); else return (0); } #endif /* _KERNEL */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* * Flags for getblk's last parameter. */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ #define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */ #define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */ #define GB_KVAALLOC 0x0010 /* But allocate KVA. 
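[Editor's note] buf_start(), buf_complete(), buf_deallocate() and buf_countdeps() above all follow the same shape: call through the single global bioops table only if a hook has been registered (the soft dependency code being the one registrant mentioned at the top of this header). A self-contained sketch of that optional-hook pattern, with invented names:

#include <stdio.h>

struct io_ops {
	void	(*io_start)(void *);
	void	(*io_complete)(void *);
};

static struct io_ops ioops;		/* all NULL until a client registers */

static void
io_start(void *bp)
{

	if (ioops.io_start != NULL)	/* dispatch only when a hook exists */
		ioops.io_start(bp);
}

static void
softdep_style_start(void *bp)
{

	printf("hook called for %p\n", bp);
}

int
main(void)
{
	int dummy;

	io_start(&dummy);			/* nothing registered: no-op */
	ioops.io_start = softdep_style_start;
	io_start(&dummy);			/* now reaches the hook */
	return (0);
}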
*/ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ extern long maxswzone; /* Max KVA for swap structures */ extern long maxbcache; /* Max KVA for buffer cache */ extern long runningbufspace; extern long hibufspace; extern int dirtybufthresh; extern int bdwriteskip; extern int dirtybufferflushes; extern int altbufferflushes; extern int nswbuf; /* Number of swap I/O buffer headers. */ extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ extern int vnode_async_pbuf_freecnt; /* Number of pbufs for vnode pager, asynchronous reads */ extern caddr_t unmapped_buf; /* Data address for unmapped buffers. */ static inline int buf_mapped(struct buf *bp) { return (bp->b_data != unmapped_buf); } void runningbufwakeup(struct buf *); void waitrunningbufspace(void); caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); void bufinit(void); void bufshutdown(int); void bdata2bio(struct buf *bp, struct bio *bip); void bwillwrite(void); int buf_dirty_count_severe(void); void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. */ #define bread(vp, blkno, size, cred, bpp) \ breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, 0, bpp) #define bread_gb(vp, blkno, size, cred, gbflags, bpp) \ breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, \ gbflags, bpp) #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \ breadn_flags(vp, blkno, size, rablkno, rabsize, cnt, cred, 0, bpp) int breadn_flags(struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, int, struct buf **); void breada(struct vnode *, daddr_t *, int *, int, struct ucred *); void bdwrite(struct buf *); void bawrite(struct buf *); void babarrierwrite(struct buf *); int bbarrierwrite(struct buf *); void bdirty(struct buf *); void bundirty(struct buf *); void bufstrategy(struct bufobj *, struct buf *); void brelse(struct buf *); void bqrelse(struct buf *); int vfs_bio_awrite(struct buf *); void vfs_drain_busy_pages(struct buf *bp); struct buf * getpbuf(int *); struct buf *incore(struct bufobj *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); struct buf *geteblk(int, int); int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); void bufdone_finish(struct buf *); void bd_speedup(void); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, int, struct buf **); int cluster_wbuild(struct vnode *, long, daddr_t, int, int); void cluster_write(struct vnode *, struct buf *, u_quad_t, int, int); void vfs_bio_bzero_buf(struct buf *bp, int base, int size); void vfs_bio_set_valid(struct buf *, int base, int size); void vfs_bio_clrbuf(struct buf *); void vfs_busy_pages(struct buf *, int clear_modify); void vfs_unbusy_pages(struct buf *); int vmapbuf(struct buf *, int); void vunmapbuf(struct buf *); void relpbuf(struct buf *, int *); void brelvp(struct buf *); void bgetvp(struct vnode *, struct buf *); void pbgetbo(struct bufobj *bo, struct buf *bp); void pbgetvp(struct vnode *, struct buf *); void pbrelbo(struct buf *); void pbrelvp(struct buf *); int allocbuf(struct buf *bp, int size); void reassignbuf(struct buf *); struct buf *trypbuf(int *); void bwait(struct buf *, u_char, const char *); void bdone(struct buf *); void bpin(struct buf *); void bunpin(struct buf *); void bunpin_wait(struct buf *); #endif /* _KERNEL */ #endif /* !_SYS_BUF_H_ */
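[Editor's note] buf_mapped() above compares b_data against the unmapped_buf sentinel: one well-known address stands for "this buffer has no kernel virtual mapping", so a plain pointer comparison tells mapped from unmapped buffers. The following user-space sketch shows the same sentinel trick; all names below are invented for the example.

#include <stdio.h>

static char unmapped_sentinel[1];	/* cf. unmapped_buf */

struct mini_buf {
	char	*data;			/* cf. b_data */
};

static int
mini_mapped(const struct mini_buf *bp)
{

	return (bp->data != unmapped_sentinel);
}

int
main(void)
{
	char payload[16];
	struct mini_buf a = { .data = payload };
	struct mini_buf b = { .data = unmapped_sentinel };

	printf("a mapped: %d, b mapped: %d\n", mini_mapped(&a), mini_mapped(&b));
	return (0);
}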