Index: sys/geom/mirror/g_mirror.h =================================================================== --- sys/geom/mirror/g_mirror.h +++ sys/geom/mirror/g_mirror.h @@ -148,6 +148,10 @@ u_int d_genid; /* Disk's generation ID. */ struct g_mirror_disk_sync d_sync;/* Sync information. */ LIST_ENTRY(g_mirror_disk) d_next; + u_int d_init_ndisks; /* Initial number of mirror components */ + uint32_t d_init_slice; /* Initial slice size */ + uint8_t d_init_balance;/* Initial balance */ + uint64_t d_init_mediasize;/* Initial mediasize */ }; #define d_name d_consumer->provider->name Index: sys/geom/mirror/g_mirror.c =================================================================== --- sys/geom/mirror/g_mirror.c +++ sys/geom/mirror/g_mirror.c @@ -59,6 +59,11 @@ int g_mirror_debug = 0; SYSCTL_INT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0, "Debug level"); +bool g_launch_mirror_before_timeout = true; +SYSCTL_BOOL(_kern_geom_mirror, OID_AUTO, launch_mirror_before_timeout, + CTLFLAG_RWTUN, &g_launch_mirror_before_timeout, 0, + "If false, force gmirror to wait out the full kern.geom.mirror.timeout " + "before launching mirrors"); static u_int g_mirror_timeout = 4; SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout, 0, "Time to wait on all mirror components"); @@ -110,6 +115,8 @@ static void g_mirror_update_device(struct g_mirror_softc *sc, bool force); static void g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); +static int g_mirror_refresh_device(struct g_mirror_softc *sc, + const struct g_provider *pp, const struct g_mirror_metadata *md); static void g_mirror_sync_reinit(const struct g_mirror_disk *disk, struct bio *bp, off_t offset); static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type); @@ -472,6 +479,10 @@ disk->d_sync.ds_update_ts = time_uptime; disk->d_genid = md->md_genid; disk->d_sync.ds_syncid = md->md_syncid; + disk->d_init_ndisks = md->md_all; + disk->d_init_slice = md->md_slice; + disk->d_init_balance = md->md_balance; + disk->d_init_mediasize = md->md_mediasize; if (errorp != NULL) *errorp = 0; return (disk); @@ -2362,17 +2373,74 @@ case G_MIRROR_DEVICE_STATE_STARTING: { struct g_mirror_disk *pdisk, *tdisk; - u_int dirty, ndisks, genid, syncid; - bool broken; + const char *mismatch; + uintmax_t found, newest; + u_int dirty, ndisks; + + /* Pre-flight checks */ + LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { + /* + * Confirm we already detected the newest genid. + */ + KASSERT(sc->sc_genid >= disk->d_genid, + ("%s: found newer genid %u (sc:%p had %u).", __func__, + disk->d_genid, sc, sc->sc_genid)); + + /* Kick out any previously tasted stale components. */ + if (disk->d_genid < sc->sc_genid) { + G_MIRROR_DEBUG(0, "Stale 'genid' field on %s " + "(device %s) (component=%u latest=%u), skipping.", + g_mirror_get_diskname(disk), sc->sc_name, + disk->d_genid, sc->sc_genid); + g_mirror_destroy_disk(disk); + sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; + continue; + } + + /* + * Confirm we already detected the newest syncid. + */ + KASSERT(sc->sc_syncid >= disk->d_sync.ds_syncid, + ("%s: found newer syncid %u (sc:%p had %u).", + __func__, disk->d_sync.ds_syncid, sc, + sc->sc_syncid)); + +#define DETECT_MISMATCH(field, name) \ + if (mismatch == NULL && \ + disk->d_init_ ## field != sc->sc_ ## field) { \ + mismatch = name; \ + found = (intmax_t)disk->d_init_ ## field; \ + newest = (intmax_t)sc->sc_ ## field; \ + } + mismatch = NULL; + DETECT_MISMATCH(ndisks, "md_all"); + DETECT_MISMATCH(balance, "md_balance"); + DETECT_MISMATCH(slice, "md_slice"); + DETECT_MISMATCH(mediasize, "md_mediasize"); +#undef DETECT_MISMATCH + if (mismatch != NULL) { + G_MIRROR_DEBUG(0, "Found a mismatching '%s' " + "field on %s (device %s) (found=%ju " + "newest=%ju).", mismatch, + g_mirror_get_diskname(disk), sc->sc_name, + found, newest); + g_mirror_destroy_disk(disk); + sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; + continue; + } + } KASSERT(sc->sc_provider == NULL, ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); /* - * Are we ready? We are, if all disks are connected or - * if we have any disks and 'force' is true. + * Are we ready? If the timeout (force is true) has expired, and + * any disks are present, then yes. If we're permitted to launch + * before the timeout has expired and the expected number of + * current-generation mirror disks have been tasted, then yes. */ ndisks = g_mirror_ndisks(sc, -1); - if (sc->sc_ndisks == ndisks || (force && ndisks > 0)) { + if ((force && ndisks > 0) || + (g_launch_mirror_before_timeout && ndisks == sc->sc_ndisks)) { ; } else if (ndisks == 0) { /* @@ -2419,42 +2487,6 @@ callout_drain(&sc->sc_callout); } - /* - * Find the biggest genid. - */ - genid = 0; - LIST_FOREACH(disk, &sc->sc_disks, d_next) { - if (disk->d_genid > genid) - genid = disk->d_genid; - } - sc->sc_genid = genid; - /* - * Remove all disks without the biggest genid. - */ - broken = false; - LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { - if (disk->d_genid < genid) { - G_MIRROR_DEBUG(0, - "Component %s (device %s) broken, skipping.", - g_mirror_get_diskname(disk), sc->sc_name); - g_mirror_destroy_disk(disk); - /* - * Bump the syncid in case we discover a healthy - * replacement disk after starting the mirror. - */ - broken = true; - } - } - - /* - * Find the biggest syncid. - */ - syncid = 0; - LIST_FOREACH(disk, &sc->sc_disks, d_next) { - if (disk->d_sync.ds_syncid > syncid) - syncid = disk->d_sync.ds_syncid; - } - /* * Here we need to look for dirty disks and if all disks * with the biggest syncid are dirty, we have to choose @@ -2468,7 +2500,7 @@ dirty = ndisks = 0; pdisk = NULL; LIST_FOREACH(disk, &sc->sc_disks, d_next) { - if (disk->d_sync.ds_syncid != syncid) + if (disk->d_sync.ds_syncid != sc->sc_syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { @@ -2495,7 +2527,7 @@ "master disk for synchronization.", g_mirror_get_diskname(pdisk), sc->sc_name); LIST_FOREACH(disk, &sc->sc_disks, d_next) { - if (disk->d_sync.ds_syncid != syncid) + if (disk->d_sync.ds_syncid != sc->sc_syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { @@ -2516,7 +2548,7 @@ * We have some non-dirty disks. */ LIST_FOREACH(disk, &sc->sc_disks, d_next) { - if (disk->d_sync.ds_syncid != syncid) + if (disk->d_sync.ds_syncid != sc->sc_syncid) continue; if ((disk->d_flags & G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) { @@ -2532,8 +2564,7 @@ /* Reset hint. */ sc->sc_hint = NULL; - sc->sc_syncid = syncid; - if (force || broken) { + if (force) { /* Remember to bump syncid on first write. */ sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID; } @@ -2870,37 +2901,23 @@ struct g_mirror_metadata *md) { + G_MIRROR_DEBUG(2, "%s: md_did 0x%u disk %s device %s md_all 0x%x " + "sc_ndisks 0x%x md_slice 0x%x sc_slice 0x%x md_balance 0x%x " + "sc_balance 0x%x sc_mediasize 0x%lx pp_mediasize 0x%lx " + "md_sectorsize 0x%x sc_sectorsize 0x%x md_mflags 0x%lx " + "md_dflags 0x%lx md_syncid 0x%x md_genid 0x%x md_priority 0x%x " + "sc_state 0x%x.", + __func__, md->md_did, pp->name, sc->sc_name, md->md_all, + sc->sc_ndisks, md->md_slice, sc->sc_slice, md->md_balance, + sc->sc_balance, sc->sc_mediasize, pp->mediasize, md->md_sectorsize, + sc->sc_sectorsize, md->md_mflags, md->md_dflags, md->md_syncid, + md->md_genid, md->md_priority, sc->sc_state); + if (g_mirror_id2disk(sc, md->md_did) != NULL) { G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.", pp->name, md->md_did); return (EEXIST); } - if (md->md_all != sc->sc_ndisks) { - G_MIRROR_DEBUG(1, - "Invalid '%s' field on disk %s (device %s), skipping.", - "md_all", pp->name, sc->sc_name); - return (EINVAL); - } - if (md->md_slice != sc->sc_slice) { - G_MIRROR_DEBUG(1, - "Invalid '%s' field on disk %s (device %s), skipping.", - "md_slice", pp->name, sc->sc_name); - return (EINVAL); - } - if (md->md_balance != sc->sc_balance) { - G_MIRROR_DEBUG(1, - "Invalid '%s' field on disk %s (device %s), skipping.", - "md_balance", pp->name, sc->sc_name); - return (EINVAL); - } -#if 0 - if (md->md_mediasize != sc->sc_mediasize) { - G_MIRROR_DEBUG(1, - "Invalid '%s' field on disk %s (device %s), skipping.", - "md_mediasize", pp->name, sc->sc_name); - return (EINVAL); - } -#endif if (sc->sc_mediasize > pp->mediasize) { G_MIRROR_DEBUG(1, "Invalid size of disk %s (device %s), skipping.", pp->name, @@ -2947,12 +2964,21 @@ error = g_mirror_check_metadata(sc, pp, md); if (error != 0) return (error); - if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING && - md->md_genid < sc->sc_genid) { + + if (md->md_genid < sc->sc_genid) { G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.", pp->name, sc->sc_name); return (EINVAL); } + + /* + * If the component disk we're tasting has newer metadata than the + * STARTING gmirror device, refresh the device from the component. + */ + error = g_mirror_refresh_device(sc, pp, md); + if (error != 0) + return (error); + disk = g_mirror_init_disk(sc, pp, md, &error); if (disk == NULL) return (error); @@ -3029,6 +3055,21 @@ return (error); } +static void +g_mirror_reinit_from_metadata(struct g_mirror_softc *sc, + const struct g_mirror_metadata *md) +{ + + sc->sc_genid = md->md_genid; + sc->sc_syncid = md->md_syncid; + + sc->sc_slice = md->md_slice; + sc->sc_balance = md->md_balance; + sc->sc_mediasize = md->md_mediasize; + sc->sc_ndisks = md->md_all; + sc->sc_flags = md->md_mflags; +} + struct g_geom * g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md, u_int type) @@ -3056,12 +3097,8 @@ sc->sc_type = type; sc->sc_id = md->md_mid; - sc->sc_slice = md->md_slice; - sc->sc_balance = md->md_balance; - sc->sc_mediasize = md->md_mediasize; + g_mirror_reinit_from_metadata(sc, md); sc->sc_sectorsize = md->md_sectorsize; - sc->sc_ndisks = md->md_all; - sc->sc_flags = md->md_mflags; sc->sc_bump_id = 0; sc->sc_idle = 1; sc->sc_last_write = time_uptime; @@ -3484,5 +3521,51 @@ EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync); } +/* + * Refresh the mirror device's metadata when gmirror encounters a newer + * generation as the individual components are being added to the mirror set. + */ +static int +g_mirror_refresh_device(struct g_mirror_softc *sc, const struct g_provider *pp, + const struct g_mirror_metadata *md) +{ + + g_topology_assert_not(); + sx_assert(&sc->sc_lock, SX_XLOCKED); + + KASSERT(sc->sc_genid <= md->md_genid, + ("%s: attempted to refresh from stale component %s (device %s) " + "(%u < %u).", __func__, pp->name, sc->sc_name, md->md_genid, + sc->sc_genid)); + + if (sc->sc_genid > md->md_genid || (sc->sc_genid == md->md_genid && + sc->sc_syncid >= md->md_syncid)) + return (0); + + G_MIRROR_DEBUG(0, "Found newer version for device %s (genid: curr=%u " + "new=%u; syncid: curr=%u new=%u; ndisks: curr=%u new=%u; " + "provider=%s).", sc->sc_name, sc->sc_genid, md->md_genid, + sc->sc_syncid, md->md_syncid, sc->sc_ndisks, md->md_all, pp->name); + + if (sc->sc_state != G_MIRROR_DEVICE_STATE_STARTING) { + /* Probable data corruption detected */ + G_MIRROR_DEBUG(0, "Cannot refresh metadata in %s state " + "(device=%s genid=%u). A stale mirror device was launched.", + g_mirror_device_state2str(sc->sc_state), sc->sc_name, + sc->sc_genid); + return (EINVAL); + } + + /* Update softc */ + g_mirror_reinit_from_metadata(sc, md); + + G_MIRROR_DEBUG(1, "Refresh device %s (id=%u, state=%s) from disk %s " + "(genid=%u syncid=%u md_all=%u).", sc->sc_name, md->md_mid, + g_mirror_device_state2str(sc->sc_state), pp->name, md->md_genid, + md->md_syncid, (unsigned)md->md_all); + + return (0); +} + DECLARE_GEOM_CLASS(g_mirror_class, g_mirror); MODULE_VERSION(geom_mirror, 0); Index: tests/sys/geom/class/mirror/Makefile =================================================================== --- tests/sys/geom/class/mirror/Makefile +++ tests/sys/geom/class/mirror/Makefile @@ -18,6 +18,7 @@ TAP_TESTS_SH+= 12_test TAP_TESTS_SH+= 13_test +ATF_TESTS_SH+= component_selection ATF_TESTS_SH+= sync_error ${PACKAGE}FILES+= conf.sh Index: tests/sys/geom/class/mirror/component_selection.sh =================================================================== --- /dev/null +++ tests/sys/geom/class/mirror/component_selection.sh @@ -0,0 +1,116 @@ +# $FreeBSD$ + +atf_test_case run_latest_genid cleanup +run_latest_genid_head() +{ + atf_set "descr" \ + "Ensure that we properly select components (latest genid) during STARTING." + atf_set "require.user" "root" +} +run_latest_genid_body() +{ + . $(atf_get_srcdir)/conf.sh + + f1=$(mktemp ${base}.XXXXXX) + f2=$(mktemp ${base}.XXXXXX) + f3=$(mktemp ${base}.XXXXXX) + rnd1=$(mktemp ${base}.XXXXXX) + rnd2=$(mktemp ${base}.XXXXXX) + + atf_check truncate -s 2M $f1 + atf_check truncate -s 2M $f2 + atf_check truncate -s 2M $f3 + dd if=/dev/urandom bs=512 count=1 of="$rnd1" + dd if=/dev/urandom bs=512 count=1 of="$rnd2" + + md1=$(attach_md -t vnode -f ${f1}) + md2=$(attach_md -t vnode -f ${f2}) + md3=$(attach_md -t vnode -f ${f3}) + + # Use offset 512 so the underlying md device isn't tasted + # instead of the nop device. Use a gnop for md1 so it has + # matching size with the two manipulated components. + atf_check gnop create -o 512 $md1 + atf_check gnop create -o 512 $md2 + atf_check gnop create -o 512 $md3 + atf_check gmirror label $name ${md1}.nop + devwait + + atf_check gmirror insert $name ${md2}.nop + atf_check gmirror insert $name ${md3}.nop + syncwait + + sleep 0.5 + + # Fail mirror 3, writing known contents to mirror 1+2 block 1 + atf_check -s exit:0 -e empty -o empty \ + gnop configure -w 100 ${md3}.nop + atf_check -s exit:0 dd if="$rnd1" bs=512 count=1 oseek=1 conv=notrunc \ + of=/dev/mirror/$name status=none + + sleep 0.2 + + # Should have two mirrors remaining after md3 was evicted + atf_check [ $(gmirror status -s $name | wc -l) -eq 2 ] + atf_check -s exit:0 -o match:"DEGRADED ${md1}.nop \(ACTIVE\)" \ + gmirror status -s $name + atf_check -s exit:0 -o match:"DEGRADED ${md2}.nop \(ACTIVE\)" \ + gmirror status -s $name + + sleep 0.5 + + # Repeat: + # Fail mirror 2, writing known contents to mirror 1 block 2 + atf_check -s exit:0 -e empty -o empty \ + gnop configure -w 100 ${md2}.nop + atf_check -s exit:0 dd if="$rnd2" bs=512 count=2 oseek=1 conv=notrunc \ + of=/dev/mirror/$name status=none + + sleep 0.2 + + # Should have one mirror remaining after md2 was evicted + atf_check [ $(gmirror status -s $name | wc -l) -eq 1 ] + atf_check -s exit:0 -o match:"DEGRADED ${md1}.nop \(ACTIVE\)" \ + gmirror status -s $name + + # Stop the mirror and remove the pieces so gmirror can't see them. + atf_check gmirror stop $name + atf_check gnop destroy ${md1}.nop + atf_check gnop destroy ${md2}.nop + atf_check gnop destroy ${md3}.nop + + # Rebuild; spin up "disk" with lowest genid + atf_check gnop create -o 512 $md3 + + # Adding newer genid should kick out old component + atf_check gnop create -o 512 $md2 + sleep 0.1 + # Can't test this because 'status' doesn't exist until RUNNING: + #atf_check [ $(gmirror status -s $name | wc -l) -eq 1 ] + + # ditto + atf_check gnop create -o 512 $md1 + sleep 0.1 + sleep $(sysctl -n kern.geom.mirror.timeout) + + atf_check [ $(gmirror status -s $name | wc -l) -eq 1 ] + atf_check -s exit:0 -o match:"DEGRADED ${md1}.nop \(ACTIVE\)" \ + gmirror status -s $name +} +run_latest_genid_cleanup() +{ + . $(atf_get_srcdir)/conf.sh + + if [ -f "$TEST_MDS_FILE" ]; then + while read test_md; do + echo "# Removing test gnop: ${test_md}.nop" + gnop destroy -f "${test_md}.nop" 2>/dev/null || : + done < "$TEST_MDS_FILE" + fi + gmirror_test_cleanup +} + +atf_init_test_cases() +{ + atf_add_test_case run_latest_genid +}