Index: stable/8/sys/geom/raid/g_raid.c =================================================================== --- stable/8/sys/geom/raid/g_raid.c (revision 243678) +++ stable/8/sys/geom/raid/g_raid.c (revision 243679) @@ -1,2501 +1,2556 @@ /*- * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_raid_md_if.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff"); int g_raid_enable = 1; TUNABLE_INT("kern.geom.raid.enable", &g_raid_enable); SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RW, &g_raid_enable, 0, "Enable on-disk metadata taste"); u_int g_raid_aggressive_spare = 0; TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW, &g_raid_aggressive_spare, 0, "Use disks without metadata as spare"); u_int g_raid_debug = 0; TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0, "Debug level"); int g_raid_read_err_thresh = 10; TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW, &g_raid_read_err_thresh, 0, "Number of read errors equated to disk failure"); u_int g_raid_start_timeout = 30; TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW, &g_raid_start_timeout, 0, "Time to wait for all array components"); static u_int g_raid_clean_time = 5; TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW, &g_raid_clean_time, 0, "Mark volume as clean when idling"); static u_int g_raid_disconnect_on_failure = 1; TUNABLE_INT("kern.geom.raid.disconnect_on_failure", &g_raid_disconnect_on_failure); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RW, &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static 
u_int g_raid_name_format = 0; TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW, &g_raid_name_format, 0, "Providers name format."); static u_int g_raid_idle_threshold = 1000000; TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold); SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW, &g_raid_idle_threshold, 1000000, "Time in microseconds to consider a volume idle."); #define MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do { \ G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ rv = msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) LIST_HEAD(, g_raid_md_class) g_raid_md_classes = LIST_HEAD_INITIALIZER(g_raid_md_classes); LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes = LIST_HEAD_INITIALIZER(g_raid_tr_classes); LIST_HEAD(, g_raid_volume) g_raid_volumes = LIST_HEAD_INITIALIZER(g_raid_volumes); static eventhandler_tag g_raid_post_sync = NULL; static int g_raid_started = 0; static int g_raid_shutdown = 0; static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid_taste; static void g_raid_init(struct g_class *mp); static void g_raid_fini(struct g_class *mp); struct g_class g_raid_class = { .name = G_RAID_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid_ctl, .taste = g_raid_taste, .destroy_geom = g_raid_destroy_geom, .init = g_raid_init, .fini = g_raid_fini }; static void g_raid_destroy_provider(struct g_raid_volume *vol); static int g_raid_update_disk(struct g_raid_disk *disk, u_int event); static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event); static int g_raid_update_volume(struct g_raid_volume *vol, u_int event); static int g_raid_update_node(struct g_raid_softc *sc, u_int event); static void g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct 
g_provider *pp); static void g_raid_start(struct bio *bp); static void g_raid_start_request(struct bio *bp); static void g_raid_disk_done(struct bio *bp); static void g_raid_poll(struct g_raid_softc *sc); static const char * g_raid_node_event2str(int event) { switch (event) { case G_RAID_NODE_E_WAKE: return ("WAKE"); case G_RAID_NODE_E_START: return ("START"); default: return ("INVALID"); } } const char * g_raid_disk_state2str(int state) { switch (state) { case G_RAID_DISK_S_NONE: return ("NONE"); case G_RAID_DISK_S_OFFLINE: return ("OFFLINE"); case G_RAID_DISK_S_FAILED: return ("FAILED"); case G_RAID_DISK_S_STALE_FAILED: return ("STALE_FAILED"); case G_RAID_DISK_S_SPARE: return ("SPARE"); case G_RAID_DISK_S_STALE: return ("STALE"); case G_RAID_DISK_S_ACTIVE: return ("ACTIVE"); default: return ("INVALID"); } } static const char * g_raid_disk_event2str(int event) { switch (event) { case G_RAID_DISK_E_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } const char * g_raid_subdisk_state2str(int state) { switch (state) { case G_RAID_SUBDISK_S_NONE: return ("NONE"); case G_RAID_SUBDISK_S_FAILED: return ("FAILED"); case G_RAID_SUBDISK_S_NEW: return ("NEW"); case G_RAID_SUBDISK_S_REBUILD: return ("REBUILD"); case G_RAID_SUBDISK_S_UNINITIALIZED: return ("UNINITIALIZED"); case G_RAID_SUBDISK_S_STALE: return ("STALE"); case G_RAID_SUBDISK_S_RESYNC: return ("RESYNC"); case G_RAID_SUBDISK_S_ACTIVE: return ("ACTIVE"); default: return ("INVALID"); } } static const char * g_raid_subdisk_event2str(int event) { switch (event) { case G_RAID_SUBDISK_E_NEW: return ("NEW"); case G_RAID_SUBDISK_E_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } const char * g_raid_volume_state2str(int state) { switch (state) { case G_RAID_VOLUME_S_STARTING: return ("STARTING"); case G_RAID_VOLUME_S_BROKEN: return ("BROKEN"); case G_RAID_VOLUME_S_DEGRADED: return ("DEGRADED"); case G_RAID_VOLUME_S_SUBOPTIMAL: return ("SUBOPTIMAL"); case G_RAID_VOLUME_S_OPTIMAL: 
return ("OPTIMAL"); case G_RAID_VOLUME_S_UNSUPPORTED: return ("UNSUPPORTED"); case G_RAID_VOLUME_S_STOPPED: return ("STOPPED"); default: return ("INVALID"); } } static const char * g_raid_volume_event2str(int event) { switch (event) { case G_RAID_VOLUME_E_UP: return ("UP"); case G_RAID_VOLUME_E_DOWN: return ("DOWN"); case G_RAID_VOLUME_E_START: return ("START"); case G_RAID_VOLUME_E_STARTMD: return ("STARTMD"); default: return ("INVALID"); } } const char * g_raid_volume_level2str(int level, int qual) { switch (level) { case G_RAID_VOLUME_RL_RAID0: return ("RAID0"); case G_RAID_VOLUME_RL_RAID1: return ("RAID1"); case G_RAID_VOLUME_RL_RAID3: if (qual == G_RAID_VOLUME_RLQ_R3P0) return ("RAID3-P0"); if (qual == G_RAID_VOLUME_RLQ_R3PN) return ("RAID3-PN"); return ("RAID3"); case G_RAID_VOLUME_RL_RAID4: if (qual == G_RAID_VOLUME_RLQ_R4P0) return ("RAID4-P0"); if (qual == G_RAID_VOLUME_RLQ_R4PN) return ("RAID4-PN"); return ("RAID4"); case G_RAID_VOLUME_RL_RAID5: if (qual == G_RAID_VOLUME_RLQ_R5RA) return ("RAID5-RA"); if (qual == G_RAID_VOLUME_RLQ_R5RS) return ("RAID5-RS"); if (qual == G_RAID_VOLUME_RLQ_R5LA) return ("RAID5-LA"); if (qual == G_RAID_VOLUME_RLQ_R5LS) return ("RAID5-LS"); return ("RAID5"); case G_RAID_VOLUME_RL_RAID6: if (qual == G_RAID_VOLUME_RLQ_R6RA) return ("RAID6-RA"); if (qual == G_RAID_VOLUME_RLQ_R6RS) return ("RAID6-RS"); if (qual == G_RAID_VOLUME_RLQ_R6LA) return ("RAID6-LA"); if (qual == G_RAID_VOLUME_RLQ_R6LS) return ("RAID6-LS"); return ("RAID6"); case G_RAID_VOLUME_RL_RAIDMDF: if (qual == G_RAID_VOLUME_RLQ_RMDFRA) return ("RAIDMDF-RA"); if (qual == G_RAID_VOLUME_RLQ_RMDFRS) return ("RAIDMDF-RS"); if (qual == G_RAID_VOLUME_RLQ_RMDFLA) return ("RAIDMDF-LA"); if (qual == G_RAID_VOLUME_RLQ_RMDFLS) return ("RAIDMDF-LS"); return ("RAIDMDF"); case G_RAID_VOLUME_RL_RAID1E: if (qual == G_RAID_VOLUME_RLQ_R1EA) return ("RAID1E-A"); if (qual == G_RAID_VOLUME_RLQ_R1EO) return ("RAID1E-O"); return ("RAID1E"); case G_RAID_VOLUME_RL_SINGLE: return ("SINGLE"); 
case G_RAID_VOLUME_RL_CONCAT: return ("CONCAT"); case G_RAID_VOLUME_RL_RAID5E: if (qual == G_RAID_VOLUME_RLQ_R5ERA) return ("RAID5E-RA"); if (qual == G_RAID_VOLUME_RLQ_R5ERS) return ("RAID5E-RS"); if (qual == G_RAID_VOLUME_RLQ_R5ELA) return ("RAID5E-LA"); if (qual == G_RAID_VOLUME_RLQ_R5ELS) return ("RAID5E-LS"); return ("RAID5E"); case G_RAID_VOLUME_RL_RAID5EE: if (qual == G_RAID_VOLUME_RLQ_R5EERA) return ("RAID5EE-RA"); if (qual == G_RAID_VOLUME_RLQ_R5EERS) return ("RAID5EE-RS"); if (qual == G_RAID_VOLUME_RLQ_R5EELA) return ("RAID5EE-LA"); if (qual == G_RAID_VOLUME_RLQ_R5EELS) return ("RAID5EE-LS"); return ("RAID5EE"); case G_RAID_VOLUME_RL_RAID5R: if (qual == G_RAID_VOLUME_RLQ_R5RRA) return ("RAID5R-RA"); if (qual == G_RAID_VOLUME_RLQ_R5RRS) return ("RAID5R-RS"); if (qual == G_RAID_VOLUME_RLQ_R5RLA) return ("RAID5R-LA"); if (qual == G_RAID_VOLUME_RLQ_R5RLS) return ("RAID5R-LS"); return ("RAID5E"); default: return ("UNKNOWN"); } } int g_raid_volume_str2level(const char *str, int *level, int *qual) { *level = G_RAID_VOLUME_RL_UNKNOWN; *qual = G_RAID_VOLUME_RLQ_NONE; if (strcasecmp(str, "RAID0") == 0) *level = G_RAID_VOLUME_RL_RAID0; else if (strcasecmp(str, "RAID1") == 0) *level = G_RAID_VOLUME_RL_RAID1; else if (strcasecmp(str, "RAID3-P0") == 0) { *level = G_RAID_VOLUME_RL_RAID3; *qual = G_RAID_VOLUME_RLQ_R3P0; } else if (strcasecmp(str, "RAID3-PN") == 0 || strcasecmp(str, "RAID3") == 0) { *level = G_RAID_VOLUME_RL_RAID3; *qual = G_RAID_VOLUME_RLQ_R3PN; } else if (strcasecmp(str, "RAID4-P0") == 0) { *level = G_RAID_VOLUME_RL_RAID4; *qual = G_RAID_VOLUME_RLQ_R4P0; } else if (strcasecmp(str, "RAID4-PN") == 0 || strcasecmp(str, "RAID4") == 0) { *level = G_RAID_VOLUME_RL_RAID4; *qual = G_RAID_VOLUME_RLQ_R4PN; } else if (strcasecmp(str, "RAID5-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5RA; } else if (strcasecmp(str, "RAID5-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5RS; } else if (strcasecmp(str, "RAID5") == 
0 || strcasecmp(str, "RAID5-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5LA; } else if (strcasecmp(str, "RAID5-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5LS; } else if (strcasecmp(str, "RAID6-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6RA; } else if (strcasecmp(str, "RAID6-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6RS; } else if (strcasecmp(str, "RAID6") == 0 || strcasecmp(str, "RAID6-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6LA; } else if (strcasecmp(str, "RAID6-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6LS; } else if (strcasecmp(str, "RAIDMDF-RA") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFRA; } else if (strcasecmp(str, "RAIDMDF-RS") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFRS; } else if (strcasecmp(str, "RAIDMDF") == 0 || strcasecmp(str, "RAIDMDF-LA") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFLA; } else if (strcasecmp(str, "RAIDMDF-LS") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFLS; } else if (strcasecmp(str, "RAID10") == 0 || strcasecmp(str, "RAID1E") == 0 || strcasecmp(str, "RAID1E-A") == 0) { *level = G_RAID_VOLUME_RL_RAID1E; *qual = G_RAID_VOLUME_RLQ_R1EA; } else if (strcasecmp(str, "RAID1E-O") == 0) { *level = G_RAID_VOLUME_RL_RAID1E; *qual = G_RAID_VOLUME_RLQ_R1EO; } else if (strcasecmp(str, "SINGLE") == 0) *level = G_RAID_VOLUME_RL_SINGLE; else if (strcasecmp(str, "CONCAT") == 0) *level = G_RAID_VOLUME_RL_CONCAT; else if (strcasecmp(str, "RAID5E-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ERA; } else if (strcasecmp(str, "RAID5E-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ERS; } else if (strcasecmp(str, "RAID5E") == 0 || strcasecmp(str, "RAID5E-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; 
*qual = G_RAID_VOLUME_RLQ_R5ELA; } else if (strcasecmp(str, "RAID5E-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ELS; } else if (strcasecmp(str, "RAID5EE-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EERA; } else if (strcasecmp(str, "RAID5EE-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EERS; } else if (strcasecmp(str, "RAID5EE") == 0 || strcasecmp(str, "RAID5EE-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EELA; } else if (strcasecmp(str, "RAID5EE-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EELS; } else if (strcasecmp(str, "RAID5R-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RRA; } else if (strcasecmp(str, "RAID5R-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RRS; } else if (strcasecmp(str, "RAID5R") == 0 || strcasecmp(str, "RAID5R-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RLA; } else if (strcasecmp(str, "RAID5R-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RLS; } else return (-1); return (0); } const char * g_raid_get_diskname(struct g_raid_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_consumer->provider->name); } void +g_raid_get_disk_info(struct g_raid_disk *disk) +{ + struct g_consumer *cp = disk->d_consumer; + int error, len; + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (error) + disk->d_kd.di.dumper = NULL; + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, disk->d_softc, + "Dumping not supported by %s: %d.", + cp->provider->name, error); + + /* Read BIO_DELETE support. 
*/ + error = g_getattr("GEOM::candelete", cp, &disk->d_candelete); + if (error) + disk->d_candelete = 0; + if (!disk->d_candelete) + G_RAID_DEBUG1(2, disk->d_softc, + "BIO_DELETE not supported by %s: %d.", + cp->provider->name, error); +} + +void g_raid_report_disk_state(struct g_raid_disk *disk) { struct g_raid_subdisk *sd; int len, state; uint32_t s; if (disk->d_consumer == NULL) return; if (disk->d_state == G_RAID_DISK_S_FAILED || disk->d_state == G_RAID_DISK_S_STALE_FAILED) { s = G_STATE_FAILED; } else { state = G_RAID_SUBDISK_S_ACTIVE; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { if (sd->sd_state < state) state = sd->sd_state; } if (state == G_RAID_SUBDISK_S_FAILED) s = G_STATE_FAILED; else if (state == G_RAID_SUBDISK_S_NEW || state == G_RAID_SUBDISK_S_REBUILD) s = G_STATE_REBUILD; else if (state == G_RAID_SUBDISK_S_STALE || state == G_RAID_SUBDISK_S_RESYNC) s = G_STATE_RESYNC; else s = G_STATE_ACTIVE; } len = sizeof(s); g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s); G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.", g_raid_get_diskname(disk), s); } void g_raid_change_disk_state(struct g_raid_disk *disk, int state) { G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.", g_raid_get_diskname(disk), g_raid_disk_state2str(disk->d_state), g_raid_disk_state2str(state)); disk->d_state = state; g_raid_report_disk_state(disk); } void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state) { G_RAID_DEBUG1(0, sd->sd_softc, "Subdisk %s:%d-%s state changed from %s to %s.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]", g_raid_subdisk_state2str(sd->sd_state), g_raid_subdisk_state2str(state)); sd->sd_state = state; if (sd->sd_disk) g_raid_report_disk_state(sd->sd_disk); } void g_raid_change_volume_state(struct g_raid_volume *vol, int state) { G_RAID_DEBUG1(0, vol->v_softc, "Volume %s state changed from %s to %s.", vol->v_name, g_raid_volume_state2str(vol->v_state), g_raid_volume_state2str(state)); vol->v_state = state; } /* * --- Events handling functions --- * Events in geom_raid are used to maintain subdisks and volumes status * from one thread to simplify locking. */ static void g_raid_event_free(struct g_raid_event *ep) { free(ep, M_RAID); } int g_raid_event_send(void *arg, int event, int flags) { struct g_raid_softc *sc; struct g_raid_event *ep; int error; if ((flags & G_RAID_EVENT_VOLUME) != 0) { sc = ((struct g_raid_volume *)arg)->v_softc; } else if ((flags & G_RAID_EVENT_DISK) != 0) { sc = ((struct g_raid_disk *)arg)->d_softc; } else if ((flags & G_RAID_EVENT_SUBDISK) != 0) { sc = ((struct g_raid_subdisk *)arg)->sd_softc; } else { sc = arg; } ep = malloc(sizeof(*ep), M_RAID, sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT); if (ep == NULL) return (ENOMEM); ep->e_tgt = arg; ep->e_event = event; ep->e_flags = flags; ep->e_error = 0; G_RAID_DEBUG1(4, sc, "Sending event %p. 
Waking up %p.", ep, sc); mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); if ((flags & G_RAID_EVENT_WAIT) == 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) { mtx_lock(&sc->sc_queue_mtx); MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_raid_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static void g_raid_event_cancel(struct g_raid_softc *sc, void *tgt) { struct g_raid_event *ep, *tmpep; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if (ep->e_tgt != tgt) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) g_raid_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_queue_mtx); } static int g_raid_event_check(struct g_raid_softc *sc, void *tgt) { struct g_raid_event *ep; int res = 0; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(ep, &sc->sc_events, e_next) { if (ep->e_tgt != tgt) continue; res = 1; break; } mtx_unlock(&sc->sc_queue_mtx); return (res); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_raid_ndisks(struct g_raid_softc *sc, int state) { struct g_raid_disk *disk; u_int n; sx_assert(&sc->sc_lock, SX_LOCKED); n = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == state || state == -1) n++; } return (n); } /* * Return the number of subdisks in given state. * If state is equal to -1, count all connected disks. 
*/ u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state) { struct g_raid_subdisk *subdisk; struct g_raid_softc *sc; u_int i, n ; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); n = 0; for (i = 0; i < vol->v_disks_count; i++) { subdisk = &vol->v_subdisks[i]; if ((state == -1 && subdisk->sd_state != G_RAID_SUBDISK_S_NONE) || subdisk->sd_state == state) n++; } return (n); } /* * Return the first subdisk in given state. * If state is equal to -1, then the first connected disks. */ struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol, int state) { struct g_raid_subdisk *sd; struct g_raid_softc *sc; u_int i; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if ((state == -1 && sd->sd_state != G_RAID_SUBDISK_S_NONE) || sd->sd_state == state) return (sd); } return (NULL); } struct g_consumer * g_raid_open_consumer(struct g_raid_softc *sc, const char *name) { struct g_consumer *cp; struct g_provider *pp; g_topology_assert(); if (strncmp(name, "/dev/", 5) == 0) name += 5; pp = g_provider_by_name(name); if (pp == NULL) return (NULL); cp = g_new_consumer(sc->sc_geom); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); return (NULL); } if (g_access(cp, 1, 1, 1) != 0) { g_detach(cp); g_destroy_consumer(cp); return (NULL); } return (cp); } static u_int g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } u_int g_raid_nopens(struct g_raid_softc *sc) { struct g_raid_volume *vol; u_int opens; opens = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_provider_open != 0) opens++; } return (opens); } static int g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_RAID_DEBUG1(2, sc, "I/O requests for %s exist, can't 
destroy it now.", cp->provider->name); return (1); } if (g_raid_nrequests(sc, cp) > 0) { G_RAID_DEBUG1(2, sc, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_raid_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert_not(); g_topology_lock(); cp->private = NULL; if (g_raid_consumer_is_busy(sc, cp)) goto out; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. 
*/ g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL); goto out; } G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); out: g_topology_unlock(); } static void g_raid_orphan(struct g_consumer *cp) { struct g_raid_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED, G_RAID_EVENT_DISK); } static void g_raid_clean(struct g_raid_volume *vol, int acw) { struct g_raid_softc *sc; int timeout; sc = vol->v_softc; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) // return; if (!vol->v_dirty) return; if (vol->v_writes > 0) return; if (acw > 0 || (acw == -1 && vol->v_provider != NULL && vol->v_provider->acw > 0)) { timeout = g_raid_clean_time - (time_uptime - vol->v_last_write); if (!g_raid_shutdown && timeout > 0) return; } vol->v_dirty = 0; G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.", vol->v_name); g_raid_write_metadata(sc, vol, NULL, NULL); } static void g_raid_dirty(struct g_raid_volume *vol) { struct g_raid_softc *sc; sc = vol->v_softc; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) // return; vol->v_dirty = 1; G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.", vol->v_name); g_raid_write_metadata(sc, vol, NULL, NULL); } void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; int i; vol = tr->tro_volume; sc = vol->v_softc; /* * Allocate all bios before sending any request, so we can return * ENOMEM in nice and clean way. 
*/ bioq_init(&queue); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) continue; cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_kerneldump_common_done(struct bio *bp) { bp->bio_flags |= BIO_DONE; } int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct bio bp; vol = tr->tro_volume; sc = vol->v_softc; bzero(&bp, sizeof(bp)); bp.bio_cmd = BIO_WRITE; bp.bio_done = g_raid_tr_kerneldump_common_done; bp.bio_attribute = NULL; bp.bio_offset = offset; bp.bio_length = length; bp.bio_data = virtual; bp.bio_to = vol->v_provider; g_raid_start(&bp); while (!(bp.bio_flags & BIO_DONE)) { G_RAID_DEBUG1(4, sc, "Poll..."); g_raid_poll(sc); DELAY(10); } return (bp.bio_error != 0 ? 
EIO : 0); } static int g_raid_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_volume *vol; int error; vol = (struct g_raid_volume *)arg; G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.", (long long unsigned)offset, (long long unsigned)length); error = G_RAID_TR_KERNELDUMP(vol->v_tr, virtual, physical, offset, length); return (error); } static void g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp) { struct g_kerneldump *gkd; struct g_provider *pp; struct g_raid_volume *vol; gkd = (struct g_kerneldump*)bp->bio_data; pp = bp->bio_to; vol = pp->private; g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)", pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); gkd->di.dumper = g_raid_dump; gkd->di.priv = vol; gkd->di.blocksize = vol->v_sectorsize; gkd->di.maxiosize = DFLTPHYS; gkd->di.mediaoffset = gkd->offset; if ((gkd->offset + gkd->length) > vol->v_mediasize) gkd->length = vol->v_mediasize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void +g_raid_candelete(struct g_raid_softc *sc, struct bio *bp) +{ + struct g_provider *pp; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + int *val; + int i; + + val = (int *)bp->bio_data; + pp = bp->bio_to; + vol = pp->private; + *val = 0; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_state == G_RAID_SUBDISK_S_NONE) + continue; + if (sd->sd_disk->d_candelete) { + *val = 1; + break; + } + } + g_io_deliver(bp, 0); +} + +static void g_raid_start(struct bio *bp) { struct g_raid_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_raid_start() should not be called at all. 
*/ // KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING, // ("Provider's error should be set (error=%d)(mirror=%s).", // bp->bio_to->error, bp->bio_to->name)); G_RAID_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: break; case BIO_GETATTR: - if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) + if (!strcmp(bp->bio_attribute, "GEOM::candelete")) + g_raid_candelete(sc, bp); + else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) g_raid_kerneldump(sc, bp); else g_io_deliver(bp, EOPNOTSUPP); return; default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_disksort(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (!dumping) { G_RAID_DEBUG1(4, sc, "Waking up %p.", sc); wakeup(sc); } } static int g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len) { /* * 5 cases: * (1) bp entirely below NO * (2) bp entirely above NO * (3) bp start below, but end in range YES * (4) bp entirely within YES * (5) bp starts within, ends above YES * * lock range 10-19 (offset 10 length 10) * (1) 1-5: first if kicks it out * (2) 30-35: second if kicks it out * (3) 5-15: passes both ifs * (4) 12-14: passes both ifs * (5) 19-20: passes both */ off_t lend = lstart + len - 1; off_t bstart = bp->bio_offset; off_t bend = bp->bio_offset + bp->bio_length - 1; if (bend < lstart) return (0); if (lend < bstart) return (0); return (1); } static int g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp) { struct g_raid_lock *lp; sx_assert(&vol->v_softc->sc_lock, SX_LOCKED); LIST_FOREACH(lp, &vol->v_locks, l_next) { if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length)) return (1); } return (0); } static void g_raid_start_request(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = bp->bio_to->geom->softc; sx_assert(&sc->sc_lock, SX_LOCKED); vol = bp->bio_to->private; /* * Check to see if this item is in a locked range. 
If so, * queue it to our locked queue and return. We'll requeue * it when the range is unlocked. Internal I/O for the * rebuild/rescan/recovery process is excluded from this * check so we can actually do the recovery. */ if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) && g_raid_is_in_locked_range(vol, bp)) { G_RAID_LOGREQ(3, bp, "Defer request."); bioq_insert_tail(&vol->v_locked, bp); return; } /* * If we're actually going to do the write/delete, then * update the idle stats for the volume. */ if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { if (!vol->v_dirty) g_raid_dirty(vol); vol->v_writes++; } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. Then tell * the transformation layer to start the I/O. */ bioq_insert_tail(&vol->v_inflight, bp); G_RAID_LOGREQ(4, bp, "Request started"); G_RAID_TR_IOSTART(vol->v_tr, bp); } static void g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp) { off_t off, len; struct bio *nbp; struct g_raid_lock *lp; vol->v_pending_lock = 0; LIST_FOREACH(lp, &vol->v_locks, l_next) { if (lp->l_pending) { off = lp->l_offset; len = lp->l_length; lp->l_pending = 0; TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) { if (g_raid_bio_overlaps(nbp, off, len)) lp->l_pending++; } if (lp->l_pending) { vol->v_pending_lock = 1; G_RAID_DEBUG1(4, vol->v_softc, "Deferred lock(%jd, %jd) has %d pending", (intmax_t)off, (intmax_t)(off + len), lp->l_pending); continue; } G_RAID_DEBUG1(4, vol->v_softc, "Deferred lock of %jd to %jd completed", (intmax_t)off, (intmax_t)(off + len)); G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); } } } void g_raid_iodone(struct bio *bp, int error) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = bp->bio_to->geom->softc; sx_assert(&sc->sc_lock, SX_LOCKED); vol = bp->bio_to->private; G_RAID_LOGREQ(3, bp, "Request done: %d.", error); /* Update stats if we done write/delete. 
*/ if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { vol->v_writes--; vol->v_last_write = time_uptime; } bioq_remove(&vol->v_inflight, bp); if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp)) g_raid_finish_with_locked_ranges(vol, bp); getmicrouptime(&vol->v_last_done); g_io_deliver(bp, error); } int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, struct bio *ignore, void *argp) { struct g_raid_softc *sc; struct g_raid_lock *lp; struct bio *bp; sc = vol->v_softc; lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO); LIST_INSERT_HEAD(&vol->v_locks, lp, l_next); lp->l_offset = off; lp->l_length = len; lp->l_callback_arg = argp; lp->l_pending = 0; TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) { if (bp != ignore && g_raid_bio_overlaps(bp, off, len)) lp->l_pending++; } /* * If there are any writes that are pending, we return EBUSY. All * callers will have to wait until all pending writes clear. */ if (lp->l_pending > 0) { vol->v_pending_lock = 1; G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend", (intmax_t)off, (intmax_t)(off+len), lp->l_pending); return (EBUSY); } G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd", (intmax_t)off, (intmax_t)(off+len)); G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); return (0); } int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len) { struct g_raid_lock *lp; struct g_raid_softc *sc; struct bio *bp; sc = vol->v_softc; LIST_FOREACH(lp, &vol->v_locks, l_next) { if (lp->l_offset == off && lp->l_length == len) { LIST_REMOVE(lp, l_next); /* XXX * Right now we just put them all back on the queue * and hope for the best. We hope this because any * locked ranges will go right back on this list * when the worker thread runs. 
* XXX */ G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd", (intmax_t)lp->l_offset, (intmax_t)(lp->l_offset+lp->l_length)); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&vol->v_locked)) != NULL) bioq_disksort(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); free(lp, M_RAID); return (0); } } return (EINVAL); } void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp) { struct g_consumer *cp; struct g_raid_disk *disk, *tdisk; bp->bio_caller1 = sd; /* * Make sure that the disk is present. Generally it is a task of * transformation layers to not send requests to absent disks, but * it is better to be safe and report situation then sorry. */ if (sd->sd_disk == NULL) { G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!"); nodisk: bp->bio_from = NULL; bp->bio_to = NULL; bp->bio_error = ENXIO; g_raid_disk_done(bp); return; } disk = sd->sd_disk; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_FAILED) { G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a " "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); goto nodisk; } cp = disk->d_consumer; bp->bio_from = cp; bp->bio_to = cp->provider; cp->index++; /* Update average disks load. 
*/ TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) { if (tdisk->d_consumer == NULL) tdisk->d_load = 0; else tdisk->d_load = (tdisk->d_consumer->index * G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8; } disk->d_last_offset = bp->bio_offset + bp->bio_length; if (dumping) { G_RAID_LOGREQ(3, bp, "Sending dumping request."); if (bp->bio_cmd == BIO_WRITE) { bp->bio_error = g_raid_subdisk_kerneldump(sd, bp->bio_data, 0, bp->bio_offset, bp->bio_length); } else bp->bio_error = EOPNOTSUPP; g_raid_disk_done(bp); } else { bp->bio_done = g_raid_disk_done; bp->bio_offset += sd->sd_offset; G_RAID_LOGREQ(3, bp, "Sending request."); g_io_request(bp, cp); } } int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, void *virtual, vm_offset_t physical, off_t offset, size_t length) { if (sd->sd_disk == NULL) return (ENXIO); if (sd->sd_disk->d_kd.di.dumper == NULL) return (EOPNOTSUPP); return (dump_write(&sd->sd_disk->d_kd.di, virtual, physical, sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset, length)); } static void g_raid_disk_done(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; sd = bp->bio_caller1; sc = sd->sd_softc; mtx_lock(&sc->sc_queue_mtx); bioq_disksort(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (!dumping) wakeup(sc); } static void g_raid_disk_done_request(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_disk *disk; struct g_raid_subdisk *sd; struct g_raid_volume *vol; g_topology_assert_not(); G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error); sd = bp->bio_caller1; sc = sd->sd_softc; vol = sd->sd_volume; if (bp->bio_from != NULL) { bp->bio_from->index--; disk = bp->bio_from->private; if (disk == NULL) g_raid_kill_consumer(sc, bp->bio_from); } bp->bio_offset -= sd->sd_offset; G_RAID_TR_IODONE(vol->v_tr, sd, bp); } static void g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep) { if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0) ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event); else 
if ((ep->e_flags & G_RAID_EVENT_DISK) != 0) ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event); else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0) ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event); else ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event); if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_raid_event_free(ep); } else { ep->e_flags |= G_RAID_EVENT_DONE; G_RAID_DEBUG1(4, sc, "Waking up %p.", ep); mtx_lock(&sc->sc_queue_mtx); wakeup(ep); mtx_unlock(&sc->sc_queue_mtx); } } /* * Worker thread. */ static void g_raid_worker(void *arg) { struct g_raid_softc *sc; struct g_raid_event *ep; struct g_raid_volume *vol; struct bio *bp; struct timeval now, t; int timeout, rv; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { mtx_lock(&sc->sc_queue_mtx); /* * First take a look at events. * This is important to handle events before any I/O requests. */ bp = NULL; vol = NULL; rv = 0; ep = TAILQ_FIRST(&sc->sc_events); if (ep != NULL) TAILQ_REMOVE(&sc->sc_events, ep, e_next); else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) ; else { getmicrouptime(&now); t = now; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (bioq_first(&vol->v_inflight) == NULL && vol->v_tr && timevalcmp(&vol->v_last_done, &t, < )) t = vol->v_last_done; } timevalsub(&t, &now); timeout = g_raid_idle_threshold + t.tv_sec * 1000000 + t.tv_usec; if (timeout > 0) { /* * Two steps to avoid overflows at HZ=1000 * and idle timeouts > 2.1s. Some rounding * errors can occur, but they are < 1tick, * which is deemed to be close enough for * this purpose. 
*/ int micpertic = 1000000 / hz; timeout = (timeout + micpertic - 1) / micpertic; sx_xunlock(&sc->sc_lock); MSLEEP(rv, sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "-", timeout); sx_xlock(&sc->sc_lock); goto process; } else rv = EWOULDBLOCK; } mtx_unlock(&sc->sc_queue_mtx); process: if (ep != NULL) { g_raid_handle_event(sc, ep); } else if (bp != NULL) { if (bp->bio_to != NULL && bp->bio_to->geom == sc->sc_geom) g_raid_start_request(bp); else g_raid_disk_done_request(bp); } else if (rv == EWOULDBLOCK) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { g_raid_clean(vol, -1); if (bioq_first(&vol->v_inflight) == NULL && vol->v_tr) { t.tv_sec = g_raid_idle_threshold / 1000000; t.tv_usec = g_raid_idle_threshold % 1000000; timevaladd(&t, &vol->v_last_done); getmicrouptime(&now); if (timevalcmp(&t, &now, <= )) { G_RAID_TR_IDLE(vol->v_tr); vol->v_last_done = now; } } } } if (sc->sc_stopping == G_RAID_DESTROY_HARD) g_raid_destroy_node(sc, 1); /* May not return. */ } } static void g_raid_poll(struct g_raid_softc *sc) { struct g_raid_event *ep; struct bio *bp; sx_xlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = TAILQ_FIRST(&sc->sc_events); if (ep != NULL) { TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_queue_mtx); g_raid_handle_event(sc, ep); goto out; } bp = bioq_takefirst(&sc->sc_queue); if (bp != NULL) { mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from == NULL || bp->bio_from->geom != sc->sc_geom) g_raid_start_request(bp); else g_raid_disk_done_request(bp); } out: sx_xunlock(&sc->sc_lock); } static void g_raid_launch_provider(struct g_raid_volume *vol) { struct g_raid_disk *disk; struct g_raid_softc *sc; struct g_provider *pp; char name[G_RAID_MAX_VOLUMENAME]; off_t off; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); /* Try to name provider with volume name. 
*/ snprintf(name, sizeof(name), "raid/%s", vol->v_name); if (g_raid_name_format == 0 || vol->v_name[0] == 0 || g_provider_by_name(name) != NULL) { /* Otherwise use sequential volume number. */ snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id); } pp = g_new_providerf(sc->sc_geom, "%s", name); pp->private = vol; pp->mediasize = vol->v_mediasize; pp->sectorsize = vol->v_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE || vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) { if ((disk = vol->v_subdisks[0].sd_disk) != NULL && disk->d_consumer != NULL && disk->d_consumer->provider != NULL) { pp->stripesize = disk->d_consumer->provider->stripesize; off = disk->d_consumer->provider->stripeoffset; pp->stripeoffset = off + vol->v_subdisks[0].sd_offset; if (off > 0) pp->stripeoffset %= off; } if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) { pp->stripesize *= (vol->v_disks_count - 1); pp->stripeoffset *= (vol->v_disks_count - 1); } } else pp->stripesize = vol->v_strip_size; vol->v_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.", pp->name, vol->v_name); } static void g_raid_destroy_provider(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_provider *pp; struct bio *bp, *tmp; g_topology_assert_not(); sc = vol->v_softc; pp = vol->v_provider; KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name)); g_topology_lock(); g_error_provider(pp, ENXIO); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) { if (bp->bio_to != pp) continue; bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.", pp->name, vol->v_name); g_wither_provider(pp, ENXIO); g_topology_unlock(); vol->v_provider = NULL; } /* * Update device state. 
*/ static int g_raid_update_volume(struct g_raid_volume *vol, u_int event) { struct g_raid_softc *sc; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for volume %s.", g_raid_volume_event2str(event), vol->v_name); switch (event) { case G_RAID_VOLUME_E_DOWN: if (vol->v_provider != NULL) g_raid_destroy_provider(vol); break; case G_RAID_VOLUME_E_UP: if (vol->v_provider == NULL) g_raid_launch_provider(vol); break; case G_RAID_VOLUME_E_START: if (vol->v_tr) G_RAID_TR_START(vol->v_tr); return (0); default: if (sc->sc_md) G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event); return (0); } /* Manage root mount release. */ if (vol->v_starting) { vol->v_starting = 0; G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount); root_mount_rel(vol->v_rootmount); vol->v_rootmount = NULL; } if (vol->v_stopping && vol->v_provider_open == 0) g_raid_destroy_volume(vol); return (0); } /* * Update subdisk state. */ static int g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = sd->sd_softc; vol = sd->sd_volume; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.", g_raid_subdisk_event2str(event), vol->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); if (vol->v_tr) G_RAID_TR_EVENT(vol->v_tr, sd, event); return (0); } /* * Update disk state. */ static int g_raid_update_disk(struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for disk %s.", g_raid_disk_event2str(event), g_raid_get_diskname(disk)); if (sc->sc_md) G_RAID_MD_EVENT(sc->sc_md, disk, event); return (0); } /* * Node event. 
*/ static int g_raid_update_node(struct g_raid_softc *sc, u_int event) { sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for the array.", g_raid_node_event2str(event)); if (event == G_RAID_NODE_E_WAKE) return (0); if (sc->sc_md) G_RAID_MD_EVENT(sc->sc_md, NULL, event); return (0); } static int g_raid_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_raid_volume *vol; struct g_raid_softc *sc; int dcw, opens, error = 0; g_topology_assert(); sc = pp->geom->softc; vol = pp->private; KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name)); G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); dcw = pp->acw + acw; g_topology_unlock(); sx_xlock(&sc->sc_lock); /* Deny new opens while dying. */ if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) { error = ENXIO; goto out; } if (dcw == 0) g_raid_clean(vol, dcw); vol->v_provider_open += acr + acw + ace; /* Handle delayed node destruction. */ if (sc->sc_stopping == G_RAID_DESTROY_DELAYED && vol->v_provider_open == 0) { /* Count open volumes. */ opens = g_raid_nopens(sc); if (opens == 0) { sc->sc_stopping = G_RAID_DESTROY_HARD; /* Wake up worker to make it selfdestruct. */ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } } /* Handle open volume destruction. 
*/ if (vol->v_stopping && vol->v_provider_open == 0) g_raid_destroy_volume(vol); out: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } struct g_raid_softc * g_raid_create_node(struct g_class *mp, const char *name, struct g_raid_md_object *md) { struct g_raid_softc *sc; struct g_geom *gp; int error; g_topology_assert(); G_RAID_DEBUG(1, "Creating array %s.", name); gp = g_new_geomf(mp, "%s", name); sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO); gp->start = g_raid_start; gp->orphan = g_raid_orphan; gp->access = g_raid_access; gp->dumpconf = g_raid_dumpconf; sc->sc_md = md; sc->sc_geom = gp; sc->sc_flags = 0; TAILQ_INIT(&sc->sc_volumes); TAILQ_INIT(&sc->sc_disks); sx_init(&sc->sc_lock, "graid:lock"); mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF); TAILQ_INIT(&sc->sc_events); bioq_init(&sc->sc_queue); gp->softc = sc; error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0, "g_raid %s", name); if (error != 0) { G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc, M_RAID); return (NULL); } G_RAID_DEBUG1(0, sc, "Array %s created.", name); return (sc); } struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id) { struct g_raid_volume *vol, *vol1; int i; G_RAID_DEBUG1(1, sc, "Creating volume %s.", name); vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO); vol->v_softc = sc; strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME); vol->v_state = G_RAID_VOLUME_S_STARTING; vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN; vol->v_rotate_parity = 1; bioq_init(&vol->v_inflight); bioq_init(&vol->v_locked); LIST_INIT(&vol->v_locks); for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { vol->v_subdisks[i].sd_softc = sc; vol->v_subdisks[i].sd_volume = vol; vol->v_subdisks[i].sd_pos = i; vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE; } /* Find free ID for this volume. 
*/ g_topology_lock(); vol1 = vol; if (id >= 0) { LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { if (vol1->v_global_id == id) break; } } if (vol1 != NULL) { for (id = 0; ; id++) { LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { if (vol1->v_global_id == id) break; } if (vol1 == NULL) break; } } vol->v_global_id = id; LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next); g_topology_unlock(); /* Delay root mounting. */ vol->v_rootmount = root_mount_hold("GRAID"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount); vol->v_starting = 1; TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next); return (vol); } struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc) { struct g_raid_disk *disk; G_RAID_DEBUG1(1, sc, "Creating disk."); disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO); disk->d_softc = sc; disk->d_state = G_RAID_DISK_S_NONE; TAILQ_INIT(&disk->d_subdisks); TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next); return (disk); } int g_raid_start_volume(struct g_raid_volume *vol) { struct g_raid_tr_class *class; struct g_raid_tr_object *obj; int status; G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name); LIST_FOREACH(class, &g_raid_tr_classes, trc_list) { if (!class->trc_enable) continue; G_RAID_DEBUG1(2, vol->v_softc, "Tasting volume %s for %s transformation.", vol->v_name, class->name); obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->tro_class = class; obj->tro_volume = vol; status = G_RAID_TR_TASTE(obj, vol); if (status != G_RAID_TR_TASTE_FAIL) break; kobj_delete((kobj_t)obj, M_RAID); } if (class == NULL) { G_RAID_DEBUG1(0, vol->v_softc, "No transformation module found for %s.", vol->v_name); vol->v_tr = NULL; g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED); g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); return (-1); } G_RAID_DEBUG1(2, vol->v_softc, "Transformation module %s chosen for %s.", class->name, vol->v_name); vol->v_tr = obj; return (0); } int 
g_raid_destroy_node(struct g_raid_softc *sc, int worker) { struct g_raid_volume *vol, *tmpv; struct g_raid_disk *disk, *tmpd; int error = 0; sc->sc_stopping = G_RAID_DESTROY_HARD; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) { if (g_raid_destroy_volume(vol)) error = EBUSY; } if (error) return (error); TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) { if (g_raid_destroy_disk(disk)) error = EBUSY; } if (error) return (error); if (sc->sc_md) { G_RAID_MD_FREE(sc->sc_md); kobj_delete((kobj_t)sc->sc_md, M_RAID); sc->sc_md = NULL; } if (sc->sc_geom != NULL) { G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name); g_topology_lock(); sc->sc_geom->softc = NULL; g_wither_geom(sc->sc_geom, ENXIO); g_topology_unlock(); sc->sc_geom = NULL; } else G_RAID_DEBUG(1, "Array destroyed."); if (worker) { g_raid_event_cancel(sc, sc); mtx_destroy(&sc->sc_queue_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); wakeup(&sc->sc_stopping); free(sc, M_RAID); curthread->td_pflags &= ~TDP_GEOM; G_RAID_DEBUG(1, "Thread exiting."); kproc_exit(0); } else { /* Wake up worker to make it selfdestruct. 
*/ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } return (0); } int g_raid_destroy_volume(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_disk *disk; int i; sc = vol->v_softc; G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name); vol->v_stopping = 1; if (vol->v_state != G_RAID_VOLUME_S_STOPPED) { if (vol->v_tr) { G_RAID_TR_STOP(vol->v_tr); return (EBUSY); } else vol->v_state = G_RAID_VOLUME_S_STOPPED; } if (g_raid_event_check(sc, vol) != 0) return (EBUSY); if (vol->v_provider != NULL) return (EBUSY); if (vol->v_provider_open != 0) return (EBUSY); if (vol->v_tr) { G_RAID_TR_FREE(vol->v_tr); kobj_delete((kobj_t)vol->v_tr, M_RAID); vol->v_tr = NULL; } if (vol->v_rootmount) root_mount_rel(vol->v_rootmount); g_topology_lock(); LIST_REMOVE(vol, v_global_next); g_topology_unlock(); TAILQ_REMOVE(&sc->sc_volumes, vol, v_next); for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { g_raid_event_cancel(sc, &vol->v_subdisks[i]); disk = vol->v_subdisks[i].sd_disk; if (disk == NULL) continue; TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next); } G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name); if (sc->sc_md) G_RAID_MD_FREE_VOLUME(sc->sc_md, vol); g_raid_event_cancel(sc, vol); free(vol, M_RAID); if (sc->sc_stopping == G_RAID_DESTROY_HARD) { /* Wake up worker to let it selfdestruct. 
*/ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } return (0); } int g_raid_destroy_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmp; sc = disk->d_softc; G_RAID_DEBUG1(2, sc, "Destroying disk."); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next); sd->sd_disk = NULL; } TAILQ_REMOVE(&sc->sc_disks, disk, d_next); if (sc->sc_md) G_RAID_MD_FREE_DISK(sc->sc_md, disk); g_raid_event_cancel(sc, disk); free(disk, M_RAID); return (0); } int g_raid_destroy(struct g_raid_softc *sc, int how) { int opens; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); /* Count open volumes. */ opens = g_raid_nopens(sc); /* React on some opened volumes. */ if (opens > 0) { switch (how) { case G_RAID_DESTROY_SOFT: G_RAID_DEBUG1(1, sc, "%d volumes are still open.", opens); return (EBUSY); case G_RAID_DESTROY_DELAYED: G_RAID_DEBUG1(1, sc, "Array will be destroyed on last close."); sc->sc_stopping = G_RAID_DESTROY_DELAYED; return (EBUSY); case G_RAID_DESTROY_HARD: G_RAID_DEBUG1(1, sc, "%d volumes are still open.", opens); } } /* Mark node for destruction. */ sc->sc_stopping = G_RAID_DESTROY_HARD; /* Wake up worker to let it selfdestruct. */ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); /* Sleep until node destroyed. 
*/ sx_sleep(&sc->sc_stopping, &sc->sc_lock, PRIBIO | PDROP, "r:destroy", 0); return (0); } static void g_raid_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp, *geom; struct g_raid_md_class *class; struct g_raid_md_object *obj; int status; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); if (!g_raid_enable) return (NULL); G_RAID_DEBUG(2, "Tasting provider %s.", pp->name); gp = g_new_geomf(mp, "raid:taste"); /* * This orphan function should be never called. */ gp->orphan = g_raid_taste_orphan; cp = g_new_consumer(gp); g_attach(cp, pp); geom = NULL; LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { if (!class->mdc_enable) continue; G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.", pp->name, class->name); obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->mdo_class = class; status = G_RAID_MD_TASTE(obj, mp, cp, &geom); if (status != G_RAID_MD_TASTE_NEW) kobj_delete((kobj_t)obj, M_RAID); if (status != G_RAID_MD_TASTE_FAIL) break; } g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name); return (geom); } int g_raid_create_node_format(const char *format, struct gctl_req *req, struct g_geom **gp) { struct g_raid_md_class *class; struct g_raid_md_object *obj; int status; G_RAID_DEBUG(2, "Creating array for %s metadata.", format); LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { if (strcasecmp(class->name, format) == 0) break; } if (class == NULL) { G_RAID_DEBUG(1, "No support for %s metadata.", format); return (G_RAID_MD_TASTE_FAIL); } obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->mdo_class = class; status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp); if (status != G_RAID_MD_TASTE_NEW) 
kobj_delete((kobj_t)obj, M_RAID); return (status); } static int g_raid_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_raid_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { if (sc->sc_stopping == G_RAID_DESTROY_HARD) return; if (sc->sc_md) G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk); } void g_raid_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { if (disk == NULL) disk = sd->sd_disk; if (disk == NULL) { G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!"); return; } if (disk->d_state != G_RAID_DISK_S_ACTIVE) { G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a " "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); return; } if (sc->sc_md) G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk); } static void g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; int i, s; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { vol = pp->private; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s\n", indent, vol->v_name); sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_level2str(vol->v_raid_level, vol->v_raid_level_qualifier)); sbuf_printf(sb, "%s%s\n", indent, vol->v_tr ? vol->v_tr->tro_class->name : "NONE"); sbuf_printf(sb, "%s%u\n", indent, vol->v_disks_count); sbuf_printf(sb, "%s%u\n", indent, vol->v_strip_size); sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_state2str(vol->v_state)); sbuf_printf(sb, "%s%s\n", indent, vol->v_dirty ? 
"Yes" : "No"); sbuf_printf(sb, "%s", indent); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk != NULL && sd->sd_disk->d_consumer != NULL) { sbuf_printf(sb, "%s ", g_raid_get_diskname(sd->sd_disk)); } else { sbuf_printf(sb, "NONE "); } sbuf_printf(sb, "(%s", g_raid_subdisk_state2str(sd->sd_state)); if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { sbuf_printf(sb, " %d%%", (int)(sd->sd_rebuild_pos * 100 / sd->sd_size)); } sbuf_printf(sb, ")"); if (i + 1 < vol->v_disks_count) sbuf_printf(sb, ", "); } sbuf_printf(sb, "\n"); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else if (cp != NULL) { disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%s", indent, g_raid_disk_state2str(disk->d_state)); if (!TAILQ_EMPTY(&disk->d_subdisks)) { sbuf_printf(sb, " ("); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { sbuf_printf(sb, "%s", g_raid_subdisk_state2str(sd->sd_state)); if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { sbuf_printf(sb, " %d%%", (int)(sd->sd_rebuild_pos * 100 / sd->sd_size)); } if (TAILQ_NEXT(sd, sd_next)) sbuf_printf(sb, ", "); } sbuf_printf(sb, ")"); } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s", indent); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { sbuf_printf(sb, "r%d(%s):%d@%ju", sd->sd_volume->v_global_id, sd->sd_volume->v_name, sd->sd_pos, sd->sd_offset); if (TAILQ_NEXT(sd, sd_next)) sbuf_printf(sb, ", "); } sbuf_printf(sb, "\n"); sbuf_printf(sb, "%s%d\n", indent, disk->d_read_errs); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); if (sc->sc_md) { sbuf_printf(sb, "%s%s\n", indent, sc->sc_md->mdo_class->name); } if (!TAILQ_EMPTY(&sc->sc_volumes)) { s = 0xff; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_state < s) s = vol->v_state; } sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_state2str(s)); } 
sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_raid_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_raid_softc *sc; struct g_raid_volume *vol; int error; mp = arg; DROP_GIANT(); g_topology_lock(); g_raid_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) g_raid_clean(vol, -1); g_cancel_event(sc); error = g_raid_destroy(sc, G_RAID_DESTROY_DELAYED); if (error != 0) sx_xunlock(&sc->sc_lock); g_topology_lock(); } g_topology_unlock(); PICKUP_GIANT(); } static void g_raid_init(struct g_class *mp) { g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_raid_post_sync == NULL) G_RAID_DEBUG(0, "Warning! Cannot register shutdown event."); g_raid_started = 1; } static void g_raid_fini(struct g_class *mp) { if (g_raid_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync); g_raid_started = 0; } int g_raid_md_modevent(module_t mod, int type, void *arg) { struct g_raid_md_class *class, *c, *nc; int error; error = 0; class = arg; switch (type) { case MOD_LOAD: c = LIST_FIRST(&g_raid_md_classes); if (c == NULL || c->mdc_priority > class->mdc_priority) LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list); else { while ((nc = LIST_NEXT(c, mdc_list)) != NULL && nc->mdc_priority < class->mdc_priority) c = nc; LIST_INSERT_AFTER(c, class, mdc_list); } if (g_raid_started) g_retaste(&g_raid_class); break; case MOD_UNLOAD: LIST_REMOVE(class, mdc_list); break; default: error = EOPNOTSUPP; break; } return (error); } int g_raid_tr_modevent(module_t mod, int type, void *arg) { struct g_raid_tr_class *class, *c, *nc; int error; error = 0; class = arg; switch (type) { case MOD_LOAD: c = LIST_FIRST(&g_raid_tr_classes); if (c == NULL || c->trc_priority > class->trc_priority) 
LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list); else { while ((nc = LIST_NEXT(c, trc_list)) != NULL && nc->trc_priority < class->trc_priority) c = nc; LIST_INSERT_AFTER(c, class, trc_list); } break; case MOD_UNLOAD: LIST_REMOVE(class, trc_list); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid) * to reduce module priority, allowing submodules to register them first. */ static moduledata_t g_raid_mod = { "g_raid", g_modevent, &g_raid_class }; DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD); MODULE_VERSION(geom_raid, 0); Index: stable/8/sys/geom/raid/g_raid.h =================================================================== --- stable/8/sys/geom/raid/g_raid.h (revision 243678) +++ stable/8/sys/geom/raid/g_raid.h (revision 243679) @@ -1,468 +1,470 @@ /*- * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _G_RAID_H_ #define _G_RAID_H_ #include #include #include #include #ifdef _KERNEL #include #endif #define G_RAID_CLASS_NAME "RAID" #define G_RAID_MAGIC "GEOM::RAID" #define G_RAID_VERSION 0 struct g_raid_md_object; struct g_raid_tr_object; #define G_RAID_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL #define G_RAID_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL #define G_RAID_DEVICE_FLAG_MASK (G_RAID_DEVICE_FLAG_NOAUTOSYNC | \ G_RAID_DEVICE_FLAG_NOFAILSYNC) #ifdef _KERNEL extern u_int g_raid_aggressive_spare; extern u_int g_raid_debug; extern int g_raid_enable; extern int g_raid_read_err_thresh; extern u_int g_raid_start_timeout; extern struct g_class g_raid_class; #define G_RAID_DEBUG(lvl, fmt, ...) do { \ if (g_raid_debug >= (lvl)) { \ if (g_raid_debug > 0) { \ printf("GEOM_RAID[%u]: " fmt "\n", \ lvl, ## __VA_ARGS__); \ } else { \ printf("GEOM_RAID: " fmt "\n", \ ## __VA_ARGS__); \ } \ } \ } while (0) #define G_RAID_DEBUG1(lvl, sc, fmt, ...) do { \ if (g_raid_debug >= (lvl)) { \ if (g_raid_debug > 0) { \ printf("GEOM_RAID[%u]: %s: " fmt "\n", \ lvl, (sc)->sc_name, ## __VA_ARGS__); \ } else { \ printf("GEOM_RAID: %s: " fmt "\n", \ (sc)->sc_name, ## __VA_ARGS__); \ } \ } \ } while (0) #define G_RAID_LOGREQ(lvl, bp, fmt, ...) 
do { \ if (g_raid_debug >= (lvl)) { \ if (g_raid_debug > 0) { \ printf("GEOM_RAID[%u]: " fmt " ", \ lvl, ## __VA_ARGS__); \ } else \ printf("GEOM_RAID: " fmt " ", ## __VA_ARGS__); \ g_print_bio(bp); \ printf("\n"); \ } \ } while (0) /* * Flags we use to distinguish I/O initiated by the TR layer to maintain * the volume's characteristics, fix subdisks, extra copies of data, etc. * * G_RAID_BIO_FLAG_SYNC I/O to update an extra copy of the data * for RAID volumes that maintain extra data * and need to rebuild that data. * G_RAID_BIO_FLAG_REMAP I/O done to try to provoke a subdisk into * doing some desirable action such as bad * block remapping after we detect a bad part * of the disk. * G_RAID_BIO_FLAG_LOCKED I/O holds a range lock that should be released. * * and the following meta item: * G_RAID_BIO_FLAG_SPECIAL Any of the I/O flags that need to make it * through the range locking which would * otherwise defer the I/O until after that * range is unlocked. */ #define G_RAID_BIO_FLAG_SYNC 0x01 #define G_RAID_BIO_FLAG_REMAP 0x02 #define G_RAID_BIO_FLAG_SPECIAL \ (G_RAID_BIO_FLAG_SYNC|G_RAID_BIO_FLAG_REMAP) #define G_RAID_BIO_FLAG_LOCKED 0x80 struct g_raid_lock { off_t l_offset; off_t l_length; void *l_callback_arg; int l_pending; LIST_ENTRY(g_raid_lock) l_next; }; #define G_RAID_EVENT_WAIT 0x01 #define G_RAID_EVENT_VOLUME 0x02 #define G_RAID_EVENT_SUBDISK 0x04 #define G_RAID_EVENT_DISK 0x08 #define G_RAID_EVENT_DONE 0x10 struct g_raid_event { void *e_tgt; int e_event; int e_flags; int e_error; TAILQ_ENTRY(g_raid_event) e_next; }; #define G_RAID_DISK_S_NONE 0x00 /* State is unknown. */ #define G_RAID_DISK_S_OFFLINE 0x01 /* Missing disk placeholder. */ #define G_RAID_DISK_S_FAILED 0x02 /* Failed. */ #define G_RAID_DISK_S_STALE_FAILED 0x03 /* Old failed. */ #define G_RAID_DISK_S_SPARE 0x04 /* Hot-spare. */ #define G_RAID_DISK_S_STALE 0x05 /* Old disk, unused now. */ #define G_RAID_DISK_S_ACTIVE 0x06 /* Operational.
*/ #define G_RAID_DISK_E_DISCONNECTED 0x01 struct g_raid_disk { struct g_raid_softc *d_softc; /* Back-pointer to softc. */ struct g_consumer *d_consumer; /* GEOM disk consumer. */ void *d_md_data; /* Disk's metadata storage. */ struct g_kerneldump d_kd; /* Kernel dumping method/args. */ + int d_candelete; /* BIO_DELETE supported. */ uint64_t d_flags; /* Additional flags. */ u_int d_state; /* Disk state. */ u_int d_load; /* Disk average load. */ off_t d_last_offset; /* Last head offset. */ int d_read_errs; /* Count of the read errors */ TAILQ_HEAD(, g_raid_subdisk) d_subdisks; /* List of subdisks. */ TAILQ_ENTRY(g_raid_disk) d_next; /* Next disk in the node. */ }; #define G_RAID_SUBDISK_S_NONE 0x00 /* Absent. */ #define G_RAID_SUBDISK_S_FAILED 0x01 /* Failed. */ #define G_RAID_SUBDISK_S_NEW 0x02 /* Blank. */ #define G_RAID_SUBDISK_S_REBUILD 0x03 /* Blank + rebuild. */ #define G_RAID_SUBDISK_S_UNINITIALIZED 0x04 /* Disk of the new volume. */ #define G_RAID_SUBDISK_S_STALE 0x05 /* Dirty. */ #define G_RAID_SUBDISK_S_RESYNC 0x06 /* Dirty + check/repair. */ #define G_RAID_SUBDISK_S_ACTIVE 0x07 /* Usable. */ #define G_RAID_SUBDISK_E_NEW 0x01 /* A new subdisk has arrived */ #define G_RAID_SUBDISK_E_FAILED 0x02 /* A subdisk failed, but remains in volume */ #define G_RAID_SUBDISK_E_DISCONNECTED 0x03 /* A subdisk removed from volume. */ #define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80 /* translation private events */ #define G_RAID_SUBDISK_POS(sd) \ ((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0) #define G_RAID_SUBDISK_TRACK_SIZE (1 * 1024 * 1024) #define G_RAID_SUBDISK_LOAD(sd) \ ((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0) #define G_RAID_SUBDISK_LOAD_SCALE 256 struct g_raid_subdisk { struct g_raid_softc *sd_softc; /* Back-pointer to softc. */ struct g_raid_disk *sd_disk; /* Where this subdisk lives. */ struct g_raid_volume *sd_volume; /* Volume, sd is a part of. */ off_t sd_offset; /* Offset on the disk. */ off_t sd_size; /* Size on the disk. 
*/ u_int sd_pos; /* Position in volume. */ u_int sd_state; /* Subdisk state. */ off_t sd_rebuild_pos; /* Rebuild position. */ int sd_recovery; /* Count of recovery reqs. */ TAILQ_ENTRY(g_raid_subdisk) sd_next; /* Next subdisk on disk. */ }; #define G_RAID_MAX_SUBDISKS 16 #define G_RAID_MAX_VOLUMENAME 32 #define G_RAID_VOLUME_S_STARTING 0x00 #define G_RAID_VOLUME_S_BROKEN 0x01 #define G_RAID_VOLUME_S_DEGRADED 0x02 #define G_RAID_VOLUME_S_SUBOPTIMAL 0x03 #define G_RAID_VOLUME_S_OPTIMAL 0x04 #define G_RAID_VOLUME_S_UNSUPPORTED 0x05 #define G_RAID_VOLUME_S_STOPPED 0x06 #define G_RAID_VOLUME_S_ALIVE(s) \ ((s) == G_RAID_VOLUME_S_DEGRADED || \ (s) == G_RAID_VOLUME_S_SUBOPTIMAL || \ (s) == G_RAID_VOLUME_S_OPTIMAL) #define G_RAID_VOLUME_E_DOWN 0x00 #define G_RAID_VOLUME_E_UP 0x01 #define G_RAID_VOLUME_E_START 0x10 #define G_RAID_VOLUME_E_STARTMD 0x11 #define G_RAID_VOLUME_RL_RAID0 0x00 #define G_RAID_VOLUME_RL_RAID1 0x01 #define G_RAID_VOLUME_RL_RAID3 0x03 #define G_RAID_VOLUME_RL_RAID4 0x04 #define G_RAID_VOLUME_RL_RAID5 0x05 #define G_RAID_VOLUME_RL_RAID6 0x06 #define G_RAID_VOLUME_RL_RAIDMDF 0x07 #define G_RAID_VOLUME_RL_RAID1E 0x11 #define G_RAID_VOLUME_RL_SINGLE 0x0f #define G_RAID_VOLUME_RL_CONCAT 0x1f #define G_RAID_VOLUME_RL_RAID5E 0x15 #define G_RAID_VOLUME_RL_RAID5EE 0x25 #define G_RAID_VOLUME_RL_RAID5R 0x35 #define G_RAID_VOLUME_RL_UNKNOWN 0xff #define G_RAID_VOLUME_RLQ_NONE 0x00 #define G_RAID_VOLUME_RLQ_R1SM 0x00 #define G_RAID_VOLUME_RLQ_R1MM 0x01 #define G_RAID_VOLUME_RLQ_R3P0 0x00 #define G_RAID_VOLUME_RLQ_R3PN 0x01 #define G_RAID_VOLUME_RLQ_R4P0 0x00 #define G_RAID_VOLUME_RLQ_R4PN 0x01 #define G_RAID_VOLUME_RLQ_R5RA 0x00 #define G_RAID_VOLUME_RLQ_R5RS 0x01 #define G_RAID_VOLUME_RLQ_R5LA 0x02 #define G_RAID_VOLUME_RLQ_R5LS 0x03 #define G_RAID_VOLUME_RLQ_R6RA 0x00 #define G_RAID_VOLUME_RLQ_R6RS 0x01 #define G_RAID_VOLUME_RLQ_R6LA 0x02 #define G_RAID_VOLUME_RLQ_R6LS 0x03 #define G_RAID_VOLUME_RLQ_RMDFRA 0x00 #define G_RAID_VOLUME_RLQ_RMDFRS 0x01 #define 
G_RAID_VOLUME_RLQ_RMDFLA 0x02 #define G_RAID_VOLUME_RLQ_RMDFLS 0x03 #define G_RAID_VOLUME_RLQ_R1EA 0x00 #define G_RAID_VOLUME_RLQ_R1EO 0x01 #define G_RAID_VOLUME_RLQ_R5ERA 0x00 #define G_RAID_VOLUME_RLQ_R5ERS 0x01 #define G_RAID_VOLUME_RLQ_R5ELA 0x02 #define G_RAID_VOLUME_RLQ_R5ELS 0x03 #define G_RAID_VOLUME_RLQ_R5EERA 0x00 #define G_RAID_VOLUME_RLQ_R5EERS 0x01 #define G_RAID_VOLUME_RLQ_R5EELA 0x02 #define G_RAID_VOLUME_RLQ_R5EELS 0x03 #define G_RAID_VOLUME_RLQ_R5RRA 0x00 #define G_RAID_VOLUME_RLQ_R5RRS 0x01 #define G_RAID_VOLUME_RLQ_R5RLA 0x02 #define G_RAID_VOLUME_RLQ_R5RLS 0x03 #define G_RAID_VOLUME_RLQ_UNKNOWN 0xff struct g_raid_volume; struct g_raid_volume { struct g_raid_softc *v_softc; /* Back-pointer to softc. */ struct g_provider *v_provider; /* GEOM provider. */ struct g_raid_subdisk v_subdisks[G_RAID_MAX_SUBDISKS]; /* Subdisks of this volume. */ void *v_md_data; /* Volume's metadata storage. */ struct g_raid_tr_object *v_tr; /* Transformation object. */ char v_name[G_RAID_MAX_VOLUMENAME]; /* Volume name. */ u_int v_state; /* Volume state. */ u_int v_raid_level; /* Array RAID level. */ u_int v_raid_level_qualifier; /* RAID level det. */ u_int v_disks_count; /* Number of disks in array. */ u_int v_mdf_pdisks; /* Number of parity disks in RAIDMDF array. */ uint16_t v_mdf_polynomial; /* Polynomial for RAIDMDF. */ uint8_t v_mdf_method; /* Generation method for RAIDMDF. */ u_int v_strip_size; /* Array strip size. */ u_int v_rotate_parity; /* Rotate RAID5R parity after this number of stripes. */ u_int v_sectorsize; /* Volume sector size. */ off_t v_mediasize; /* Volume media size. */ struct bio_queue_head v_inflight; /* In-flight write requests. */ struct bio_queue_head v_locked; /* Blocked I/O requests. */ LIST_HEAD(, g_raid_lock) v_locks; /* List of locked regions. */ int v_pending_lock; /* writes to locked region */ int v_dirty; /* Volume is DIRTY. */ struct timeval v_last_done; /* Time of the last I/O. */ time_t v_last_write; /* Time of the last write.
*/ u_int v_writes; /* Number of active writes. */ struct root_hold_token *v_rootmount; /* Root mount delay token. */ int v_starting; /* Volume is starting */ int v_stopping; /* Volume is stopping */ int v_provider_open; /* Number of opens. */ int v_global_id; /* Global volume ID (rX). */ TAILQ_ENTRY(g_raid_volume) v_next; /* List of volumes entry. */ LIST_ENTRY(g_raid_volume) v_global_next; /* Global list entry. */ }; #define G_RAID_NODE_E_WAKE 0x00 #define G_RAID_NODE_E_START 0x01 struct g_raid_softc { struct g_raid_md_object *sc_md; /* Metadata object. */ struct g_geom *sc_geom; /* GEOM class instance. */ uint64_t sc_flags; /* Additional flags. */ TAILQ_HEAD(, g_raid_volume) sc_volumes; /* List of volumes. */ TAILQ_HEAD(, g_raid_disk) sc_disks; /* List of disks. */ struct sx sc_lock; /* Main node lock. */ struct proc *sc_worker; /* Worker process. */ struct mtx sc_queue_mtx; /* Worker queues lock. */ TAILQ_HEAD(, g_raid_event) sc_events; /* Worker events queue. */ struct bio_queue_head sc_queue; /* Worker I/O queue. */ int sc_stopping; /* Node is stopping */ }; #define sc_name sc_geom->name SYSCTL_DECL(_kern_geom_raid); /* * KOBJ parent class of metadata processing modules. */ struct g_raid_md_class { KOBJ_CLASS_FIELDS; int mdc_enable; int mdc_priority; LIST_ENTRY(g_raid_md_class) mdc_list; }; /* * KOBJ instance of metadata processing module. */ struct g_raid_md_object { KOBJ_FIELDS; struct g_raid_md_class *mdo_class; struct g_raid_softc *mdo_softc; /* Back-pointer to softc. 
*/ }; int g_raid_md_modevent(module_t, int, void *); #define G_RAID_MD_DECLARE(name, label) \ static moduledata_t g_raid_md_##name##_mod = { \ "g_raid_md_" __XSTRING(name), \ g_raid_md_modevent, \ &g_raid_md_##name##_class \ }; \ DECLARE_MODULE(g_raid_md_##name, g_raid_md_##name##_mod, \ SI_SUB_DRIVERS, SI_ORDER_SECOND); \ MODULE_DEPEND(g_raid_md_##name, geom_raid, 0, 0, 0); \ SYSCTL_NODE(_kern_geom_raid, OID_AUTO, name, CTLFLAG_RD, \ NULL, label " metadata module"); \ SYSCTL_INT(_kern_geom_raid_##name, OID_AUTO, enable, \ CTLFLAG_RW, &g_raid_md_##name##_class.mdc_enable, 0, \ "Enable " label " metadata format taste"); \ TUNABLE_INT("kern.geom.raid." __XSTRING(name) ".enable", \ &g_raid_md_##name##_class.mdc_enable) /* * KOBJ parent class of data transformation modules. */ struct g_raid_tr_class { KOBJ_CLASS_FIELDS; int trc_enable; int trc_priority; LIST_ENTRY(g_raid_tr_class) trc_list; }; /* * KOBJ instance of data transformation module. */ struct g_raid_tr_object { KOBJ_FIELDS; struct g_raid_tr_class *tro_class; struct g_raid_volume *tro_volume; /* Back-pointer to volume. */ }; int g_raid_tr_modevent(module_t, int, void *); #define G_RAID_TR_DECLARE(name, label) \ static moduledata_t g_raid_tr_##name##_mod = { \ "g_raid_tr_" __XSTRING(name), \ g_raid_tr_modevent, \ &g_raid_tr_##name##_class \ }; \ DECLARE_MODULE(g_raid_tr_##name, g_raid_tr_##name##_mod, \ SI_SUB_DRIVERS, SI_ORDER_FIRST); \ MODULE_DEPEND(g_raid_tr_##name, geom_raid, 0, 0, 0); \ SYSCTL_NODE(_kern_geom_raid, OID_AUTO, name, CTLFLAG_RD, \ NULL, label " transformation module"); \ SYSCTL_INT(_kern_geom_raid_##name, OID_AUTO, enable, \ CTLFLAG_RW, &g_raid_tr_##name##_class.trc_enable, 0, \ "Enable " label " transformation module taste"); \ TUNABLE_INT("kern.geom.raid." 
__XSTRING(name) ".enable", \ &g_raid_tr_##name##_class.trc_enable) const char * g_raid_volume_level2str(int level, int qual); int g_raid_volume_str2level(const char *str, int *level, int *qual); const char * g_raid_volume_state2str(int state); const char * g_raid_subdisk_state2str(int state); const char * g_raid_disk_state2str(int state); struct g_raid_softc * g_raid_create_node(struct g_class *mp, const char *name, struct g_raid_md_object *md); int g_raid_create_node_format(const char *format, struct gctl_req *req, struct g_geom **gp); struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id); struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc); const char * g_raid_get_diskname(struct g_raid_disk *disk); +void g_raid_get_disk_info(struct g_raid_disk *disk); int g_raid_start_volume(struct g_raid_volume *vol); int g_raid_destroy_node(struct g_raid_softc *sc, int worker); int g_raid_destroy_volume(struct g_raid_volume *vol); int g_raid_destroy_disk(struct g_raid_disk *disk); void g_raid_iodone(struct bio *bp, int error); void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp); int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, void *virtual, vm_offset_t physical, off_t offset, size_t length); struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc, const char *name); void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp); void g_raid_report_disk_state(struct g_raid_disk *disk); void g_raid_change_disk_state(struct g_raid_disk *disk, int state); void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state); void g_raid_change_volume_state(struct g_raid_volume *vol, int state); void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, struct g_raid_subdisk *sd, struct g_raid_disk *disk); void g_raid_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk); void g_raid_tr_flush_common(struct g_raid_tr_object *tr, 
struct bio *bp); int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length); u_int g_raid_ndisks(struct g_raid_softc *sc, int state); u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state); u_int g_raid_nopens(struct g_raid_softc *sc); struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol, int state); #define G_RAID_DESTROY_SOFT 0 #define G_RAID_DESTROY_DELAYED 1 #define G_RAID_DESTROY_HARD 2 int g_raid_destroy(struct g_raid_softc *sc, int how); int g_raid_event_send(void *arg, int event, int flags); int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, struct bio *ignore, void *argp); int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len); g_ctl_req_t g_raid_ctl; #endif /* _KERNEL */ #endif /* !_G_RAID_H_ */ Index: stable/8/sys/geom/raid/md_ddf.c =================================================================== --- stable/8/sys/geom/raid/md_ddf.c (revision 243678) +++ stable/8/sys/geom/raid/md_ddf.c (revision 243679) @@ -1,3082 +1,3059 @@ /*- * Copyright (c) 2012 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "geom/raid/md_ddf.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_DDF, "md_ddf_data", "GEOM_RAID DDF metadata"); #define DDF_MAX_DISKS_HARD 128 #define DDF_MAX_DISKS 16 #define DDF_MAX_VDISKS 7 #define DDF_MAX_PARTITIONS 1 #define DECADE (3600*24*(365*10+2)) /* 10 years in seconds. */ struct ddf_meta { u_int sectorsize; u_int bigendian; struct ddf_header *hdr; struct ddf_cd_record *cdr; struct ddf_pd_record *pdr; struct ddf_vd_record *vdr; void *cr; struct ddf_pdd_record *pdd; struct ddf_bbm_log *bbm; }; struct ddf_vol_meta { u_int sectorsize; u_int bigendian; struct ddf_header *hdr; struct ddf_cd_record *cdr; struct ddf_vd_entry *vde; struct ddf_vdc_record *vdc; struct ddf_vdc_record *bvdc[DDF_MAX_DISKS_HARD]; }; struct g_raid_md_ddf_perdisk { struct ddf_meta pd_meta; }; struct g_raid_md_ddf_pervolume { struct ddf_vol_meta pv_meta; int pv_started; struct callout pv_start_co; /* STARTING state timer. */ }; struct g_raid_md_ddf_object { struct g_raid_md_object mdio_base; u_int mdio_bigendian; struct ddf_meta mdio_meta; int mdio_starting; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_started; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
*/ }; static g_raid_md_create_req_t g_raid_md_create_req_ddf; static g_raid_md_taste_t g_raid_md_taste_ddf; static g_raid_md_event_t g_raid_md_event_ddf; static g_raid_md_volume_event_t g_raid_md_volume_event_ddf; static g_raid_md_ctl_t g_raid_md_ctl_ddf; static g_raid_md_write_t g_raid_md_write_ddf; static g_raid_md_fail_disk_t g_raid_md_fail_disk_ddf; static g_raid_md_free_disk_t g_raid_md_free_disk_ddf; static g_raid_md_free_volume_t g_raid_md_free_volume_ddf; static g_raid_md_free_t g_raid_md_free_ddf; static kobj_method_t g_raid_md_ddf_methods[] = { KOBJMETHOD(g_raid_md_create_req, g_raid_md_create_req_ddf), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_ddf), KOBJMETHOD(g_raid_md_event, g_raid_md_event_ddf), KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_ddf), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_ddf), KOBJMETHOD(g_raid_md_write, g_raid_md_write_ddf), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_ddf), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_ddf), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_ddf), KOBJMETHOD(g_raid_md_free, g_raid_md_free_ddf), { 0, 0 } }; static struct g_raid_md_class g_raid_md_ddf_class = { "DDF", g_raid_md_ddf_methods, sizeof(struct g_raid_md_ddf_object), .mdc_enable = 1, .mdc_priority = 100 }; #define GET8(m, f) ((m)->f) #define GET16(m, f) ((m)->bigendian ? be16dec(&(m)->f) : le16dec(&(m)->f)) #define GET32(m, f) ((m)->bigendian ? be32dec(&(m)->f) : le32dec(&(m)->f)) #define GET64(m, f) ((m)->bigendian ? be64dec(&(m)->f) : le64dec(&(m)->f)) #define GET8D(m, f) (f) #define GET16D(m, f) ((m)->bigendian ? be16dec(&f) : le16dec(&f)) #define GET32D(m, f) ((m)->bigendian ? be32dec(&f) : le32dec(&f)) #define GET64D(m, f) ((m)->bigendian ? be64dec(&f) : le64dec(&f)) #define GET8P(m, f) (*(f)) #define GET16P(m, f) ((m)->bigendian ? be16dec(f) : le16dec(f)) #define GET32P(m, f) ((m)->bigendian ? be32dec(f) : le32dec(f)) #define GET64P(m, f) ((m)->bigendian ? 
be64dec(f) : le64dec(f)) #define SET8P(m, f, v) \ (*(f) = (v)) #define SET16P(m, f, v) \ do { \ if ((m)->bigendian) \ be16enc((f), (v)); \ else \ le16enc((f), (v)); \ } while (0) #define SET32P(m, f, v) \ do { \ if ((m)->bigendian) \ be32enc((f), (v)); \ else \ le32enc((f), (v)); \ } while (0) #define SET64P(m, f, v) \ do { \ if ((m)->bigendian) \ be64enc((f), (v)); \ else \ le64enc((f), (v)); \ } while (0) #define SET8(m, f, v) SET8P((m), &((m)->f), (v)) #define SET16(m, f, v) SET16P((m), &((m)->f), (v)) #define SET32(m, f, v) SET32P((m), &((m)->f), (v)) #define SET64(m, f, v) SET64P((m), &((m)->f), (v)) #define SET8D(m, f, v) SET8P((m), &(f), (v)) #define SET16D(m, f, v) SET16P((m), &(f), (v)) #define SET32D(m, f, v) SET32P((m), &(f), (v)) #define SET64D(m, f, v) SET64P((m), &(f), (v)) #define GETCRNUM(m) (GET32((m), hdr->cr_length) / \ GET16((m), hdr->Configuration_Record_Length)) #define GETVDCPTR(m, n) ((struct ddf_vdc_record *)((uint8_t *)(m)->cr + \ (n) * GET16((m), hdr->Configuration_Record_Length) * \ (m)->sectorsize)) #define GETSAPTR(m, n) ((struct ddf_sa_record *)((uint8_t *)(m)->cr + \ (n) * GET16((m), hdr->Configuration_Record_Length) * \ (m)->sectorsize)) static int isff(uint8_t *buf, int size) { int i; for (i = 0; i < size; i++) if (buf[i] != 0xff) return (0); return (1); } static void print_guid(uint8_t *buf) { int i, ascii; ascii = 1; for (i = 0; i < 24; i++) { if (buf[i] != 0 && (buf[i] < ' ' || buf[i] > 127)) { ascii = 0; break; } } if (ascii) { printf("'%.24s'", buf); } else { for (i = 0; i < 24; i++) printf("%02x", buf[i]); } } static void g_raid_md_ddf_print(struct ddf_meta *meta) { struct ddf_vdc_record *vdc; struct ddf_vuc_record *vuc; struct ddf_sa_record *sa; uint64_t *val2; uint32_t val; int i, j, k, num, num2; if (g_raid_debug < 1) return; printf("********* DDF Metadata *********\n"); printf("**** Header ****\n"); printf("DDF_Header_GUID "); print_guid(meta->hdr->DDF_Header_GUID); printf("\n"); printf("DDF_rev %8.8s\n", (char 
*)&meta->hdr->DDF_rev[0]); printf("Sequence_Number 0x%08x\n", GET32(meta, hdr->Sequence_Number)); printf("TimeStamp 0x%08x\n", GET32(meta, hdr->TimeStamp)); printf("Open_Flag 0x%02x\n", GET16(meta, hdr->Open_Flag)); printf("Foreign_Flag 0x%02x\n", GET16(meta, hdr->Foreign_Flag)); printf("Diskgrouping 0x%02x\n", GET16(meta, hdr->Diskgrouping)); printf("Primary_Header_LBA %ju\n", GET64(meta, hdr->Primary_Header_LBA)); printf("Secondary_Header_LBA %ju\n", GET64(meta, hdr->Secondary_Header_LBA)); printf("WorkSpace_Length %u\n", GET32(meta, hdr->WorkSpace_Length)); printf("WorkSpace_LBA %ju\n", GET64(meta, hdr->WorkSpace_LBA)); printf("Max_PD_Entries %u\n", GET16(meta, hdr->Max_PD_Entries)); printf("Max_VD_Entries %u\n", GET16(meta, hdr->Max_VD_Entries)); printf("Max_Partitions %u\n", GET16(meta, hdr->Max_Partitions)); printf("Configuration_Record_Length %u\n", GET16(meta, hdr->Configuration_Record_Length)); printf("Max_Primary_Element_Entries %u\n", GET16(meta, hdr->Max_Primary_Element_Entries)); printf("Controller Data %u:%u\n", GET32(meta, hdr->cd_section), GET32(meta, hdr->cd_length)); printf("Physical Disk %u:%u\n", GET32(meta, hdr->pdr_section), GET32(meta, hdr->pdr_length)); printf("Virtual Disk %u:%u\n", GET32(meta, hdr->vdr_section), GET32(meta, hdr->vdr_length)); printf("Configuration Recs %u:%u\n", GET32(meta, hdr->cr_section), GET32(meta, hdr->cr_length)); printf("Physical Disk Recs %u:%u\n", GET32(meta, hdr->pdd_section), GET32(meta, hdr->pdd_length)); printf("BBM Log %u:%u\n", GET32(meta, hdr->bbmlog_section), GET32(meta, hdr->bbmlog_length)); printf("Diagnostic Space %u:%u\n", GET32(meta, hdr->Diagnostic_Space), GET32(meta, hdr->Diagnostic_Space_Length)); printf("Vendor_Specific_Logs %u:%u\n", GET32(meta, hdr->Vendor_Specific_Logs), GET32(meta, hdr->Vendor_Specific_Logs_Length)); printf("**** Controler Data ****\n"); printf("Controller_GUID "); print_guid(meta->cdr->Controller_GUID); printf("\n"); printf("Controller_Type 0x%04x%04x 0x%04x%04x\n", 
GET16(meta, cdr->Controller_Type.Vendor_ID), GET16(meta, cdr->Controller_Type.Device_ID), GET16(meta, cdr->Controller_Type.SubVendor_ID), GET16(meta, cdr->Controller_Type.SubDevice_ID)); printf("Product_ID '%.16s'\n", (char *)&meta->cdr->Product_ID[0]); printf("**** Physical Disk Records ****\n"); printf("Populated_PDEs %u\n", GET16(meta, pdr->Populated_PDEs)); printf("Max_PDE_Supported %u\n", GET16(meta, pdr->Max_PDE_Supported)); for (j = 0; j < GET16(meta, pdr->Populated_PDEs); j++) { if (isff(meta->pdr->entry[j].PD_GUID, 24)) continue; if (GET32(meta, pdr->entry[j].PD_Reference) == 0xffffffff) continue; printf("PD_GUID "); print_guid(meta->pdr->entry[j].PD_GUID); printf("\n"); printf("PD_Reference 0x%08x\n", GET32(meta, pdr->entry[j].PD_Reference)); printf("PD_Type 0x%04x\n", GET16(meta, pdr->entry[j].PD_Type)); printf("PD_State 0x%04x\n", GET16(meta, pdr->entry[j].PD_State)); printf("Configured_Size %ju\n", GET64(meta, pdr->entry[j].Configured_Size)); printf("Block_Size %u\n", GET16(meta, pdr->entry[j].Block_Size)); } printf("**** Virtual Disk Records ****\n"); printf("Populated_VDEs %u\n", GET16(meta, vdr->Populated_VDEs)); printf("Max_VDE_Supported %u\n", GET16(meta, vdr->Max_VDE_Supported)); for (j = 0; j < GET16(meta, vdr->Populated_VDEs); j++) { if (isff(meta->vdr->entry[j].VD_GUID, 24)) continue; printf("VD_GUID "); print_guid(meta->vdr->entry[j].VD_GUID); printf("\n"); printf("VD_Number 0x%04x\n", GET16(meta, vdr->entry[j].VD_Number)); printf("VD_Type 0x%04x\n", GET16(meta, vdr->entry[j].VD_Type)); printf("VD_State 0x%02x\n", GET8(meta, vdr->entry[j].VD_State)); printf("Init_State 0x%02x\n", GET8(meta, vdr->entry[j].Init_State)); printf("Drive_Failures_Remaining %u\n", GET8(meta, vdr->entry[j].Drive_Failures_Remaining)); printf("VD_Name '%.16s'\n", (char *)&meta->vdr->entry[j].VD_Name); } printf("**** Configuration Records ****\n"); num = GETCRNUM(meta); for (j = 0; j < num; j++) { vdc = GETVDCPTR(meta, j); val = GET32D(meta, vdc->Signature); switch 
(val) { case DDF_VDCR_SIGNATURE: printf("** Virtual Disk Configuration **\n"); printf("VD_GUID "); print_guid(vdc->VD_GUID); printf("\n"); printf("Timestamp 0x%08x\n", GET32D(meta, vdc->Timestamp)); printf("Sequence_Number 0x%08x\n", GET32D(meta, vdc->Sequence_Number)); printf("Primary_Element_Count %u\n", GET16D(meta, vdc->Primary_Element_Count)); printf("Stripe_Size %u\n", GET8D(meta, vdc->Stripe_Size)); printf("Primary_RAID_Level 0x%02x\n", GET8D(meta, vdc->Primary_RAID_Level)); printf("RLQ 0x%02x\n", GET8D(meta, vdc->RLQ)); printf("Secondary_Element_Count %u\n", GET8D(meta, vdc->Secondary_Element_Count)); printf("Secondary_Element_Seq %u\n", GET8D(meta, vdc->Secondary_Element_Seq)); printf("Secondary_RAID_Level 0x%02x\n", GET8D(meta, vdc->Secondary_RAID_Level)); printf("Block_Count %ju\n", GET64D(meta, vdc->Block_Count)); printf("VD_Size %ju\n", GET64D(meta, vdc->VD_Size)); printf("Block_Size %u\n", GET16D(meta, vdc->Block_Size)); printf("Rotate_Parity_count %u\n", GET8D(meta, vdc->Rotate_Parity_count)); printf("Associated_Spare_Disks"); for (i = 0; i < 8; i++) { if (GET32D(meta, vdc->Associated_Spares[i]) != 0xffffffff) printf(" 0x%08x", GET32D(meta, vdc->Associated_Spares[i])); } printf("\n"); printf("Cache_Flags %016jx\n", GET64D(meta, vdc->Cache_Flags)); printf("BG_Rate %u\n", GET8D(meta, vdc->BG_Rate)); printf("MDF_Parity_Disks %u\n", GET8D(meta, vdc->MDF_Parity_Disks)); printf("MDF_Parity_Generator_Polynomial 0x%04x\n", GET16D(meta, vdc->MDF_Parity_Generator_Polynomial)); printf("MDF_Constant_Generation_Method 0x%02x\n", GET8D(meta, vdc->MDF_Constant_Generation_Method)); printf("Physical_Disks "); num2 = GET16D(meta, vdc->Primary_Element_Count); val2 = (uint64_t *)&(vdc->Physical_Disk_Sequence[GET16(meta, hdr->Max_Primary_Element_Entries)]); for (i = 0; i < num2; i++) printf(" 0x%08x @ %ju", GET32D(meta, vdc->Physical_Disk_Sequence[i]), GET64P(meta, val2 + i)); printf("\n"); break; case DDF_VUCR_SIGNATURE: printf("** Vendor Unique Configuration **\n"); 
vuc = (struct ddf_vuc_record *)vdc; printf("VD_GUID "); print_guid(vuc->VD_GUID); printf("\n"); break; case DDF_SA_SIGNATURE: printf("** Spare Assignment Configuration **\n"); sa = (struct ddf_sa_record *)vdc; printf("Timestamp 0x%08x\n", GET32D(meta, sa->Timestamp)); printf("Spare_Type 0x%02x\n", GET8D(meta, sa->Spare_Type)); printf("Populated_SAEs %u\n", GET16D(meta, sa->Populated_SAEs)); printf("MAX_SAE_Supported %u\n", GET16D(meta, sa->MAX_SAE_Supported)); for (i = 0; i < GET16D(meta, sa->Populated_SAEs); i++) { if (isff(sa->entry[i].VD_GUID, 24)) continue; printf("VD_GUID "); for (k = 0; k < 24; k++) printf("%02x", sa->entry[i].VD_GUID[k]); printf("\n"); printf("Secondary_Element %u\n", GET16D(meta, sa->entry[i].Secondary_Element)); } break; case 0x00000000: case 0xFFFFFFFF: break; default: printf("Unknown configuration signature %08x\n", val); break; } } printf("**** Physical Disk Data ****\n"); printf("PD_GUID "); print_guid(meta->pdd->PD_GUID); printf("\n"); printf("PD_Reference 0x%08x\n", GET32(meta, pdd->PD_Reference)); printf("Forced_Ref_Flag 0x%02x\n", GET8(meta, pdd->Forced_Ref_Flag)); printf("Forced_PD_GUID_Flag 0x%02x\n", GET8(meta, pdd->Forced_PD_GUID_Flag)); } static int ddf_meta_find_pd(struct ddf_meta *meta, uint8_t *GUID, uint32_t PD_Reference) { int i; for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) { if (GUID != NULL) { if (memcmp(meta->pdr->entry[i].PD_GUID, GUID, 24) == 0) return (i); } else if (PD_Reference != 0xffffffff) { if (GET32(meta, pdr->entry[i].PD_Reference) == PD_Reference) return (i); } else if (isff(meta->pdr->entry[i].PD_GUID, 24)) return (i); } if (GUID == NULL && PD_Reference == 0xffffffff) { if (i >= GET16(meta, pdr->Max_PDE_Supported)) return (-1); SET16(meta, pdr->Populated_PDEs, i + 1); return (i); } return (-1); } static int ddf_meta_find_vd(struct ddf_meta *meta, uint8_t *GUID) { int i; for (i = 0; i < GET16(meta, vdr->Populated_VDEs); i++) { if (GUID != NULL) { if (memcmp(meta->vdr->entry[i].VD_GUID, GUID, 24) 
== 0) return (i); } else if (isff(meta->vdr->entry[i].VD_GUID, 24)) return (i); } if (GUID == NULL) { if (i >= GET16(meta, vdr->Max_VDE_Supported)) return (-1); SET16(meta, vdr->Populated_VDEs, i + 1); return (i); } return (-1); } static struct ddf_vdc_record * ddf_meta_find_vdc(struct ddf_meta *meta, uint8_t *GUID) { struct ddf_vdc_record *vdc; int i, num; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GUID != NULL) { if (GET32D(meta, vdc->Signature) == DDF_VDCR_SIGNATURE && memcmp(vdc->VD_GUID, GUID, 24) == 0) return (vdc); } else if (GET32D(meta, vdc->Signature) == 0xffffffff || GET32D(meta, vdc->Signature) == 0) return (vdc); } return (NULL); } static int ddf_meta_count_vdc(struct ddf_meta *meta, uint8_t *GUID) { struct ddf_vdc_record *vdc; int i, num, cnt; cnt = 0; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; if (GUID == NULL || memcmp(vdc->VD_GUID, GUID, 24) == 0) cnt++; } return (cnt); } static int ddf_meta_find_disk(struct ddf_vol_meta *vmeta, uint32_t PD_Reference, int *bvdp, int *posp) { int i, bvd, pos; i = 0; for (bvd = 0; bvd < GET16(vmeta, vdc->Secondary_Element_Count); bvd++) { if (vmeta->bvdc[bvd] == NULL) { i += GET16(vmeta, vdc->Primary_Element_Count); // XXX continue; } for (pos = 0; pos < GET16(vmeta, bvdc[bvd]->Primary_Element_Count); pos++, i++) { if (GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]) == PD_Reference) { if (bvdp != NULL) *bvdp = bvd; if (posp != NULL) *posp = pos; return (i); } } } return (-1); } static struct ddf_sa_record * ddf_meta_find_sa(struct ddf_meta *meta, int create) { struct ddf_sa_record *sa; int i, num; num = GETCRNUM(meta); for (i = 0; i < num; i++) { sa = GETSAPTR(meta, i); if (GET32D(meta, sa->Signature) == DDF_SA_SIGNATURE) return (sa); } if (create) { for (i = 0; i < num; i++) { sa = GETSAPTR(meta, i); if (GET32D(meta, sa->Signature) == 0xffffffff || GET32D(meta, sa->Signature) 
== 0) return (sa); } } return (NULL); } static void ddf_meta_create(struct g_raid_disk *disk, struct ddf_meta *sample) { struct timespec ts; struct clocktime ct; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_object *mdi; struct ddf_meta *meta; struct ddf_pd_entry *pde; off_t anchorlba; u_int ss, pos, size; int len, error; char serial_buffer[24]; if (sample->hdr == NULL) sample = NULL; mdi = (struct g_raid_md_ddf_object *)disk->d_softc->sc_md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; meta = &pd->pd_meta; ss = disk->d_consumer->provider->sectorsize; anchorlba = disk->d_consumer->provider->mediasize / ss - 1; meta->sectorsize = ss; meta->bigendian = sample ? sample->bigendian : mdi->mdio_bigendian; getnanotime(&ts); clock_ts_to_ct(&ts, &ct); /* Header */ meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memset(meta->hdr, 0xff, ss); if (sample) { memcpy(meta->hdr, sample->hdr, sizeof(struct ddf_header)); if (ss != sample->sectorsize) { SET32(meta, hdr->WorkSpace_Length, (GET32(sample, hdr->WorkSpace_Length) * sample->sectorsize + ss - 1) / ss); SET16(meta, hdr->Configuration_Record_Length, (GET16(sample, hdr->Configuration_Record_Length) * sample->sectorsize + ss - 1) / ss); SET32(meta, hdr->cd_length, (GET32(sample, hdr->cd_length) * sample->sectorsize + ss - 1) / ss); SET32(meta, hdr->pdr_length, (GET32(sample, hdr->pdr_length) * sample->sectorsize + ss - 1) / ss); SET32(meta, hdr->vdr_length, (GET32(sample, hdr->vdr_length) * sample->sectorsize + ss - 1) / ss); SET32(meta, hdr->cr_length, (GET32(sample, hdr->cr_length) * sample->sectorsize + ss - 1) / ss); SET32(meta, hdr->pdd_length, (GET32(sample, hdr->pdd_length) * sample->sectorsize + ss - 1) / ss); SET32(meta, hdr->bbmlog_length, (GET32(sample, hdr->bbmlog_length) * sample->sectorsize + ss - 1) / ss); SET32(meta, hdr->Diagnostic_Space, (GET32(sample, hdr->bbmlog_length) * sample->sectorsize + ss - 1) / ss); SET32(meta, hdr->Vendor_Specific_Logs, (GET32(sample, hdr->bbmlog_length) * 
sample->sectorsize + ss - 1) / ss); } } else { SET32(meta, hdr->Signature, DDF_HEADER_SIGNATURE); snprintf(meta->hdr->DDF_Header_GUID, 25, "FreeBSD %08x%08x", (u_int)(ts.tv_sec - DECADE), arc4random()); memcpy(meta->hdr->DDF_rev, "02.00.00", 8); SET32(meta, hdr->TimeStamp, (ts.tv_sec - DECADE)); SET32(meta, hdr->WorkSpace_Length, 16 * 1024 * 1024 / ss); SET16(meta, hdr->Max_PD_Entries, DDF_MAX_DISKS - 1); SET16(meta, hdr->Max_VD_Entries, DDF_MAX_VDISKS); SET16(meta, hdr->Max_Partitions, DDF_MAX_PARTITIONS); SET16(meta, hdr->Max_Primary_Element_Entries, DDF_MAX_DISKS); SET16(meta, hdr->Configuration_Record_Length, (sizeof(struct ddf_vdc_record) + (4 + 8) * GET16(meta, hdr->Max_Primary_Element_Entries) + ss - 1) / ss); SET32(meta, hdr->cd_length, (sizeof(struct ddf_cd_record) + ss - 1) / ss); SET32(meta, hdr->pdr_length, (sizeof(struct ddf_pd_record) + sizeof(struct ddf_pd_entry) * GET16(meta, hdr->Max_PD_Entries) + ss - 1) / ss); SET32(meta, hdr->vdr_length, (sizeof(struct ddf_vd_record) + sizeof(struct ddf_vd_entry) * GET16(meta, hdr->Max_VD_Entries) + ss - 1) / ss); SET32(meta, hdr->cr_length, GET16(meta, hdr->Configuration_Record_Length) * (GET16(meta, hdr->Max_Partitions) + 1)); SET32(meta, hdr->pdd_length, (sizeof(struct ddf_pdd_record) + ss - 1) / ss); SET32(meta, hdr->bbmlog_length, 0); SET32(meta, hdr->Diagnostic_Space_Length, 0); SET32(meta, hdr->Vendor_Specific_Logs_Length, 0); } pos = 1; SET32(meta, hdr->cd_section, pos); pos += GET32(meta, hdr->cd_length); SET32(meta, hdr->pdr_section, pos); pos += GET32(meta, hdr->pdr_length); SET32(meta, hdr->vdr_section, pos); pos += GET32(meta, hdr->vdr_length); SET32(meta, hdr->cr_section, pos); pos += GET32(meta, hdr->cr_length); SET32(meta, hdr->pdd_section, pos); pos += GET32(meta, hdr->pdd_length); SET32(meta, hdr->bbmlog_section, GET32(meta, hdr->bbmlog_length) != 0 ? pos : 0xffffffff); pos += GET32(meta, hdr->bbmlog_length); SET32(meta, hdr->Diagnostic_Space, GET32(meta, hdr->Diagnostic_Space_Length) != 0 ? 
pos : 0xffffffff); pos += GET32(meta, hdr->Diagnostic_Space_Length); SET32(meta, hdr->Vendor_Specific_Logs, GET32(meta, hdr->Vendor_Specific_Logs_Length) != 0 ? pos : 0xffffffff); pos += min(GET32(meta, hdr->Vendor_Specific_Logs_Length), 1); SET64(meta, hdr->Primary_Header_LBA, anchorlba - pos); SET64(meta, hdr->Secondary_Header_LBA, 0xffffffffffffffffULL); SET64(meta, hdr->WorkSpace_LBA, anchorlba + 1 - 32 * 1024 * 1024 / ss); /* Controller Data */ size = GET32(meta, hdr->cd_length) * ss; meta->cdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->cdr, 0xff, size); SET32(meta, cdr->Signature, DDF_CONTROLLER_DATA_SIGNATURE); memcpy(meta->cdr->Controller_GUID, "FreeBSD GEOM RAID SERIAL", 24); memcpy(meta->cdr->Product_ID, "FreeBSD GEOMRAID", 16); /* Physical Drive Records. */ size = GET32(meta, hdr->pdr_length) * ss; meta->pdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->pdr, 0xff, size); SET32(meta, pdr->Signature, DDF_PDR_SIGNATURE); SET16(meta, pdr->Populated_PDEs, 1); SET16(meta, pdr->Max_PDE_Supported, GET16(meta, hdr->Max_PD_Entries)); pde = &meta->pdr->entry[0]; len = sizeof(serial_buffer); error = g_io_getattr("GEOM::ident", disk->d_consumer, &len, serial_buffer); if (error == 0 && (len = strlen (serial_buffer)) >= 6 && len <= 20) snprintf(pde->PD_GUID, 25, "DISK%20s", serial_buffer); else snprintf(pde->PD_GUID, 25, "DISK%04d%02d%02d%08x%04x", ct.year, ct.mon, ct.day, arc4random(), arc4random() & 0xffff); SET32D(meta, pde->PD_Reference, arc4random()); SET16D(meta, pde->PD_Type, DDF_PDE_GUID_FORCE); SET16D(meta, pde->PD_State, 0); SET64D(meta, pde->Configured_Size, anchorlba + 1 - 32 * 1024 * 1024 / ss); SET16D(meta, pde->Block_Size, ss); /* Virtual Drive Records. 
*/ size = GET32(meta, hdr->vdr_length) * ss; meta->vdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->vdr, 0xff, size); SET32(meta, vdr->Signature, DDF_VD_RECORD_SIGNATURE); SET32(meta, vdr->Populated_VDEs, 0); SET16(meta, vdr->Max_VDE_Supported, GET16(meta, hdr->Max_VD_Entries)); /* Configuration Records. */ size = GET32(meta, hdr->cr_length) * ss; meta->cr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->cr, 0xff, size); /* Physical Disk Data. */ size = GET32(meta, hdr->pdd_length) * ss; meta->pdd = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->pdd, 0xff, size); SET32(meta, pdd->Signature, DDF_PDD_SIGNATURE); memcpy(meta->pdd->PD_GUID, pde->PD_GUID, 24); SET32(meta, pdd->PD_Reference, GET32D(meta, pde->PD_Reference)); SET8(meta, pdd->Forced_Ref_Flag, DDF_PDD_FORCED_REF); SET8(meta, pdd->Forced_PD_GUID_Flag, DDF_PDD_FORCED_GUID); /* Bad Block Management Log. */ if (GET32(meta, hdr->bbmlog_length) != 0) { size = GET32(meta, hdr->bbmlog_length) * ss; meta->bbm = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->bbm, 0xff, size); SET32(meta, bbm->Signature, DDF_BBML_SIGNATURE); SET32(meta, bbm->Entry_Count, 0); SET32(meta, bbm->Spare_Block_Count, 0); } } static void ddf_meta_copy(struct ddf_meta *dst, struct ddf_meta *src) { struct ddf_header *hdr; u_int ss; hdr = src->hdr; dst->bigendian = src->bigendian; ss = dst->sectorsize = src->sectorsize; dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(dst->hdr, src->hdr, ss); dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss); dst->pdr = malloc(GET32(src, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->pdr, src->pdr, GET32(src, hdr->pdr_length) * ss); dst->vdr = malloc(GET32(src, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->vdr, src->vdr, GET32(src, hdr->vdr_length) * ss); dst->cr = malloc(GET32(src, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cr, src->cr, GET32(src, hdr->cr_length) * ss); dst->pdd = 
malloc(GET32(src, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->pdd, src->pdd, GET32(src, hdr->pdd_length) * ss); if (src->bbm != NULL) { dst->bbm = malloc(GET32(src, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->bbm, src->bbm, GET32(src, hdr->bbmlog_length) * ss); } } static void ddf_meta_update(struct ddf_meta *meta, struct ddf_meta *src) { struct ddf_pd_entry *pde, *spde; int i, j; for (i = 0; i < GET16(src, pdr->Populated_PDEs); i++) { spde = &src->pdr->entry[i]; if (isff(spde->PD_GUID, 24)) continue; j = ddf_meta_find_pd(meta, NULL, GET32(src, pdr->entry[i].PD_Reference)); if (j < 0) { j = ddf_meta_find_pd(meta, NULL, 0xffffffff); pde = &meta->pdr->entry[j]; memcpy(pde, spde, sizeof(*pde)); } else { pde = &meta->pdr->entry[j]; SET16D(meta, pde->PD_State, GET16D(meta, pde->PD_State) | GET16D(src, pde->PD_State)); } } } static void ddf_meta_free(struct ddf_meta *meta) { if (meta->hdr != NULL) { free(meta->hdr, M_MD_DDF); meta->hdr = NULL; } if (meta->cdr != NULL) { free(meta->cdr, M_MD_DDF); meta->cdr = NULL; } if (meta->pdr != NULL) { free(meta->pdr, M_MD_DDF); meta->pdr = NULL; } if (meta->vdr != NULL) { free(meta->vdr, M_MD_DDF); meta->vdr = NULL; } if (meta->cr != NULL) { free(meta->cr, M_MD_DDF); meta->cr = NULL; } if (meta->pdd != NULL) { free(meta->pdd, M_MD_DDF); meta->pdd = NULL; } if (meta->bbm != NULL) { free(meta->bbm, M_MD_DDF); meta->bbm = NULL; } } static void ddf_vol_meta_create(struct ddf_vol_meta *meta, struct ddf_meta *sample) { struct timespec ts; struct clocktime ct; struct ddf_header *hdr; u_int ss, size; hdr = sample->hdr; meta->bigendian = sample->bigendian; ss = meta->sectorsize = sample->sectorsize; meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(meta->hdr, sample->hdr, ss); meta->cdr = malloc(GET32(sample, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cdr, sample->cdr, GET32(sample, hdr->cd_length) * ss); meta->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK); memset(meta->vde, 0xff, 
sizeof(struct ddf_vd_entry)); getnanotime(&ts); clock_ts_to_ct(&ts, &ct); snprintf(meta->vde->VD_GUID, 25, "FreeBSD%04d%02d%02d%08x%01x", ct.year, ct.mon, ct.day, arc4random(), arc4random() & 0xf); size = GET16(sample, hdr->Configuration_Record_Length) * ss; meta->vdc = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->vdc, 0xff, size); SET32(meta, vdc->Signature, DDF_VDCR_SIGNATURE); memcpy(meta->vdc->VD_GUID, meta->vde->VD_GUID, 24); SET32(meta, vdc->Sequence_Number, 0); } static void ddf_vol_meta_update(struct ddf_vol_meta *dst, struct ddf_meta *src, uint8_t *GUID, int started) { struct ddf_header *hdr; struct ddf_vd_entry *vde; struct ddf_vdc_record *vdc; int vnew, bvnew, bvd, size; u_int ss; hdr = src->hdr; vde = &src->vdr->entry[ddf_meta_find_vd(src, GUID)]; vdc = ddf_meta_find_vdc(src, GUID); bvd = GET8D(src, vdc->Secondary_Element_Seq); size = GET16(src, hdr->Configuration_Record_Length) * src->sectorsize; if (dst->vdc == NULL || (!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) - GET32(dst, vdc->Sequence_Number))) > 0)) vnew = 1; else vnew = 0; if (dst->bvdc[bvd] == NULL || (!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) - GET32(dst, bvdc[bvd]->Sequence_Number))) > 0)) bvnew = 1; else bvnew = 0; if (vnew) { dst->bigendian = src->bigendian; ss = dst->sectorsize = src->sectorsize; if (dst->hdr != NULL) free(dst->hdr, M_MD_DDF); dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(dst->hdr, src->hdr, ss); if (dst->cdr != NULL) free(dst->cdr, M_MD_DDF); dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss); if (dst->vde != NULL) free(dst->vde, M_MD_DDF); dst->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK); memcpy(dst->vde, vde, sizeof(struct ddf_vd_entry)); if (dst->vdc != NULL) free(dst->vdc, M_MD_DDF); dst->vdc = malloc(size, M_MD_DDF, M_WAITOK); memcpy(dst->vdc, vdc, size); } if (bvnew) { if (dst->bvdc[bvd] != NULL) free(dst->bvdc[bvd], M_MD_DDF); 
dst->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK); memcpy(dst->bvdc[bvd], vdc, size); } } static void ddf_vol_meta_free(struct ddf_vol_meta *meta) { int i; if (meta->hdr != NULL) { free(meta->hdr, M_MD_DDF); meta->hdr = NULL; } if (meta->cdr != NULL) { free(meta->cdr, M_MD_DDF); meta->cdr = NULL; } if (meta->vde != NULL) { free(meta->vde, M_MD_DDF); meta->vde = NULL; } if (meta->vdc != NULL) { free(meta->vdc, M_MD_DDF); meta->vdc = NULL; } for (i = 0; i < DDF_MAX_DISKS_HARD; i++) { if (meta->bvdc[i] != NULL) { free(meta->bvdc[i], M_MD_DDF); meta->bvdc[i] = NULL; } } } static int ddf_meta_unused_range(struct ddf_meta *meta, off_t *off, off_t *size) { struct ddf_vdc_record *vdc; off_t beg[32], end[32], beg1, end1; uint64_t *offp; int i, j, n, num, pos; uint32_t ref; *off = 0; *size = 0; ref = GET32(meta, pdd->PD_Reference); pos = ddf_meta_find_pd(meta, NULL, ref); beg[0] = 0; end[0] = GET64(meta, pdr->entry[pos].Configured_Size); n = 1; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; for (pos = 0; pos < GET16D(meta, vdc->Primary_Element_Count); pos++) if (GET32D(meta, vdc->Physical_Disk_Sequence[pos]) == ref) break; if (pos == GET16D(meta, vdc->Primary_Element_Count)) continue; offp = (uint64_t *)&(vdc->Physical_Disk_Sequence[ GET16(meta, hdr->Max_Primary_Element_Entries)]); beg1 = GET64P(meta, offp + pos); end1 = beg1 + GET64D(meta, vdc->Block_Count); for (j = 0; j < n; j++) { if (beg[j] >= end1 || end[j] <= beg1 ) continue; if (beg[j] < beg1 && end[j] > end1) { beg[n] = end1; end[n] = end[j]; end[j] = beg1; n++; } else if (beg[j] < beg1) end[j] = beg1; else beg[j] = end1; } } for (j = 0; j < n; j++) { if (end[j] - beg[j] > *size) { *off = beg[j]; *size = end[j] - beg[j]; } } return ((*size > 0) ? 
1 : 0); } static void ddf_meta_get_name(struct ddf_meta *meta, int num, char *buf) { const char *b; int i; b = meta->vdr->entry[num].VD_Name; for (i = 15; i >= 0; i--) if (b[i] != 0x20) break; memcpy(buf, b, i + 1); buf[i + 1] = 0; } static void ddf_meta_put_name(struct ddf_vol_meta *meta, char *buf) { int len; len = min(strlen(buf), 16); memset(meta->vde->VD_Name, 0x20, 16); memcpy(meta->vde->VD_Name, buf, len); } static int ddf_meta_read(struct g_consumer *cp, struct ddf_meta *meta) { struct g_provider *pp; struct ddf_header *ahdr, *hdr; char *abuf, *buf; off_t plba, slba, lba; int error, len, i; u_int ss; uint32_t val; ddf_meta_free(meta); pp = cp->provider; ss = meta->sectorsize = pp->sectorsize; /* Read anchor block. */ abuf = g_read_data(cp, pp->mediasize - ss, ss, &error); if (abuf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (error); } ahdr = (struct ddf_header *)abuf; /* Check if this is an DDF RAID struct */ if (be32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE) meta->bigendian = 1; else if (le32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE) meta->bigendian = 0; else { G_RAID_DEBUG(1, "DDF signature check failed on %s", pp->name); error = EINVAL; goto done; } if (ahdr->Header_Type != DDF_HEADER_ANCHOR) { G_RAID_DEBUG(1, "DDF header type check failed on %s", pp->name); error = EINVAL; goto done; } meta->hdr = ahdr; plba = GET64(meta, hdr->Primary_Header_LBA); slba = GET64(meta, hdr->Secondary_Header_LBA); val = GET32(meta, hdr->CRC); SET32(meta, hdr->CRC, 0xffffffff); meta->hdr = NULL; if (crc32(ahdr, ss) != val) { G_RAID_DEBUG(1, "DDF CRC mismatch on %s", pp->name); error = EINVAL; goto done; } if ((plba + 6) * ss >= pp->mediasize) { G_RAID_DEBUG(1, "DDF primary header LBA is wrong on %s", pp->name); error = EINVAL; goto done; } if (slba != -1 && (slba + 6) * ss >= pp->mediasize) { G_RAID_DEBUG(1, "DDF secondary header LBA is wrong on %s", pp->name); error = EINVAL; goto done; } lba = plba; doread: error 
= 0; ddf_meta_free(meta); /* Read header block. */ buf = g_read_data(cp, lba * ss, ss, &error); if (buf == NULL) { readerror: G_RAID_DEBUG(1, "DDF %s metadata read error on %s (error=%d).", (lba == plba) ? "primary" : "secondary", pp->name, error); if (lba == plba && slba != -1) { lba = slba; goto doread; } G_RAID_DEBUG(1, "DDF metadata read error on %s.", pp->name); goto done; } meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(meta->hdr, buf, ss); g_free(buf); hdr = meta->hdr; val = GET32(meta, hdr->CRC); SET32(meta, hdr->CRC, 0xffffffff); if (hdr->Signature != ahdr->Signature || crc32(meta->hdr, ss) != val || memcmp(hdr->DDF_Header_GUID, ahdr->DDF_Header_GUID, 24) || GET64(meta, hdr->Primary_Header_LBA) != plba || GET64(meta, hdr->Secondary_Header_LBA) != slba) { hdrerror: G_RAID_DEBUG(1, "DDF %s metadata check failed on %s", (lba == plba) ? "primary" : "secondary", pp->name); if (lba == plba && slba != -1) { lba = slba; goto doread; } G_RAID_DEBUG(1, "DDF metadata check failed on %s", pp->name); error = EINVAL; goto done; } if ((lba == plba && hdr->Header_Type != DDF_HEADER_PRIMARY) || (lba == slba && hdr->Header_Type != DDF_HEADER_SECONDARY)) goto hdrerror; len = 1; len = max(len, GET32(meta, hdr->cd_section) + GET32(meta, hdr->cd_length)); len = max(len, GET32(meta, hdr->pdr_section) + GET32(meta, hdr->pdr_length)); len = max(len, GET32(meta, hdr->vdr_section) + GET32(meta, hdr->vdr_length)); len = max(len, GET32(meta, hdr->cr_section) + GET32(meta, hdr->cr_length)); len = max(len, GET32(meta, hdr->pdd_section) + GET32(meta, hdr->pdd_length)); if ((val = GET32(meta, hdr->bbmlog_section)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->bbmlog_length)); if ((val = GET32(meta, hdr->Diagnostic_Space)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->Diagnostic_Space_Length)); if ((val = GET32(meta, hdr->Vendor_Specific_Logs)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->Vendor_Specific_Logs_Length)); if ((plba + len) * ss >= pp->mediasize) goto 
hdrerror; if (slba != -1 && (slba + len) * ss >= pp->mediasize) goto hdrerror; /* Workaround for Adaptec implementation. */ if (GET16(meta, hdr->Max_Primary_Element_Entries) == 0xffff) { SET16(meta, hdr->Max_Primary_Element_Entries, min(GET16(meta, hdr->Max_PD_Entries), (GET16(meta, hdr->Configuration_Record_Length) * ss - 512) / 12)); } /* Read controller data. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss, GET32(meta, hdr->cd_length) * ss, &error); if (buf == NULL) goto readerror; meta->cdr = malloc(GET32(meta, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cdr, buf, GET32(meta, hdr->cd_length) * ss); g_free(buf); if (GET32(meta, cdr->Signature) != DDF_CONTROLLER_DATA_SIGNATURE) goto hdrerror; /* Read physical disk records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss, GET32(meta, hdr->pdr_length) * ss, &error); if (buf == NULL) goto readerror; meta->pdr = malloc(GET32(meta, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->pdr, buf, GET32(meta, hdr->pdr_length) * ss); g_free(buf); if (GET32(meta, pdr->Signature) != DDF_PDR_SIGNATURE) goto hdrerror; /* Read virtual disk records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss, GET32(meta, hdr->vdr_length) * ss, &error); if (buf == NULL) goto readerror; meta->vdr = malloc(GET32(meta, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->vdr, buf, GET32(meta, hdr->vdr_length) * ss); g_free(buf); if (GET32(meta, vdr->Signature) != DDF_VD_RECORD_SIGNATURE) goto hdrerror; /* Read configuration records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss, GET32(meta, hdr->cr_length) * ss, &error); if (buf == NULL) goto readerror; meta->cr = malloc(GET32(meta, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cr, buf, GET32(meta, hdr->cr_length) * ss); g_free(buf); /* Read physical disk data. 
*/ buf = g_read_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss, GET32(meta, hdr->pdd_length) * ss, &error); if (buf == NULL) goto readerror; meta->pdd = malloc(GET32(meta, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->pdd, buf, GET32(meta, hdr->pdd_length) * ss); g_free(buf); if (GET32(meta, pdd->Signature) != DDF_PDD_SIGNATURE) goto hdrerror; i = ddf_meta_find_pd(meta, NULL, GET32(meta, pdd->PD_Reference)); if (i < 0) goto hdrerror; /* Read BBM Log. */ if (GET32(meta, hdr->bbmlog_section) != 0xffffffff && GET32(meta, hdr->bbmlog_length) != 0) { buf = g_read_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss, GET32(meta, hdr->bbmlog_length) * ss, &error); if (buf == NULL) goto readerror; meta->bbm = malloc(GET32(meta, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->bbm, buf, GET32(meta, hdr->bbmlog_length) * ss); g_free(buf); if (GET32(meta, bbm->Signature) != DDF_BBML_SIGNATURE) goto hdrerror; } done: g_free(abuf); if (error != 0) ddf_meta_free(meta); return (error); } static int ddf_meta_write(struct g_consumer *cp, struct ddf_meta *meta) { struct g_provider *pp; struct ddf_vdc_record *vdc; off_t alba, plba, slba, lba; u_int ss, size; int error, i, num; pp = cp->provider; ss = pp->sectorsize; lba = alba = pp->mediasize / ss - 1; plba = GET64(meta, hdr->Primary_Header_LBA); slba = GET64(meta, hdr->Secondary_Header_LBA); next: SET8(meta, hdr->Header_Type, (lba == alba) ? DDF_HEADER_ANCHOR : (lba == plba) ? 
DDF_HEADER_PRIMARY : DDF_HEADER_SECONDARY); SET32(meta, hdr->CRC, 0xffffffff); SET32(meta, hdr->CRC, crc32(meta->hdr, ss)); error = g_write_data(cp, lba * ss, meta->hdr, ss); if (error != 0) { err: G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); if (lba != alba) goto done; } if (lba == alba) { lba = plba; goto next; } size = GET32(meta, hdr->cd_length) * ss; SET32(meta, cdr->CRC, 0xffffffff); SET32(meta, cdr->CRC, crc32(meta->cdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss, meta->cdr, size); if (error != 0) goto err; size = GET32(meta, hdr->pdr_length) * ss; SET32(meta, pdr->CRC, 0xffffffff); SET32(meta, pdr->CRC, crc32(meta->pdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss, meta->pdr, size); if (error != 0) goto err; size = GET32(meta, hdr->vdr_length) * ss; SET32(meta, vdr->CRC, 0xffffffff); SET32(meta, vdr->CRC, crc32(meta->vdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss, meta->vdr, size); if (error != 0) goto err; size = GET16(meta, hdr->Configuration_Record_Length) * ss; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); SET32D(meta, vdc->CRC, 0xffffffff); SET32D(meta, vdc->CRC, crc32(vdc, size)); } error = g_write_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss, meta->cr, size * num); if (error != 0) goto err; size = GET32(meta, hdr->pdd_length) * ss; SET32(meta, pdd->CRC, 0xffffffff); SET32(meta, pdd->CRC, crc32(meta->pdd, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss, meta->pdd, size); if (error != 0) goto err; if (GET32(meta, hdr->bbmlog_length) != 0) { size = GET32(meta, hdr->bbmlog_length) * ss; SET32(meta, bbm->CRC, 0xffffffff); SET32(meta, bbm->CRC, crc32(meta->bbm, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss, meta->bbm, size); if (error != 0) goto err; } done: if (lba == plba && slba != -1) { lba = slba; goto next; } return (error); } 
static int ddf_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_DDF, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_DDF); return (error); } static struct g_raid_volume * g_raid_md_ddf_get_volume(struct g_raid_softc *sc, uint8_t *GUID) { struct g_raid_volume *vol; struct g_raid_md_ddf_pervolume *pv; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (memcmp(pv->pv_meta.vde->VD_GUID, GUID, 24) == 0) break; } return (vol); } static struct g_raid_disk * g_raid_md_ddf_get_disk(struct g_raid_softc *sc, uint8_t *GUID, uint32_t id) { struct g_raid_disk *disk; struct g_raid_md_ddf_perdisk *pd; struct ddf_meta *meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; meta = &pd->pd_meta; if (GUID != NULL) { if (memcmp(meta->pdd->PD_GUID, GUID, 24) == 0) break; } else { if (GET32(meta, pdd->PD_Reference) == id) break; } } return (disk); } static int g_raid_md_ddf_purge_volumes(struct g_raid_softc *sc) { struct g_raid_volume *vol, *tvol; struct g_raid_md_ddf_pervolume *pv; int i, res; res = 0; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { pv = vol->v_md_data; if (vol->v_stopping) continue; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) break; } if (i >= vol->v_disks_count) { g_raid_destroy_volume(vol); res = 1; } } return (res); } static int g_raid_md_ddf_purge_disks(struct g_raid_softc *sc) { #if 0 struct g_raid_disk *disk, *tdisk; struct g_raid_volume *vol; struct g_raid_md_ddf_perdisk *pd; int i, j, res; res = 0; TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_state == G_RAID_DISK_S_SPARE) continue; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; /* Scan for deleted 
volumes. */ for (i = 0; i < pd->pd_subdisks; ) { vol = g_raid_md_ddf_get_volume(sc, pd->pd_meta[i]->volume_id); if (vol != NULL && !vol->v_stopping) { i++; continue; } free(pd->pd_meta[i], M_MD_DDF); for (j = i; j < pd->pd_subdisks - 1; j++) pd->pd_meta[j] = pd->pd_meta[j + 1]; pd->pd_meta[DDF_MAX_SUBDISKS - 1] = NULL; pd->pd_subdisks--; pd->pd_updated = 1; } /* If there is no metadata left - erase and delete disk. */ if (pd->pd_subdisks == 0) { ddf_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); res = 1; } } return (res); #endif return (0); } static int g_raid_md_ddf_supported(int level, int qual, int disks, int force) { if (disks > DDF_MAX_DISKS_HARD) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks < 1) return (0); if (!force && disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (qual == G_RAID_VOLUME_RLQ_R1SM) { if (!force && disks != 2) return (0); } else if (qual == G_RAID_VOLUME_RLQ_R1MM) { if (!force && disks != 3) return (0); } else return (0); break; case G_RAID_VOLUME_RL_RAID3: if (qual != G_RAID_VOLUME_RLQ_R3P0 && qual != G_RAID_VOLUME_RLQ_R3PN) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID4: if (qual != G_RAID_VOLUME_RLQ_R4P0 && qual != G_RAID_VOLUME_RLQ_R4PN) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (qual != G_RAID_VOLUME_RLQ_R5RA && qual != G_RAID_VOLUME_RLQ_R5RS && qual != G_RAID_VOLUME_RLQ_R5LA && qual != G_RAID_VOLUME_RLQ_R5LS) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID6: if (qual != G_RAID_VOLUME_RLQ_R6RA && qual != G_RAID_VOLUME_RLQ_R6RS && qual != G_RAID_VOLUME_RLQ_R6LA && qual != G_RAID_VOLUME_RLQ_R6LS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAIDMDF: if (qual != G_RAID_VOLUME_RLQ_RMDFRA && qual != G_RAID_VOLUME_RLQ_RMDFRS && qual != G_RAID_VOLUME_RLQ_RMDFLA && qual != G_RAID_VOLUME_RLQ_RMDFLS) return (0); if 
(disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (qual != G_RAID_VOLUME_RLQ_R1EA && qual != G_RAID_VOLUME_RLQ_R1EO) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5E: if (qual != G_RAID_VOLUME_RLQ_R5ERA && qual != G_RAID_VOLUME_RLQ_R5ERS && qual != G_RAID_VOLUME_RLQ_R5ELA && qual != G_RAID_VOLUME_RLQ_R5ELS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID5EE: if (qual != G_RAID_VOLUME_RLQ_R5EERA && qual != G_RAID_VOLUME_RLQ_R5EERS && qual != G_RAID_VOLUME_RLQ_R5EELA && qual != G_RAID_VOLUME_RLQ_R5EELS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID5R: if (qual != G_RAID_VOLUME_RLQ_R5RRA && qual != G_RAID_VOLUME_RLQ_R5RRS && qual != G_RAID_VOLUME_RLQ_R5RLA && qual != G_RAID_VOLUME_RLQ_R5RLS) return (0); if (disks < 3) return (0); break; default: return (0); } return (1); } static int g_raid_md_ddf_start_disk(struct g_raid_disk *disk, struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_vol_meta *vmeta; struct ddf_meta *pdmeta, *gmeta; struct ddf_vdc_record *vdc1; struct ddf_sa_record *sa; off_t size, eoff = 0, esize = 0; uint64_t *val2; int disk_pos, md_disk_bvd = -1, md_disk_pos = -1, md_pde_pos; int i, resurrection = 0; uint32_t reference; sc = disk->d_softc; mdi = (struct g_raid_md_ddf_object *)sc->sc_md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; pdmeta = &pd->pd_meta; reference = GET32(&pd->pd_meta, pdd->PD_Reference); pv = vol->v_md_data; vmeta = &pv->pv_meta; gmeta = &mdi->mdio_meta; /* Find disk position in metadata by it's reference. 
*/ disk_pos = ddf_meta_find_disk(vmeta, reference, &md_disk_bvd, &md_disk_pos); md_pde_pos = ddf_meta_find_pd(gmeta, NULL, reference); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Disk %s is not a present part of the volume %s", g_raid_get_diskname(disk), vol->v_name); /* Failed stale disk is useless for us. */ if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) != 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If disk has some metadata for this volume - erase. */ if ((vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) SET32D(pdmeta, vdc1->Signature, 0xffffffff); /* If we are in the start process, that's all for now. */ if (!pv->pv_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >= GET16(&pd->pd_meta, hdr->Max_Partitions)) { G_RAID_DEBUG1(1, sc, "No free partitions on disk %s", g_raid_get_diskname(disk)); goto nofit; } ddf_meta_unused_range(&pd->pd_meta, &eoff, &esize); if (esize == 0) { G_RAID_DEBUG1(1, sc, "No free space on disk %s", g_raid_get_diskname(disk)); goto nofit; } eoff *= pd->pd_meta.sectorsize; esize *= pd->pd_meta.sectorsize; size = INT64_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_NONE) size = sd->sd_size; if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && (disk_pos < 0 || vol->v_subdisks[i].sd_state < sd->sd_state)) disk_pos = i; } if (disk_pos >= 0 && vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && esize < size) { G_RAID_DEBUG1(1, sc, "Disk %s free space " "is too small (%ju < %ju)", g_raid_get_diskname(disk), esize, size); disk_pos = -1; } if (disk_pos >= 0) { if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) esize = size; md_disk_bvd = disk_pos / GET16(vmeta, vdc->Primary_Element_Count); // XXX md_disk_pos = disk_pos % GET16(vmeta, vdc->Primary_Element_Count); // XXX } else { nofit: if 
(disk->d_state == G_RAID_DISK_S_NONE) g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } /* * If spare is committable, delete spare record. * Othersize, mark it active and leave there. */ sa = ddf_meta_find_sa(&pd->pd_meta, 0); if (sa != NULL) { if ((GET8D(&pd->pd_meta, sa->Spare_Type) & DDF_SAR_TYPE_REVERTIBLE) == 0) { SET32D(&pd->pd_meta, sa->Signature, 0xffffffff); } else { SET8D(&pd->pd_meta, sa->Spare_Type, GET8D(&pd->pd_meta, sa->Spare_Type) | DDF_SAR_TYPE_ACTIVE); } } G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", g_raid_get_diskname(disk), disk_pos, vol->v_name); resurrection = 1; } sd = &vol->v_subdisks[disk_pos]; if (resurrection && sd->sd_disk != NULL) { g_raid_change_disk_state(sd->sd_disk, G_RAID_DISK_S_STALE_FAILED); TAILQ_REMOVE(&sd->sd_disk->d_subdisks, sd, sd_next); } vol->v_subdisks[disk_pos].sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (GET8(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (resurrection) { sd->sd_offset = eoff; sd->sd_size = esize; } else if (pdmeta->cr != NULL && (vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) { val2 = (uint64_t *)&(vdc1->Physical_Disk_Sequence[GET16(vmeta, hdr->Max_Primary_Element_Entries)]); sd->sd_offset = (off_t)GET64P(pdmeta, val2 + md_disk_pos) * 512; sd->sd_size = (off_t)GET64D(pdmeta, vdc1->Block_Count) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (GET8(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) { /* Failed disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if ((GET8(gmeta, pdr->entry[md_pde_pos].PD_State) & (DDF_PDE_FAILED | DDF_PDE_REBUILD)) != 0) { /* Rebuilding disk. 
*/ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); sd->sd_rebuild_pos = 0; } else if ((GET8(vmeta, vde->VD_State) & DDF_VDE_DIRTY) != 0 || (GET8(vmeta, vde->Init_State) & DDF_VDE_INIT_MASK) != DDF_VDE_INIT_FULL) { /* Stale disk or dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); return (resurrection); } static void g_raid_md_ddf_refill(struct g_raid_softc *sc) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; int update, updated, i, bad; md = sc->sc_md; restart: updated = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; /* Search for subdisk that needs replacement. */ bad = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) bad = 1; } if (!bad) continue; G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " "trying to refill.", vol->v_name); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { /* Skip failed. */ if (disk->d_state < G_RAID_DISK_S_SPARE) continue; /* Skip already used by this volume. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == disk) break; } if (i < vol->v_disks_count) continue; /* Try to use disk if it has empty extents. 
*/ pd = disk->d_md_data; if (ddf_meta_count_vdc(&pd->pd_meta, NULL) < GET16(&pd->pd_meta, hdr->Max_Partitions)) { update = g_raid_md_ddf_start_disk(disk, vol); } else update = 0; if (update) { updated = 1; g_raid_md_write_ddf(md, vol, NULL, disk); break; } } } if (updated) goto restart; } static void g_raid_md_ddf_start(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; uint64_t *val2; int i, j, bvd; sc = vol->v_softc; md = sc->sc_md; mdi = (struct g_raid_md_ddf_object *)md; pv = vol->v_md_data; vmeta = &pv->pv_meta; vdc = vmeta->vdc; vol->v_raid_level = GET8(vmeta, vdc->Primary_RAID_Level); vol->v_raid_level_qualifier = GET8(vmeta, vdc->RLQ); if (GET8(vmeta, vdc->Secondary_Element_Count) > 1 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 && GET8(vmeta, vdc->Secondary_RAID_Level) == 0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; vol->v_sectorsize = GET16(vmeta, vdc->Block_Size); if (vol->v_sectorsize == 0xffff) vol->v_sectorsize = vmeta->sectorsize; vol->v_strip_size = vol->v_sectorsize << GET8(vmeta, vdc->Stripe_Size); vol->v_disks_count = GET16(vmeta, vdc->Primary_Element_Count) * GET8(vmeta, vdc->Secondary_Element_Count); vol->v_mdf_pdisks = GET8(vmeta, vdc->MDF_Parity_Disks); vol->v_mdf_polynomial = GET16(vmeta, vdc->MDF_Parity_Generator_Polynomial); vol->v_mdf_method = GET8(vmeta, vdc->MDF_Constant_Generation_Method); if (GET8(vmeta, vdc->Rotate_Parity_count) > 31) vol->v_rotate_parity = 1; else vol->v_rotate_parity = 1 << GET8(vmeta, vdc->Rotate_Parity_count); vol->v_mediasize = GET64(vmeta, vdc->VD_Size) * vol->v_sectorsize; for (i = 0, j = 0, bvd = 0; i < vol->v_disks_count; i++, j++) { if (j == GET16(vmeta, vdc->Primary_Element_Count)) { j = 0; bvd++; } sd = &vol->v_subdisks[i]; if (vmeta->bvdc[bvd] == NULL) { 
sd->sd_offset = 0; sd->sd_size = GET64(vmeta, vdc->Block_Count) * vol->v_sectorsize; continue; } val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[ GET16(vmeta, hdr->Max_Primary_Element_Entries)]); sd->sd_offset = GET64P(vmeta, val2 + j) * vol->v_sectorsize; sd->sd_size = GET64(vmeta, bvdc[bvd]->Block_Count) * vol->v_sectorsize; } g_raid_start_volume(vol); /* Make all disks found till the moment take their places. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (ddf_meta_find_vdc(&pd->pd_meta, vmeta->vdc->VD_GUID) != NULL) g_raid_md_ddf_start_disk(disk, vol); } pv->pv_started = 1; mdi->mdio_starting--; callout_stop(&pv->pv_start_co); G_RAID_DEBUG1(0, sc, "Volume started."); g_raid_md_write_ddf(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_ddf_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } static void g_raid_ddf_go(void *arg) { struct g_raid_volume *vol; struct g_raid_softc *sc; struct g_raid_md_ddf_pervolume *pv; vol = arg; pv = vol->v_md_data; sc = vol->v_softc; if (!pv->pv_started) { G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, G_RAID_EVENT_VOLUME); } } static void g_raid_md_ddf_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct g_raid_volume *vol; struct ddf_meta *pdmeta; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; struct ddf_vd_entry *vde; int i, j, k, num, have, need, cnt, spare; uint32_t val; char buf[17]; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_ddf_object *)md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; pdmeta = &pd->pd_meta; spare = -1; if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, pdmeta); else ddf_meta_update(&mdi->mdio_meta, pdmeta); 
num = GETCRNUM(pdmeta); for (j = 0; j < num; j++) { vdc = GETVDCPTR(pdmeta, j); val = GET32D(pdmeta, vdc->Signature); if (val == DDF_SA_SIGNATURE && spare == -1) spare = 1; if (val != DDF_VDCR_SIGNATURE) continue; spare = 0; k = ddf_meta_find_vd(pdmeta, vdc->VD_GUID); if (k < 0) continue; vde = &pdmeta->vdr->entry[k]; /* Look for volume with matching ID. */ vol = g_raid_md_ddf_get_volume(sc, vdc->VD_GUID); if (vol == NULL) { ddf_meta_get_name(pdmeta, k, buf); vol = g_raid_create_volume(sc, buf, GET16D(pdmeta, vde->VD_Number)); pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO); vol->v_md_data = pv; callout_init(&pv->pv_start_co, 1); callout_reset(&pv->pv_start_co, g_raid_start_timeout * hz, g_raid_ddf_go, vol); mdi->mdio_starting++; } else pv = vol->v_md_data; /* If we haven't started yet - check metadata freshness. */ vmeta = &pv->pv_meta; ddf_vol_meta_update(vmeta, pdmeta, vdc->VD_GUID, pv->pv_started); } if (spare == 1) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); g_raid_md_ddf_refill(sc); } TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; vmeta = &pv->pv_meta; if (ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID) == NULL) continue; if (pv->pv_started) { if (g_raid_md_ddf_start_disk(disk, vol)) g_raid_md_write_ddf(md, vol, NULL, NULL); continue; } /* If we collected all needed disks - start array. 
*/ need = 0; have = 0; for (k = 0; k < GET8(vmeta, vdc->Secondary_Element_Count); k++) { if (vmeta->bvdc[k] == NULL) { need += GET16(vmeta, vdc->Primary_Element_Count); continue; } cnt = GET16(vmeta, bvdc[k]->Primary_Element_Count); need += cnt; for (i = 0; i < cnt; i++) { val = GET32(vmeta, bvdc[k]->Physical_Disk_Sequence[i]); if (g_raid_md_ddf_get_disk(sc, NULL, val) != NULL) have++; } } G_RAID_DEBUG1(1, sc, "Volume %s now has %d of %d disks", vol->v_name, have, need); if (have == need) g_raid_md_ddf_start(vol); } } static int g_raid_md_create_req_ddf(struct g_raid_md_object *md, struct g_class *mp, struct gctl_req *req, struct g_geom **gp) { struct g_geom *geom; struct g_raid_softc *sc; struct g_raid_md_ddf_object *mdi, *mdi1; char name[16]; const char *fmtopt; int be = 1; mdi = (struct g_raid_md_ddf_object *)md; fmtopt = gctl_get_asciiparam(req, "fmtopt"); if (fmtopt == NULL || strcasecmp(fmtopt, "BE") == 0) be = 1; else if (strcasecmp(fmtopt, "LE") == 0) be = 0; else { gctl_error(req, "Incorrect fmtopt argument."); return (G_RAID_MD_TASTE_FAIL); } /* Search for existing node. */ LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_ddf_object *)sc->sc_md; if (mdi1->mdio_bigendian != be) continue; break; } if (geom != NULL) { *gp = geom; return (G_RAID_MD_TASTE_EXISTING); } /* Create new one if not found. */ mdi->mdio_bigendian = be; snprintf(name, sizeof(name), "DDF%s", be ? 
"" : "-LE"); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_ddf(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_softc *sc; struct g_raid_disk *disk; struct ddf_meta meta; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_object *mdi; struct g_geom *geom; - int error, result, len, be; + int error, result, be; char name[16]; G_RAID_DEBUG(1, "Tasting DDF on %s", cp->provider->name); mdi = (struct g_raid_md_ddf_object *)md; pp = cp->provider; /* Read metadata from device. */ if (g_access(cp, 1, 0, 0) != 0) return (G_RAID_MD_TASTE_FAIL); g_topology_unlock(); bzero(&meta, sizeof(meta)); error = ddf_meta_read(cp, &meta); g_topology_lock(); g_access(cp, -1, 0, 0); if (error != 0) return (G_RAID_MD_TASTE_FAIL); be = meta.bigendian; /* Metadata valid. Print it. */ g_raid_md_ddf_print(&meta); /* Search for matching node. */ sc = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi = (struct g_raid_md_ddf_object *)sc->sc_md; if (mdi->mdio_bigendian != be) continue; break; } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_bigendian = be; snprintf(name, sizeof(name), "DDF%s", be ? 
"" : "-LE"); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; } rcp = g_new_consumer(geom); g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); pd->pd_meta = meta; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; - /* Read kernel dumping information. */ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", - rcp->provider->name, error); + g_raid_get_disk_info(disk); g_raid_md_ddf_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); } static int g_raid_md_event_ddf(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = md->mdo_softc; if (disk == NULL) return (-1); switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* Delete disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); g_raid_md_ddf_purge_volumes(sc); /* Write updated metadata to all disks. */ g_raid_md_write_ddf(md, NULL, NULL, NULL); /* Check if anything left. 
*/ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_ddf_refill(sc); return (0); } return (-2); } static int g_raid_md_volume_event_ddf(struct g_raid_md_object *md, struct g_raid_volume *vol, u_int event) { struct g_raid_md_ddf_pervolume *pv; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; switch (event) { case G_RAID_VOLUME_E_STARTMD: if (!pv->pv_started) g_raid_md_ddf_start(vol); return (0); } return (-2); } static int g_raid_md_ctl_ddf(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *disks[DDF_MAX_DISKS_HARD]; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_sa_record *sa; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t size, sectorsize, strip, offs[DDF_MAX_DISKS_HARD], esize; intmax_t *sizearg, *striparg; int i, numdisks, len, level, qual; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_ddf_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_ddf_supported(level, qual, numdisks, force ? 
*force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = INT64_MAX; sectorsize = 0; bzero(disks, sizeof(disks)); bzero(offs, sizeof(offs)); for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) continue; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk != NULL) { if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' is in a " "wrong state (%s).", diskname, g_raid_disk_state2str(disk->d_state)); error = -7; break; } pd = disk->d_md_data; if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >= GET16(&pd->pd_meta, hdr->Max_Partitions)) { gctl_error(req, "No free partitions " "on disk '%s'.", diskname); error = -7; break; } pp = disk->d_consumer->provider; disks[i] = disk; ddf_meta_unused_range(&pd->pd_meta, &offs[i], &esize); offs[i] *= pp->sectorsize; size = MIN(size, (off_t)esize * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); continue; } g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -8; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; disks[i] = disk; cp->private = disk; ddf_meta_create(disk, &mdi->mdio_meta); if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta); else ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta); g_topology_unlock(); - /* Read kernel dumping information. 
*/ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); /* Reserve some space for metadata. */ size = MIN(size, GET64(&pd->pd_meta, pdr->entry[0].Configured_Size) * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); } if (error != 0) { for (i = 0; i < numdisks; i++) { if (disks[i] != NULL && disks[i]->d_state == G_RAID_DISK_S_NONE) g_raid_destroy_disk(disks[i]); } return (error); } if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1 || level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_SINGLE || level == G_RAID_VOLUME_RL_CONCAT) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... 
*/ pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO); ddf_vol_meta_create(&pv->pv_meta, &mdi->mdio_meta); pv->pv_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_RAID4 || level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else if (level == G_RAID_VOLUME_RL_RAID5R) { vol->v_mediasize = size * (numdisks - 1); vol->v_rotate_parity = 1024; } else if (level == G_RAID_VOLUME_RL_RAID6 || level == G_RAID_VOLUME_RL_RAID5E || level == G_RAID_VOLUME_RL_RAID5EE) vol->v_mediasize = size * (numdisks - 2); else if (level == G_RAID_VOLUME_RL_RAIDMDF) { if (numdisks < 5) vol->v_mdf_pdisks = 2; else vol->v_mdf_pdisks = 3; vol->v_mdf_polynomial = 0x11d; vol->v_mdf_method = 0x00; vol->v_mediasize = size * (numdisks - vol->v_mdf_pdisks); } else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = disks[i]; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = offs[i]; sd->sd_size = size; if (disk == NULL) continue; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_ddf(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. 
*/ g_raid_md_ddf_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { gctl_error(req, "`add` command is not applicable, " "use `label` instead."); return (-99); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) ddf_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. 
*/ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_ddf_purge_disks(sc); g_raid_md_write_ddf(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) ddf_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_ddf(md, NULL, disk); continue; } /* Erase metadata on deleting disk and destroy it. */ ddf_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); } g_raid_md_ddf_purge_volumes(sc); /* Write updated metadata to remaining disks. */ g_raid_md_write_ddf(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_ddf_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. 
*/ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; - /* Read kernel dumping information. */ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); ddf_meta_create(disk, &mdi->mdio_meta); sa = ddf_meta_find_sa(&pd->pd_meta, 1); if (sa != NULL) { SET32D(&pd->pd_meta, sa->Signature, DDF_SA_SIGNATURE); SET8D(&pd->pd_meta, sa->Spare_Type, 0); SET16D(&pd->pd_meta, sa->Populated_SAEs, 0); SET16D(&pd->pd_meta, sa->MAX_SAE_Supported, (GET16(&pd->pd_meta, hdr->Configuration_Record_Length) * pd->pd_meta.sectorsize - sizeof(struct ddf_sa_record)) / sizeof(struct ddf_sa_entry)); } if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta); else ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta); g_raid_md_write_ddf(md, NULL, NULL, NULL); g_raid_md_ddf_refill(sc); } return (error); } return (-100); } static int g_raid_md_write_ddf(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_meta *gmeta; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; struct ddf_sa_record *sa; uint64_t *val2; int i, j, pos, bvd, size; sc = md->mdo_softc; mdi = (struct 
g_raid_md_ddf_object *)md; gmeta = &mdi->mdio_meta; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* * Clear disk flags to let only really needed ones to be reset. * Do it only if there are no volumes in starting state now, * as they can update disk statuses yet and we may kill innocent. */ if (mdi->mdio_starting == 0) { for (i = 0; i < GET16(gmeta, pdr->Populated_PDEs); i++) { if (isff(gmeta->pdr->entry[i].PD_GUID, 24)) continue; SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) & ~(DDF_PDE_PARTICIPATING | DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)); if ((GET16(gmeta, pdr->entry[i].PD_State) & DDF_PDE_PFA) == 0) SET16(gmeta, pdr->entry[i].PD_State, 0); } } /* Generate/update new per-volume metadata. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; if (vol->v_stopping || !pv->pv_started) continue; vmeta = &pv->pv_meta; SET32(vmeta, vdc->Sequence_Number, GET32(vmeta, vdc->Sequence_Number) + 1); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E && vol->v_disks_count % 2 == 0) SET16(vmeta, vdc->Primary_Element_Count, 2); else SET16(vmeta, vdc->Primary_Element_Count, vol->v_disks_count); SET8(vmeta, vdc->Stripe_Size, ffs(vol->v_strip_size / vol->v_sectorsize) - 1); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E && vol->v_disks_count % 2 == 0) { SET8(vmeta, vdc->Primary_RAID_Level, DDF_VDCR_RAID1); SET8(vmeta, vdc->RLQ, 0); SET8(vmeta, vdc->Secondary_Element_Count, vol->v_disks_count / 2); SET8(vmeta, vdc->Secondary_RAID_Level, 0); } else { SET8(vmeta, vdc->Primary_RAID_Level, vol->v_raid_level); SET8(vmeta, vdc->RLQ, vol->v_raid_level_qualifier); SET8(vmeta, vdc->Secondary_Element_Count, 1); SET8(vmeta, vdc->Secondary_RAID_Level, 0); } SET8(vmeta, vdc->Secondary_Element_Seq, 0); SET64(vmeta, vdc->Block_Count, 0); SET64(vmeta, vdc->VD_Size, vol->v_mediasize / vol->v_sectorsize); SET16(vmeta, vdc->Block_Size, vol->v_sectorsize); SET8(vmeta, vdc->Rotate_Parity_count, 
fls(vol->v_rotate_parity) - 1); SET8(vmeta, vdc->MDF_Parity_Disks, vol->v_mdf_pdisks); SET16(vmeta, vdc->MDF_Parity_Generator_Polynomial, vol->v_mdf_polynomial); SET8(vmeta, vdc->MDF_Constant_Generation_Method, vol->v_mdf_method); SET16(vmeta, vde->VD_Number, vol->v_global_id); if (vol->v_state <= G_RAID_VOLUME_S_BROKEN) SET8(vmeta, vde->VD_State, DDF_VDE_FAILED); else if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) SET8(vmeta, vde->VD_State, DDF_VDE_DEGRADED); else if (vol->v_state <= G_RAID_VOLUME_S_SUBOPTIMAL) SET8(vmeta, vde->VD_State, DDF_VDE_PARTIAL); else SET8(vmeta, vde->VD_State, DDF_VDE_OPTIMAL); if (vol->v_dirty || g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) > 0 || g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) > 0) SET8(vmeta, vde->VD_State, GET8(vmeta, vde->VD_State) | DDF_VDE_DIRTY); SET8(vmeta, vde->Init_State, DDF_VDE_INIT_FULL); // XXX ddf_meta_put_name(vmeta, vol->v_name); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; bvd = i / GET16(vmeta, vdc->Primary_Element_Count); pos = i % GET16(vmeta, vdc->Primary_Element_Count); disk = sd->sd_disk; if (disk != NULL) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (vmeta->bvdc[bvd] == NULL) { size = GET16(vmeta, hdr->Configuration_Record_Length) * vmeta->sectorsize; vmeta->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK); memset(vmeta->bvdc[bvd], 0xff, size); } memcpy(vmeta->bvdc[bvd], vmeta->vdc, sizeof(struct ddf_vdc_record)); SET8(vmeta, bvdc[bvd]->Secondary_Element_Seq, bvd); SET64(vmeta, bvdc[bvd]->Block_Count, sd->sd_size / vol->v_sectorsize); SET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos], GET32(&pd->pd_meta, pdd->PD_Reference)); val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[ GET16(vmeta, hdr->Max_Primary_Element_Entries)]); SET64P(vmeta, val2 + pos, sd->sd_offset / vol->v_sectorsize); } if (vmeta->bvdc[bvd] == NULL) continue; j = ddf_meta_find_pd(gmeta, NULL, GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos])); if (j < 0) continue; SET32(gmeta, 
pdr->entry[j].PD_Type, GET32(gmeta, pdr->entry[j].PD_Type) | DDF_PDE_PARTICIPATING); if (sd->sd_state == G_RAID_SUBDISK_S_NONE) SET32(gmeta, pdr->entry[j].PD_State, GET32(gmeta, pdr->entry[j].PD_State) | (DDF_PDE_FAILED | DDF_PDE_MISSING)); else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) SET32(gmeta, pdr->entry[j].PD_State, GET32(gmeta, pdr->entry[j].PD_State) | (DDF_PDE_FAILED | DDF_PDE_PFA)); else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) SET32(gmeta, pdr->entry[j].PD_State, GET32(gmeta, pdr->entry[j].PD_State) | DDF_PDE_REBUILD); else SET32(gmeta, pdr->entry[j].PD_State, GET32(gmeta, pdr->entry[j].PD_State) | DDF_PDE_ONLINE); } } /* Mark spare and failed disks as such. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; i = ddf_meta_find_pd(gmeta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference)); if (i < 0) continue; if (disk->d_state == G_RAID_DISK_S_FAILED) { SET32(gmeta, pdr->entry[i].PD_State, GET32(gmeta, pdr->entry[i].PD_State) | (DDF_PDE_FAILED | DDF_PDE_PFA)); } if (disk->d_state != G_RAID_DISK_S_SPARE) continue; sa = ddf_meta_find_sa(&pd->pd_meta, 0); if (sa == NULL || (GET8D(&pd->pd_meta, sa->Spare_Type) & DDF_SAR_TYPE_DEDICATED) == 0) { SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) | DDF_PDE_GLOBAL_SPARE); } else { SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) | DDF_PDE_CONFIG_SPARE); } SET32(gmeta, pdr->entry[i].PD_State, GET32(gmeta, pdr->entry[i].PD_State) | DDF_PDE_ONLINE); } /* Remove disks without "participating" flag (unused). 
*/ for (i = 0, j = -1; i < GET16(gmeta, pdr->Populated_PDEs); i++) { if (isff(gmeta->pdr->entry[i].PD_GUID, 24)) continue; if ((GET16(gmeta, pdr->entry[i].PD_Type) & (DDF_PDE_PARTICIPATING | DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)) != 0 || g_raid_md_ddf_get_disk(sc, NULL, GET32(gmeta, pdr->entry[i].PD_Reference)) != NULL) j = i; else memset(&gmeta->pdr->entry[i], 0xff, sizeof(struct ddf_pd_entry)); } SET16(gmeta, pdr->Populated_PDEs, j + 1); /* Update per-disk metadata and write them. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; /* Update PDR. */ memcpy(pd->pd_meta.pdr, gmeta->pdr, GET32(&pd->pd_meta, hdr->pdr_length) * pd->pd_meta.sectorsize); /* Update VDR. */ SET16(&pd->pd_meta, vdr->Populated_VDEs, 0); TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; i = ddf_meta_find_vd(&pd->pd_meta, pv->pv_meta.vde->VD_GUID); if (i < 0) i = ddf_meta_find_vd(&pd->pd_meta, NULL); if (i >= 0) memcpy(&pd->pd_meta.vdr->entry[i], pv->pv_meta.vde, sizeof(struct ddf_vd_entry)); } /* Update VDC. */ if (mdi->mdio_starting == 0) { /* Remove all VDCs to restore needed later. 
*/ j = GETCRNUM(&pd->pd_meta); for (i = 0; i < j; i++) { vdc = GETVDCPTR(&pd->pd_meta, i); if (GET32D(&pd->pd_meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; SET32D(&pd->pd_meta, vdc->Signature, 0xffffffff); } } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { vol = sd->sd_volume; if (vol->v_stopping) continue; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; vmeta = &pv->pv_meta; vdc = ddf_meta_find_vdc(&pd->pd_meta, vmeta->vde->VD_GUID); if (vdc == NULL) vdc = ddf_meta_find_vdc(&pd->pd_meta, NULL); if (vdc != NULL) { bvd = sd->sd_pos / GET16(vmeta, vdc->Primary_Element_Count); memcpy(vdc, vmeta->bvdc[bvd], GET16(&pd->pd_meta, hdr->Configuration_Record_Length) * pd->pd_meta.sectorsize); } } G_RAID_DEBUG(1, "Writing DDF metadata to %s", g_raid_get_diskname(disk)); g_raid_md_ddf_print(&pd->pd_meta); ddf_meta_write(disk->d_consumer, &pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_ddf(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_ddf_perdisk *pd; struct g_raid_subdisk *sd; int i; sc = md->mdo_softc; pd = (struct g_raid_md_ddf_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (tdisk->d_state != G_RAID_DISK_S_ACTIVE) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ G_RAID_DEBUG(1, "Writing DDF metadata to %s", g_raid_get_diskname(tdisk)); i = ddf_meta_find_pd(&pd->pd_meta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference)); SET16(&pd->pd_meta, pdr->entry[i].PD_State, DDF_PDE_FAILED | DDF_PDE_PFA); if (tdisk->d_consumer != NULL) ddf_meta_write(tdisk->d_consumer, &pd->pd_meta); /* Change states. 
*/ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_ddf(md, NULL, NULL, tdisk); g_raid_md_ddf_refill(sc); return (0); } static int g_raid_md_free_disk_ddf(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_ddf_perdisk *pd; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; ddf_meta_free(&pd->pd_meta); free(pd, M_MD_DDF); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_ddf(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_ddf_object *mdi; struct g_raid_md_ddf_pervolume *pv; mdi = (struct g_raid_md_ddf_object *)md; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; ddf_vol_meta_free(&pv->pv_meta); if (!pv->pv_started) { pv->pv_started = 1; mdi->mdio_starting--; callout_stop(&pv->pv_start_co); } free(pv, M_MD_DDF); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_ddf(struct g_raid_md_object *md) { struct g_raid_md_ddf_object *mdi; mdi = (struct g_raid_md_ddf_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } ddf_meta_free(&mdi->mdio_meta); return (0); } G_RAID_MD_DECLARE(ddf, "DDF"); Index: stable/8/sys/geom/raid/md_intel.c =================================================================== --- stable/8/sys/geom/raid/md_intel.c (revision 243678) +++ stable/8/sys/geom/raid/md_intel.c (revision 243679) @@ -1,2449 +1,2426 @@ /*- * Copyright (c) 2010 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata"); struct intel_raid_map { uint32_t offset; uint32_t disk_sectors; uint32_t stripe_count; uint16_t strip_sectors; uint8_t status; #define INTEL_S_READY 0x00 #define INTEL_S_UNINITIALIZED 0x01 #define INTEL_S_DEGRADED 0x02 #define INTEL_S_FAILURE 0x03 uint8_t type; #define INTEL_T_RAID0 0x00 #define INTEL_T_RAID1 0x01 #define INTEL_T_RAID5 0x05 uint8_t total_disks; uint8_t total_domains; uint8_t failed_disk_num; uint8_t ddf; uint32_t offset_hi; uint32_t disk_sectors_hi; uint32_t stripe_count_hi; uint32_t filler_2[4]; uint32_t disk_idx[1]; /* total_disks entries. */ #define INTEL_DI_IDX 0x00ffffff #define INTEL_DI_RBLD 0x01000000 } __packed; struct intel_raid_vol { uint8_t name[16]; u_int64_t total_sectors __packed; uint32_t state; #define INTEL_ST_BOOTABLE 0x00000001 #define INTEL_ST_BOOT_DEVICE 0x00000002 #define INTEL_ST_READ_COALESCING 0x00000004 #define INTEL_ST_WRITE_COALESCING 0x00000008 #define INTEL_ST_LAST_SHUTDOWN_DIRTY 0x00000010 #define INTEL_ST_HIDDEN_AT_BOOT 0x00000020 #define INTEL_ST_CURRENTLY_HIDDEN 0x00000040 #define INTEL_ST_VERIFY_AND_FIX 0x00000080 #define INTEL_ST_MAP_STATE_UNINIT 0x00000100 #define INTEL_ST_NO_AUTO_RECOVERY 0x00000200 #define INTEL_ST_CLONE_N_GO 0x00000400 #define INTEL_ST_CLONE_MAN_SYNC 0x00000800 #define INTEL_ST_CNG_MASTER_DISK_NUM 0x00001000 uint32_t reserved; uint8_t migr_priority; uint8_t num_sub_vols; uint8_t tid; uint8_t cng_master_disk; uint16_t cache_policy; uint8_t cng_state; uint8_t cng_sub_state; uint32_t filler_0[10]; uint32_t curr_migr_unit; uint32_t checkpoint_id; uint8_t migr_state; uint8_t migr_type; #define INTEL_MT_INIT 0 #define INTEL_MT_REBUILD 1 #define INTEL_MT_VERIFY 2 #define INTEL_MT_GEN_MIGR 3 #define INTEL_MT_STATE_CHANGE 4 
#define INTEL_MT_REPAIR 5 uint8_t dirty; uint8_t fs_state; uint16_t verify_errors; uint16_t bad_blocks; uint32_t curr_migr_unit_hi; uint32_t filler_1[3]; struct intel_raid_map map[1]; /* 2 entries if migr_state != 0. */ } __packed; struct intel_raid_disk { #define INTEL_SERIAL_LEN 16 uint8_t serial[INTEL_SERIAL_LEN]; uint32_t sectors; uint32_t id; uint32_t flags; #define INTEL_F_SPARE 0x01 #define INTEL_F_ASSIGNED 0x02 #define INTEL_F_FAILED 0x04 #define INTEL_F_ONLINE 0x08 uint32_t owner_cfg_num; uint32_t sectors_hi; uint32_t filler[3]; } __packed; struct intel_raid_conf { uint8_t intel_id[24]; #define INTEL_MAGIC "Intel Raid ISM Cfg Sig. " uint8_t version[6]; #define INTEL_VERSION_1000 "1.0.00" /* RAID0 */ #define INTEL_VERSION_1100 "1.1.00" /* RAID1 */ #define INTEL_VERSION_1200 "1.2.00" /* Many volumes */ #define INTEL_VERSION_1201 "1.2.01" /* 3 or 4 disks */ #define INTEL_VERSION_1202 "1.2.02" /* RAID5 */ #define INTEL_VERSION_1204 "1.2.04" /* 5 or 6 disks */ #define INTEL_VERSION_1206 "1.2.06" /* CNG */ #define INTEL_VERSION_1300 "1.3.00" /* Attributes */ uint8_t dummy_0[2]; uint32_t checksum; uint32_t config_size; uint32_t config_id; uint32_t generation; uint32_t error_log_size; uint32_t attributes; #define INTEL_ATTR_RAID0 0x00000001 #define INTEL_ATTR_RAID1 0x00000002 #define INTEL_ATTR_RAID10 0x00000004 #define INTEL_ATTR_RAID1E 0x00000008 #define INTEL_ATTR_RAID5 0x00000010 #define INTEL_ATTR_RAIDCNG 0x00000020 #define INTEL_ATTR_2TB 0x20000000 #define INTEL_ATTR_PM 0x40000000 #define INTEL_ATTR_CHECKSUM 0x80000000 uint8_t total_disks; uint8_t total_volumes; uint8_t dummy_2[2]; uint32_t filler_0[39]; struct intel_raid_disk disk[1]; /* total_disks entries. */ /* Here goes total_volumes of struct intel_raid_vol. 
*/ } __packed; #define INTEL_MAX_MD_SIZE(ndisks) \ (sizeof(struct intel_raid_conf) + \ sizeof(struct intel_raid_disk) * (ndisks - 1) + \ sizeof(struct intel_raid_vol) * 2 + \ sizeof(struct intel_raid_map) * 2 + \ sizeof(uint32_t) * (ndisks - 1) * 4) struct g_raid_md_intel_perdisk { struct intel_raid_conf *pd_meta; int pd_disk_pos; struct intel_raid_disk pd_disk_meta; }; struct g_raid_md_intel_object { struct g_raid_md_object mdio_base; uint32_t mdio_config_id; uint32_t mdio_generation; struct intel_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_intel; static g_raid_md_taste_t g_raid_md_taste_intel; static g_raid_md_event_t g_raid_md_event_intel; static g_raid_md_ctl_t g_raid_md_ctl_intel; static g_raid_md_write_t g_raid_md_write_intel; static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel; static g_raid_md_free_disk_t g_raid_md_free_disk_intel; static g_raid_md_free_t g_raid_md_free_intel; static kobj_method_t g_raid_md_intel_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_intel), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_intel), KOBJMETHOD(g_raid_md_event, g_raid_md_event_intel), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_intel), KOBJMETHOD(g_raid_md_write, g_raid_md_write_intel), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_intel), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_intel), KOBJMETHOD(g_raid_md_free, g_raid_md_free_intel), { 0, 0 } }; static struct g_raid_md_class g_raid_md_intel_class = { "Intel", g_raid_md_intel_methods, sizeof(struct g_raid_md_intel_object), .mdc_enable = 1, .mdc_priority = 100 }; static struct intel_raid_map * intel_get_map(struct intel_raid_vol *mvol, int i) { struct intel_raid_map *mmap; if (i > (mvol->migr_state ? 
1 : 0))
		return (NULL);
	/*
	 * Maps are variable-length: each one ends with total_disks entries
	 * of disk_idx[], so the next map starts right after that array.
	 */
	mmap = &mvol->map[0];
	for (; i > 0; i--) {
		mmap = (struct intel_raid_map *)
		    &mmap->disk_idx[mmap->total_disks];
	}
	return ((struct intel_raid_map *)mmap);
}

/*
 * Return a pointer to volume record number i inside the metadata, or NULL
 * when i is greater than 1.
 *
 * The first volume record starts right after the disk table
 * (meta->disk[meta->total_disks]).  Each following record starts right
 * after the disk_idx[] array of the previous volume's last map, which is
 * map 1 when migr_state is non-zero and map 0 otherwise.
 */
static struct intel_raid_vol *
intel_get_volume(struct intel_raid_conf *meta, int i)
{
	struct intel_raid_vol *mvol;
	struct intel_raid_map *mmap;

	if (i > 1)
		return (NULL);
	mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks];
	for (; i > 0; i--) {
		mmap = intel_get_map(mvol, mvol->migr_state ? 1 : 0);
		mvol = (struct intel_raid_vol *)
		    &mmap->disk_idx[mmap->total_disks];
	}
	return (mvol);
}

/*
 * The accessors below join or split values that the metadata stores as
 * two 32-bit fields: a low half and a matching "_hi" high half.
 */

/* Return the map offset assembled from offset_hi:offset. */
static off_t
intel_get_map_offset(struct intel_raid_map *mmap)
{
	off_t offset = (off_t)mmap->offset_hi << 32;

	offset += mmap->offset;
	return (offset);
}

/* Store offset into the offset (low 32 bits) and offset_hi fields. */
static void
intel_set_map_offset(struct intel_raid_map *mmap, off_t offset)
{

	mmap->offset = offset & 0xffffffff;
	mmap->offset_hi = offset >> 32;
}

/* Return the map disk_sectors value assembled from its two halves. */
static off_t
intel_get_map_disk_sectors(struct intel_raid_map *mmap)
{
	off_t disk_sectors = (off_t)mmap->disk_sectors_hi << 32;

	disk_sectors += mmap->disk_sectors;
	return (disk_sectors);
}

/* Store disk_sectors into the disk_sectors and disk_sectors_hi fields. */
static void
intel_set_map_disk_sectors(struct intel_raid_map *mmap, off_t disk_sectors)
{

	mmap->disk_sectors = disk_sectors & 0xffffffff;
	mmap->disk_sectors_hi = disk_sectors >> 32;
}

/* Store stripe_count into the stripe_count and stripe_count_hi fields. */
static void
intel_set_map_stripe_count(struct intel_raid_map *mmap, off_t stripe_count)
{

	mmap->stripe_count = stripe_count & 0xffffffff;
	mmap->stripe_count_hi = stripe_count >> 32;
}

/* Return the disk sector count assembled from sectors_hi:sectors. */
static off_t
intel_get_disk_sectors(struct intel_raid_disk *disk)
{
	off_t sectors = (off_t)disk->sectors_hi << 32;

	sectors += disk->sectors;
	return (sectors);
}

/* Store sectors into the sectors and sectors_hi fields of the disk. */
static void
intel_set_disk_sectors(struct intel_raid_disk *disk, off_t sectors)
{

	disk->sectors = sectors & 0xffffffff;
	disk->sectors_hi = sectors >> 32;
}

/* Return curr_migr_unit assembled from its two 32-bit halves. */
static off_t
intel_get_vol_curr_migr_unit(struct intel_raid_vol *vol)
{
	off_t curr_migr_unit = (off_t)vol->curr_migr_unit_hi << 32;

	curr_migr_unit += vol->curr_migr_unit;
	return (curr_migr_unit);
}

static void
intel_set_vol_curr_migr_unit(struct
intel_raid_vol *vol, off_t curr_migr_unit) { vol->curr_migr_unit = curr_migr_unit & 0xffffffff; vol->curr_migr_unit_hi = curr_migr_unit >> 32; } static void g_raid_md_intel_print(struct intel_raid_conf *meta) { struct intel_raid_vol *mvol; struct intel_raid_map *mmap; int i, j, k; if (g_raid_debug < 1) return; printf("********* ATA Intel MatrixRAID Metadata *********\n"); printf("intel_id <%.24s>\n", meta->intel_id); printf("version <%.6s>\n", meta->version); printf("checksum 0x%08x\n", meta->checksum); printf("config_size 0x%08x\n", meta->config_size); printf("config_id 0x%08x\n", meta->config_id); printf("generation 0x%08x\n", meta->generation); printf("attributes 0x%08x\n", meta->attributes); printf("total_disks %u\n", meta->total_disks); printf("total_volumes %u\n", meta->total_volumes); printf("DISK# serial disk_sectors disk_sectors_hi disk_id flags\n"); for (i = 0; i < meta->total_disks; i++ ) { printf(" %d <%.16s> %u %u 0x%08x 0x%08x\n", i, meta->disk[i].serial, meta->disk[i].sectors, meta->disk[i].sectors_hi, meta->disk[i].id, meta->disk[i].flags); } for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); printf(" ****** Volume %d ******\n", i); printf(" name %.16s\n", mvol->name); printf(" total_sectors %ju\n", mvol->total_sectors); printf(" state %u\n", mvol->state); printf(" reserved %u\n", mvol->reserved); printf(" curr_migr_unit %u\n", mvol->curr_migr_unit); printf(" curr_migr_unit_hi %u\n", mvol->curr_migr_unit_hi); printf(" checkpoint_id %u\n", mvol->checkpoint_id); printf(" migr_state %u\n", mvol->migr_state); printf(" migr_type %u\n", mvol->migr_type); printf(" dirty %u\n", mvol->dirty); for (j = 0; j < (mvol->migr_state ? 
2 : 1); j++) { printf(" *** Map %d ***\n", j); mmap = intel_get_map(mvol, j); printf(" offset %u\n", mmap->offset); printf(" offset_hi %u\n", mmap->offset_hi); printf(" disk_sectors %u\n", mmap->disk_sectors); printf(" disk_sectors_hi %u\n", mmap->disk_sectors_hi); printf(" stripe_count %u\n", mmap->stripe_count); printf(" stripe_count_hi %u\n", mmap->stripe_count_hi); printf(" strip_sectors %u\n", mmap->strip_sectors); printf(" status %u\n", mmap->status); printf(" type %u\n", mmap->type); printf(" total_disks %u\n", mmap->total_disks); printf(" total_domains %u\n", mmap->total_domains); printf(" failed_disk_num %u\n", mmap->failed_disk_num); printf(" ddf %u\n", mmap->ddf); printf(" disk_idx "); for (k = 0; k < mmap->total_disks; k++) printf(" 0x%08x", mmap->disk_idx[k]); printf("\n"); } } printf("=================================================\n"); } static struct intel_raid_conf * intel_meta_copy(struct intel_raid_conf *meta) { struct intel_raid_conf *nmeta; nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK); memcpy(nmeta, meta, meta->config_size); return (nmeta); } static int intel_meta_find_disk(struct intel_raid_conf *meta, char *serial) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (strncmp(meta->disk[pos].serial, serial, INTEL_SERIAL_LEN) == 0) return (pos); } return (-1); } static struct intel_raid_conf * intel_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap; char *buf; int error, i, j, k, left, size; uint32_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. 
*/ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct intel_raid_conf *)buf; /* Check if this is an Intel RAID struct */ if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) { G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name); g_free(buf); return (NULL); } if (meta->config_size > 65536 || meta->config_size < sizeof(struct intel_raid_conf)) { G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d", meta->config_size); g_free(buf); return (NULL); } size = meta->config_size; meta = malloc(size, M_MD_INTEL, M_WAITOK); memcpy(meta, buf, min(size, pp->sectorsize)); g_free(buf); /* Read all the rest, if needed. */ if (meta->config_size > pp->sectorsize) { left = (meta->config_size - 1) / pp->sectorsize; buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (2 + left), pp->sectorsize * left, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read remaining metadata" " part from %s (error=%d).", pp->name, error); free(meta, M_MD_INTEL); return (NULL); } memcpy(((char *)meta) + pp->sectorsize, buf, pp->sectorsize * left); g_free(buf); } /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } checksum -= meta->checksum; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name); free(meta, M_MD_INTEL); return (NULL); } /* Validate metadata size. 
*/ size = sizeof(struct intel_raid_conf) + sizeof(struct intel_raid_disk) * (meta->total_disks - 1) + sizeof(struct intel_raid_vol) * meta->total_volumes; if (size > meta->config_size) { badsize: G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d", meta->config_size, size); free(meta, M_MD_INTEL); return (NULL); } for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); size += 4 * (mmap->total_disks - 1); if (size > meta->config_size) goto badsize; if (mvol->migr_state) { size += sizeof(struct intel_raid_map); if (size > meta->config_size) goto badsize; mmap = intel_get_map(mvol, 1); size += 4 * (mmap->total_disks - 1); if (size > meta->config_size) goto badsize; } } /* Validate disk indexes. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) { mmap = intel_get_map(mvol, j); for (k = 0; k < mmap->total_disks; k++) { if ((mmap->disk_idx[k] & INTEL_DI_IDX) > meta->total_disks) { G_RAID_DEBUG(1, "Intel metadata disk" " index %d too big (>%d)", mmap->disk_idx[k] & INTEL_DI_IDX, meta->total_disks); free(meta, M_MD_INTEL); return (NULL); } } } } /* Validate migration types. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); if (mvol->migr_state && mvol->migr_type != INTEL_MT_INIT && mvol->migr_type != INTEL_MT_REBUILD && mvol->migr_type != INTEL_MT_VERIFY && mvol->migr_type != INTEL_MT_REPAIR) { G_RAID_DEBUG(1, "Intel metadata has unsupported" " migration type %d", mvol->migr_type); free(meta, M_MD_INTEL); return (NULL); } } return (meta); } static int intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i, sectors; uint32_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. 
*/ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } meta->checksum = checksum; /* Create and fill buffer. */ sectors = (meta->config_size + pp->sectorsize - 1) / pp->sectorsize; buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); if (sectors > 1) { memcpy(buf, ((char *)meta) + pp->sectorsize, (sectors - 1) * pp->sectorsize); } memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize); error = g_write_data(cp, pp->mediasize - pp->sectorsize * (1 + sectors), buf, pp->sectorsize * sectors); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d) { struct intel_raid_conf *meta; int error; /* Fill anchor and single disk. 
*/
	meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO);
	/* sizeof() - 1 drops the string terminator: fields are fixed-width. */
	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1);
	memcpy(&meta->version[0], INTEL_VERSION_1000,
	    sizeof(INTEL_VERSION_1000) - 1);
	meta->config_size = INTEL_MAX_MD_SIZE(1);
	meta->config_id = arc4random();
	meta->generation = 1;
	meta->total_disks = 1;
	meta->disk[0] = *d;

	error = intel_meta_write(cp, meta);
	free(meta, M_MD_INTEL);
	return (error);
}

/*
 * Find the disk of this node whose per-disk metadata position equals id.
 * Returns the loop variable as left by TAILQ_FOREACH when no disk matches
 * (callers in this file compare the result against NULL).
 */
static struct g_raid_disk *
g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id)
{
	struct g_raid_disk	*disk;
	struct g_raid_md_intel_perdisk *pd;

	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
		if (pd->pd_disk_pos == id)
			break;
	}
	return (disk);
}

/*
 * Tell whether a volume of the given RAID level, qualifier and number of
 * disks is accepted.  Returns 1 if so and 0 otherwise.
 *
 * Each level has a hard minimum disk count that always applies and a
 * narrower set of disk counts that is enforced only when force is zero.
 * RAID5 requires the R5LA qualifier; every other level requires the NONE
 * qualifier.
 */
static int
g_raid_md_intel_supported(int level, int qual, int disks, int force)
{

	switch (level) {
	case G_RAID_VOLUME_RL_RAID0:
		if (disks < 1)
			return (0);
		if (!force && (disks < 2 || disks > 6))
			return (0);
		break;
	case G_RAID_VOLUME_RL_RAID1:
		if (disks < 1)
			return (0);
		if (!force && (disks != 2))
			return (0);
		break;
	case G_RAID_VOLUME_RL_RAID1E:
		if (disks < 2)
			return (0);
		if (!force && (disks != 4))
			return (0);
		break;
	case G_RAID_VOLUME_RL_RAID5:
		if (disks < 3)
			return (0);
		if (!force && disks > 6)
			return (0);
		if (qual != G_RAID_VOLUME_RLQ_R5LA)
			return (0);
		break;
	default:
		return (0);
	}
	if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
		return (0);
	return (1);
}

/*
 * Find the volume of this node whose v_md_data, read as an integer,
 * equals id.  Returns the loop variable as left by TAILQ_FOREACH when
 * no volume matches.
 */
static struct g_raid_volume *
g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id)
{
	struct g_raid_volume	*mvol;

	TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) {
		if ((intptr_t)(mvol->v_md_data) == id)
			break;
	}
	return (mvol);
}

static int
g_raid_md_intel_start_disk(struct g_raid_disk *disk)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *tmpsd;
	struct g_raid_disk *olddisk, *tmpdisk;
	struct g_raid_md_object *md;
	struct g_raid_md_intel_object *mdi;
	struct g_raid_md_intel_perdisk *pd, *oldpd;
	struct intel_raid_conf *meta;
	struct intel_raid_vol *mvol;
	struct
intel_raid_map *mmap0, *mmap1; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by it's serial. */ disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* Failed stale disk is useless for us. */ if (pd->pd_disk_meta.flags & INTEL_F_FAILED) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { off_t disk_sectors = intel_get_disk_sectors(&pd->pd_disk_meta); if (sd->sd_offset + sd->sd_size + 4096 > disk_sectors * 512) { G_RAID_DEBUG1(1, sc, "Disk too small (%llu < %llu)", (unsigned long long) disk_sectors * 512, (unsigned long long) sd->sd_offset + sd->sd_size + 4096); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (pd->pd_disk_meta.flags & INTEL_F_SPARE) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. 
*/ olddisk = g_raid_md_intel_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More then one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); /* Update global metadata just in case. */ memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta, sizeof(struct intel_raid_disk)); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else if (meta->disk[disk_pos].flags & INTEL_F_SPARE) g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { mvol = intel_get_volume(meta, (uintptr_t)(sd->sd_volume->v_md_data)); mmap0 = intel_get_map(mvol, 0); if (mvol->migr_state) mmap1 = intel_get_map(mvol, 1); else mmap1 = mmap0; if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) { /* Failed disk, almost useless. 
*/ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (mvol->migr_state == 0) { if (mmap0->status == INTEL_S_UNINITIALIZED) { /* Freshly created uninitialized volume. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); } else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mvol->dirty) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_INIT || mvol->migr_type == INTEL_MT_REBUILD) { if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (mvol->dirty) { sd->sd_rebuild_pos = 0; } else { sd->sd_rebuild_pos = intel_get_vol_curr_migr_unit(mvol) * sd->sd_volume->v_strip_size * mmap0->total_domains; } } else if (mvol->dirty) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_VERIFY || mvol->migr_type == INTEL_MT_REPAIR) { if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Resyncing disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); if (mvol->dirty) { sd->sd_rebuild_pos = 0; } else { sd->sd_rebuild_pos = intel_get_vol_curr_migr_unit(mvol) * sd->sd_volume->v_strip_size * mmap0->total_domains; } } else if (mvol->dirty) { /* Dirty volume (unclean shutdown). 
*/ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < meta->total_disks); } return (resurrection); } static void g_disk_md_intel_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_INTEL); } static void g_raid_md_intel_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *meta; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; update = 0; do { /* Make sure we miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == meta->total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, meta->total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) { g_raid_md_write_intel(md, NULL, NULL, NULL); meta = mdi->mdio_meta; } /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < meta->total_disks); /* Request retaste hoping to find spare. 
*/ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_INTEL, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_intel_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_intel_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; int i, j, disk_pos; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); vol = g_raid_create_volume(sc, mvol->name, -1); vol->v_md_data = (void *)(intptr_t)i; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (mmap->type == INTEL_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (mmap->type == INTEL_T_RAID1 && mmap->total_domains >= 2 && mmap->total_domains <= mmap->total_disks) { /* Assume total_domains is correct. */ if (mmap->total_domains == mmap->total_disks) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (mmap->type == INTEL_T_RAID1) { /* total_domains looks wrong. 
*/ if (mmap->total_disks <= 2) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (mmap->type == INTEL_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; } else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ vol->v_disks_count = mmap->total_disks; vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = intel_get_map_offset(mmap) * 512; //ZZZ sd->sd_size = intel_get_map_disk_sectors(mmap) * 512; //ZZZ } g_raid_start_volume(vol); } /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; pd->pd_disk_meta = meta->disk[disk_pos]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); for (j = 0; j < mmap->total_disks; j++) { if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos) break; } if (j == mmap->total_disks) continue; vol = g_raid_md_intel_get_volume(sc, i); sd = &vol->v_subdisks[j]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_intel_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. 
*/ g_raid_md_intel_refill(sc); TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_intel_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *pdmeta; struct g_raid_md_intel_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_intel_start_disk(disk)) g_raid_md_write_intel(md, NULL, NULL, NULL); } else { /* If we haven't started yet - check metadata freshness. */ if (mdi->mdio_meta == NULL || ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = intel_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_meta->total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } /* If we collected all needed disks - start array. 
*/ if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks) g_raid_md_intel_start(sc); } } static void g_raid_intel_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; char name[16]; mdi = (struct g_raid_md_intel_object *)md; mdi->mdio_config_id = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } /* * Return the last N characters of the serial label. The Linux and * ataraid(7) code always uses the last 16 characters of the label to * store into the Intel meta format. Generalize this to N characters * since that's easy. Labels can be up to 20 characters for SATA drives * and up 251 characters for SAS drives. Since intel controllers don't * support SAS drives, just stick with the SATA limits for stack friendliness. 
*/
static int
g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen)
{
	char serial_buffer[24];
	int len, error;

	len = sizeof(serial_buffer);
	error = g_io_getattr("GEOM::ident", cp, &len, serial_buffer);
	if (error != 0)
		return (error);
	/*
	 * NOTE(review): strlen() relies on the provider having NUL-terminated
	 * the ident string within serial_buffer -- confirm.
	 */
	len = strlen(serial_buffer);
	/* Turn the length into the offset of the last serlen characters. */
	if (len > serlen)
		len -= serlen;
	else
		len = 0;
	/*
	 * strncpy() pads a short label with NULs but leaves the result
	 * unterminated when the label has serlen or more characters, so
	 * callers must treat serial as a fixed-width field.
	 */
	strncpy(serial, serial_buffer + len, serlen);
	return (0);
}

/*
 * Taste a provider for Intel metadata.
 *
 * The consumer is opened for reading, the disk serial and (for rank 1
 * geoms) the HBA vendor id are queried and the on-disk metadata is read.
 * When no metadata is found, the disk is still accepted as a spare
 * (spare = 2) if g_raid_aggressive_spare is set and the HBA vendor is
 * 0x8086; otherwise G_RAID_MD_TASTE_FAIL is returned.
 */
static int
g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp,
    struct g_consumer *cp, struct g_geom **gp)
{
	struct g_consumer *rcp;
	struct g_provider *pp;
	struct g_raid_md_intel_object *mdi, *mdi1;
	struct g_raid_softc *sc;
	struct g_raid_disk *disk;
	struct intel_raid_conf *meta;
	struct g_raid_md_intel_perdisk *pd;
	struct g_geom *geom;
	int error, disk_pos, result, spare, len;
	char serial[INTEL_SERIAL_LEN];
	char name[16];
	uint16_t vendor;

	G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name);
	mdi = (struct g_raid_md_intel_object *)md;
	pp = cp->provider;

	/* Read metadata from device. */
	meta = NULL;
	/* 0xffff stays in place when the vendor attribute is not queried. */
	vendor = 0xffff;
	disk_pos = 0;
	if (g_access(cp, 1, 0, 0) != 0)
		return (G_RAID_MD_TASTE_FAIL);
	/* The attribute and metadata reads run with the topology lock dropped. */
	g_topology_unlock();
	error = g_raid_md_get_label(cp, serial, sizeof(serial));
	if (error != 0) {
		G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).",
		    pp->name, error);
		goto fail2;
	}
	/* The vendor id is a 16-bit value, hence a 2 byte attribute. */
	len = 2;
	if (pp->geom->rank == 1)
		g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
	meta = intel_meta_read(cp);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (meta == NULL) {
		if (g_raid_aggressive_spare) {
			if (vendor != 0x8086) {
				G_RAID_DEBUG(1,
				    "Intel vendor mismatch 0x%04x != 0x8086",
				    vendor);
			} else {
				G_RAID_DEBUG(1,
				    "No Intel metadata, forcing spare.");
				/* 2 marks a spare that has no metadata. */
				spare = 2;
				goto search;
			}
		}
		return (G_RAID_MD_TASTE_FAIL);
	}

	/* Check this disk position in obtained metadata.
*/ disk_pos = intel_meta_find_disk(meta, serial); if (disk_pos < 0) { G_RAID_DEBUG(1, "Intel serial '%s' not found", serial); goto fail1; } if (intel_get_disk_sectors(&meta->disk[disk_pos]) != (pp->mediasize / pp->sectorsize)) { G_RAID_DEBUG(1, "Intel size mismatch %ju != %ju", intel_get_disk_sectors(&meta->disk[disk_pos]), (off_t)(pp->mediasize / pp->sectorsize)); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_intel_print(meta); G_RAID_DEBUG(1, "Intel disk position %d", disk_pos); spare = meta->disk[disk_pos].flags & INTEL_F_SPARE; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_intel_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_config_id == meta->config_id) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. 
*/ result = G_RAID_MD_TASTE_NEW; mdi->mdio_config_id = meta->config_id; snprintf(name, sizeof(name), "Intel-%08x", meta->config_id); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_intel_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-Intel"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } rcp = g_new_consumer(geom); g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_meta = meta; pd->pd_disk_pos = -1; if (spare == 2) { memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; } else { pd->pd_disk_meta = meta->disk[disk_pos]; } disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; - /* Read kernel dumping information. 
*/ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", - rcp->provider->name, error); + g_raid_get_disk_info(disk); g_raid_md_intel_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail2: g_topology_lock(); g_access(cp, -1, 0, 0); fail1: free(meta, M_MD_INTEL); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_intel(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_intel_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Check if anything left except placeholders. 
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_intel(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16], serial[INTEL_SERIAL_LEN]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t off, size, sectorsize, strip, disk_sectors; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_intel_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. 
*/ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) { strcpy(&pd->pd_disk_meta.serial[0], "NONE"); pd->pd_disk_meta.id = 0xffffffff; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; continue; } cp->private = disk; g_topology_unlock(); error = g_raid_md_get_label(cp, &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); error = -8; break; } - /* Read kernel dumping information. */ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Handle size argument. 
*/ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... */ mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. 
*/ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (level == G_RAID_VOLUME_RL_RAID5) g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); else g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_intel_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { if (*nargs != 3) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } /* Look for existing volumes. */ i = 0; vol1 = NULL; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { vol1 = vol; i++; } if (i > 1) { gctl_error(req, "Maximum two volumes supported."); return (-6); } if (vol1 == NULL) { gctl_error(req, "At least one volume must exist."); return (-7); } numdisks = vol1->v_disks_count; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_intel_supported(level, qual, numdisks, force ? 
*force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Collect info about present disks. */ size = 0x7fffffffffffffffllu; sectorsize = 512; for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; pd = (struct g_raid_md_intel_perdisk *) disk->d_md_data; disk_sectors = intel_get_disk_sectors(&pd->pd_disk_meta); if (disk_sectors * 512 < size) size = disk_sectors * 512; if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && disk->d_consumer->provider->sectorsize > sectorsize) { sectorsize = disk->d_consumer->provider->sectorsize; } } /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Decide insert before or after. */ sd = &vol1->v_subdisks[0]; if (sd->sd_offset > size - (sd->sd_offset + sd->sd_size)) { off = 0; size = sd->sd_offset; } else { off = sd->sd_offset + sd->sd_size; size = size - (sd->sd_offset + sd->sd_size); } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round offset up to strip. */ if (off % strip != 0) { size -= strip - off % strip; off += strip - off % strip; } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Round size down to strip or sector. 
*/ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)i; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = off; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (disk->d_state == G_RAID_DISK_S_ACTIVE) { if (level == G_RAID_VOLUME_RL_RAID5) g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); else g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } } /* Write metadata based on created entities. */ g_raid_md_write_intel(md, NULL, NULL, NULL); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. 
*/ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. 
*/ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_write_intel(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_intel(md, NULL, disk); continue; } pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ intel_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Check if anything left except placeholders. 
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); /* Read disk serial. */ error = g_raid_md_get_label(cp, &serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); g_raid_kill_consumer(sc, cp); error = -7; break; } pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = -1; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; - /* Read kernel dumping information. */ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); memcpy(&pd->pd_disk_meta.serial[0], &serial[0], INTEL_SERIAL_LEN); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; /* Welcome the "new" disk. 
*/ update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { intel_meta_write_spare(cp, &pd->pd_disk_meta); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_intel(md, NULL, NULL, NULL); return (error); } return (-100); } static int g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap0, *mmap1; off_t sectorsize = 512, pos; const char *version, *cv; int vi, sdi, numdisks, len, state, stale; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* Count number of disks. */ numdisks = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; numdisks++; if (disk->d_state == G_RAID_DISK_S_ACTIVE) { pd->pd_disk_meta.flags = INTEL_F_ONLINE | INTEL_F_ASSIGNED; } else if (disk->d_state == G_RAID_DISK_S_FAILED) { pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED; } else { pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; if (pd->pd_disk_meta.id != 0xffffffff) { pd->pd_disk_meta.id = 0xffffffff; len = strlen(pd->pd_disk_meta.serial); len = min(len, INTEL_SERIAL_LEN - 3); strcpy(pd->pd_disk_meta.serial + len, ":0"); } } } /* Fill anchor and disks. 
*/ meta = malloc(INTEL_MAX_MD_SIZE(numdisks), M_MD_INTEL, M_WAITOK | M_ZERO); memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1); meta->config_size = INTEL_MAX_MD_SIZE(numdisks); meta->config_id = mdi->mdio_config_id; meta->generation = mdi->mdio_generation; meta->attributes = INTEL_ATTR_CHECKSUM; meta->total_disks = numdisks; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta; } /* Fill volumes and maps. */ vi = 0; version = INTEL_VERSION_1000; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; mvol = intel_get_volume(meta, vi); /* New metadata may have different volumes order. */ vol->v_md_data = (void *)(intptr_t)vi; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_disk != NULL) break; } if (sdi >= vol->v_disks_count) panic("No any filled subdisk in volume"); if (vol->v_mediasize >= 0x20000000000llu) meta->attributes |= INTEL_ATTR_2TB; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->attributes |= INTEL_ATTR_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->attributes |= INTEL_ATTR_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->attributes |= INTEL_ATTR_RAID5; else meta->attributes |= INTEL_ATTR_RAID10; if (meta->attributes & INTEL_ATTR_2TB) cv = INTEL_VERSION_1300; // else if (dev->status == DEV_CLONE_N_GO) // cv = INTEL_VERSION_1206; else if (vol->v_disks_count > 4) cv = INTEL_VERSION_1204; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) cv = INTEL_VERSION_1202; else if (vol->v_disks_count > 2) cv = INTEL_VERSION_1201; else if (vi > 0) cv = INTEL_VERSION_1200; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) cv = INTEL_VERSION_1100; else cv = INTEL_VERSION_1000; if (strcmp(cv, version) > 0) version = cv; strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name)); mvol->total_sectors = vol->v_mediasize / 
sectorsize; /* Check for any recovery in progress. */ state = G_RAID_SUBDISK_S_ACTIVE; pos = 0x7fffffffffffffffllu; stale = 0; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_REBUILD; else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC && state != G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_RESYNC; else if (sd->sd_state == G_RAID_SUBDISK_S_STALE) stale = 1; if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos < pos) pos = sd->sd_rebuild_pos; } if (state == G_RAID_SUBDISK_S_REBUILD) { mvol->migr_state = 1; mvol->migr_type = INTEL_MT_REBUILD; } else if (state == G_RAID_SUBDISK_S_RESYNC) { mvol->migr_state = 1; /* mvol->migr_type = INTEL_MT_REPAIR; */ mvol->migr_type = INTEL_MT_VERIFY; mvol->state |= INTEL_ST_VERIFY_AND_FIX; } else mvol->migr_state = 0; mvol->dirty = (vol->v_dirty || stale); mmap0 = intel_get_map(mvol, 0); /* Write map / common part of two maps. 
*/ intel_set_map_offset(mmap0, sd->sd_offset / sectorsize); intel_set_map_disk_sectors(mmap0, sd->sd_size / sectorsize); mmap0->strip_sectors = vol->v_strip_size / sectorsize; if (vol->v_state == G_RAID_VOLUME_S_BROKEN) mmap0->status = INTEL_S_FAILURE; else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED) mmap0->status = INTEL_S_DEGRADED; else if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) == g_raid_nsubdisks(vol, -1)) mmap0->status = INTEL_S_UNINITIALIZED; else mmap0->status = INTEL_S_READY; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) mmap0->type = INTEL_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) mmap0->type = INTEL_T_RAID1; else mmap0->type = INTEL_T_RAID5; mmap0->total_disks = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) mmap0->total_domains = vol->v_disks_count; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) mmap0->total_domains = 2; else mmap0->total_domains = 1; intel_set_map_stripe_count(mmap0, sd->sd_size / vol->v_strip_size / mmap0->total_domains); mmap0->failed_disk_num = 0xff; mmap0->ddf = 1; /* If there are two maps - copy common and update. */ if (mvol->migr_state) { intel_set_vol_curr_migr_unit(mvol, pos / vol->v_strip_size / mmap0->total_domains); mmap1 = intel_get_map(mvol, 1); memcpy(mmap1, mmap0, sizeof(struct intel_raid_map)); mmap0->status = INTEL_S_READY; } else mmap1 = NULL; /* Write disk indexes and put rebuild flags. 
*/ for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; pd = (struct g_raid_md_intel_perdisk *) sd->sd_disk->d_md_data; mmap0->disk_idx[sdi] = pd->pd_disk_pos; if (mvol->migr_state) mmap1->disk_idx[sdi] = pd->pd_disk_pos; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && sd->sd_state != G_RAID_SUBDISK_S_STALE && sd->sd_state != G_RAID_SUBDISK_S_UNINITIALIZED) { mmap0->disk_idx[sdi] |= INTEL_DI_RBLD; if (mvol->migr_state) mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } if ((sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) && mmap0->failed_disk_num == 0xff) { mmap0->failed_disk_num = sdi; if (mvol->migr_state) mmap1->failed_disk_num = sdi; } } vi++; } meta->total_volumes = vi; if (strcmp(version, INTEL_VERSION_1300) != 0) meta->attributes &= INTEL_ATTR_CHECKSUM; memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000) - 1); /* We are done. Print meta data and store them to disks. */ g_raid_md_intel_print(meta); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_INTEL); pd->pd_meta = NULL; } pd->pd_meta = intel_meta_copy(meta); intel_meta_write(disk->d_consumer, meta); } return (0); } static int g_raid_md_fail_disk_intel(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. 
*/
	if (pd->pd_disk_pos < 0)
		return (-1);

	/*
	 * Mark disk as failed in metadata and try to write that metadata
	 * to the disk itself to prevent its later resurrection as STALE.
	 */
	mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED;
	pd->pd_disk_meta.flags = INTEL_F_FAILED;
	g_raid_md_intel_print(mdi->mdio_meta);
	if (tdisk->d_consumer)
		intel_meta_write(tdisk->d_consumer, mdi->mdio_meta);

	/* Change states. */
	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
		g_raid_change_subdisk_state(sd,
		    G_RAID_SUBDISK_S_FAILED);
		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
		    G_RAID_EVENT_SUBDISK);
	}

	/* Write updated metadata to remaining disks. */
	g_raid_md_write_intel(md, NULL, NULL, tdisk);

	/* Check if anything left except placeholders. */
	if (g_raid_ndisks(sc, -1) ==
	    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
		g_raid_destroy_node(sc, 0);
	else
		g_raid_md_intel_refill(sc);
	return (0);
}

/*
 * Release the Intel per-disk data attached to a disk: the per-disk copy of
 * the metadata (if any) and the per-disk structure itself.  d_md_data is
 * cleared afterwards.  Always returns 0.
 */
static int
g_raid_md_free_disk_intel(struct g_raid_md_object *md,
    struct g_raid_disk *disk)
{
	struct g_raid_md_intel_perdisk *pd;

	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
	if (pd->pd_meta != NULL) {
		free(pd->pd_meta, M_MD_INTEL);
		pd->pd_meta = NULL;
	}
	free(pd, M_MD_INTEL);
	disk->d_md_data = NULL;
	return (0);
}

/*
 * Release the resources held by the Intel metadata object.
 *
 * If the array was never started, the start callout is stopped and the
 * root mount hold is released.  The cached metadata copy is freed in any
 * case.  Always returns 0.
 */
static int
g_raid_md_free_intel(struct g_raid_md_object *md)
{
	struct g_raid_md_intel_object *mdi;

	mdi = (struct g_raid_md_intel_object *)md;
	if (!mdi->mdio_started) {
		/* mdio_started is already zero here; this store is a no-op. */
		mdi->mdio_started = 0;
		callout_stop(&mdi->mdio_start_co);
		G_RAID_DEBUG1(1, md->mdo_softc,
		    "root_mount_rel %p", mdi->mdio_rootmount);
		/*
		 * NOTE(review): the callout and the root mount token are set
		 * up in g_raid_md_taste_intel() only -- confirm this path is
		 * safe for nodes made by g_raid_md_create_intel().
		 */
		root_mount_rel(mdi->mdio_rootmount);
		mdi->mdio_rootmount = NULL;
	}
	if (mdi->mdio_meta != NULL) {
		free(mdi->mdio_meta, M_MD_INTEL);
		mdi->mdio_meta = NULL;
	}
	return (0);
}

G_RAID_MD_DECLARE(intel, "Intel");
Index: stable/8/sys/geom/raid/md_jmicron.c
===================================================================
--- stable/8/sys/geom/raid/md_jmicron.c (revision 243678)
+++ stable/8/sys/geom/raid/md_jmicron.c (revision
243679) @@ -1,1585 +1,1562 @@ /*- * Copyright (c) 2010 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_JMICRON, "md_jmicron_data", "GEOM_RAID JMicron metadata"); #define JMICRON_MAX_DISKS 8 #define JMICRON_MAX_SPARE 2 struct jmicron_raid_conf { u_int8_t signature[2]; #define JMICRON_MAGIC "JM" u_int16_t version; #define JMICRON_VERSION 0x0001 u_int16_t checksum; u_int8_t filler_1[10]; u_int32_t disk_id; u_int32_t offset; u_int32_t disk_sectors_high; u_int16_t disk_sectors_low; u_int8_t filler_2[2]; u_int8_t name[16]; u_int8_t type; #define JMICRON_T_RAID0 0 #define JMICRON_T_RAID1 1 #define JMICRON_T_RAID01 2 #define JMICRON_T_CONCAT 3 #define JMICRON_T_RAID5 5 u_int8_t stripe_shift; u_int16_t flags; #define JMICRON_F_READY 0x0001 #define JMICRON_F_BOOTABLE 0x0002 #define JMICRON_F_BADSEC 0x0004 #define JMICRON_F_ACTIVE 0x0010 #define JMICRON_F_UNSYNC 0x0020 #define JMICRON_F_NEWEST 0x0040 u_int8_t filler_3[4]; u_int32_t spare[JMICRON_MAX_SPARE]; u_int32_t disks[JMICRON_MAX_DISKS]; #define JMICRON_DISK_MASK 0xFFFFFFF0 #define JMICRON_SEG_MASK 0x0000000F u_int8_t filler_4[32]; u_int8_t filler_5[384]; }; struct g_raid_md_jmicron_perdisk { struct jmicron_raid_conf *pd_meta; int pd_disk_pos; int pd_disk_id; off_t pd_disk_size; }; struct g_raid_md_jmicron_object { struct g_raid_md_object mdio_base; uint32_t mdio_config_id; struct jmicron_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
*/ }; static g_raid_md_create_t g_raid_md_create_jmicron; static g_raid_md_taste_t g_raid_md_taste_jmicron; static g_raid_md_event_t g_raid_md_event_jmicron; static g_raid_md_ctl_t g_raid_md_ctl_jmicron; static g_raid_md_write_t g_raid_md_write_jmicron; static g_raid_md_fail_disk_t g_raid_md_fail_disk_jmicron; static g_raid_md_free_disk_t g_raid_md_free_disk_jmicron; static g_raid_md_free_t g_raid_md_free_jmicron; static kobj_method_t g_raid_md_jmicron_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_jmicron), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_jmicron), KOBJMETHOD(g_raid_md_event, g_raid_md_event_jmicron), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_jmicron), KOBJMETHOD(g_raid_md_write, g_raid_md_write_jmicron), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_jmicron), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_jmicron), KOBJMETHOD(g_raid_md_free, g_raid_md_free_jmicron), { 0, 0 } }; static struct g_raid_md_class g_raid_md_jmicron_class = { "JMicron", g_raid_md_jmicron_methods, sizeof(struct g_raid_md_jmicron_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_jmicron_print(struct jmicron_raid_conf *meta) { int k; if (g_raid_debug < 1) return; printf("********* ATA JMicron RAID Metadata *********\n"); printf("signature <%c%c>\n", meta->signature[0], meta->signature[1]); printf("version %04x\n", meta->version); printf("checksum 0x%04x\n", meta->checksum); printf("disk_id 0x%08x\n", meta->disk_id); printf("offset 0x%08x\n", meta->offset); printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); printf("disk_sectors_low 0x%04x\n", meta->disk_sectors_low); printf("name <%.16s>\n", meta->name); printf("type %d\n", meta->type); printf("stripe_shift %d\n", meta->stripe_shift); printf("flags %04x\n", meta->flags); printf("spare "); for (k = 0; k < JMICRON_MAX_SPARE; k++) printf(" 0x%08x", meta->spare[k]); printf("\n"); printf("disks "); for (k = 0; k < JMICRON_MAX_DISKS; k++) printf(" 0x%08x", meta->disks[k]); 
printf("\n"); printf("=================================================\n"); } static struct jmicron_raid_conf * jmicron_meta_copy(struct jmicron_raid_conf *meta) { struct jmicron_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int jmicron_meta_total_disks(struct jmicron_raid_conf *meta) { int pos; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { if (meta->disks[pos] == 0) break; } return (pos); } static int jmicron_meta_total_spare(struct jmicron_raid_conf *meta) { int pos, n; n = 0; for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { if (meta->spare[pos] != 0) n++; } return (n); } /* * Generate fake Configuration ID based on disk IDs. * Note: it will change after each disk set change. */ static uint32_t jmicron_meta_config_id(struct jmicron_raid_conf *meta) { int pos; uint32_t config_id; config_id = 0; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) config_id += meta->disks[pos] << pos; return (config_id); } static void jmicron_meta_get_name(struct jmicron_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void jmicron_meta_put_name(struct jmicron_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static int jmicron_meta_find_disk(struct jmicron_raid_conf *meta, uint32_t id) { int pos; id &= JMICRON_DISK_MASK; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { if ((meta->disks[pos] & JMICRON_DISK_MASK) == id) return (pos); } for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { if ((meta->spare[pos] & JMICRON_DISK_MASK) == id) return (-3); } return (-1); } static struct jmicron_raid_conf * jmicron_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct jmicron_raid_conf *meta; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. 
*/ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct jmicron_raid_conf *)buf; /* Check if this is an JMicron RAID struct */ if (strncmp(meta->signature, JMICRON_MAGIC, strlen(JMICRON_MAGIC))) { G_RAID_DEBUG(1, "JMicron signature check failed on %s", pp->name); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "JMicron checksum check failed on %s", pp->name); free(meta, M_MD_JMICRON); return (NULL); } return (meta); } static int jmicron_meta_write(struct g_consumer *cp, struct jmicron_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. 
*/ buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_JMICRON); return (error); } static int jmicron_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_JMICRON); return (error); } static struct g_raid_disk * g_raid_md_jmicron_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_jmicron_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_jmicron_supported(int level, int qual, int disks, int force) { if (disks > 8) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); if (!force) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); if (!force) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_jmicron_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct 
g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd, *oldpd; struct jmicron_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by it's serial. */ if (pd->pd_meta != NULL) disk_pos = jmicron_meta_find_disk(meta, pd->pd_disk_id); else disk_pos = -1; if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (disk_pos == -3 || pd->pd_disk_pos == -3) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. 
*/ olddisk = g_raid_md_jmicron_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More then one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* Update global metadata just in case. */ meta->disks[disk_pos] = pd->pd_disk_id; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes/offsets, * especially in concat mode. Update. */ if (!resurrection) { sd->sd_offset = (off_t)pd->pd_meta->offset * 16 * 512; //ZZZ sd->sd_size = (((off_t)pd->pd_meta->disk_sectors_high << 16) + pd->pd_meta->disk_sectors_low) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if ((meta->flags & JMICRON_F_BADSEC) != 0 && (pd->pd_meta->flags & JMICRON_F_BADSEC) == 0) { /* Cold-inserted or rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta->flags & JMICRON_F_UNSYNC) { /* Dirty or resyncing disk.. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. 
*/ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_jmicron_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_JMICRON); } static void g_raid_md_jmicron_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; update = 0; do { /* Make sure we miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. 
*/ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_JMICRON, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_jmicron_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_jmicron_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct jmicron_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; off_t size; int j, disk_pos; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ jmicron_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); size = ((off_t)meta->disk_sectors_high << 16) + meta->disk_sectors_low; size *= 512; //ZZZ vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == JMICRON_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; vol->v_mediasize = size * mdi->mdio_total_disks; } else if (meta->type == JMICRON_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; vol->v_mediasize = size; } else if (meta->type == JMICRON_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; vol->v_mediasize = size * mdi->mdio_total_disks / 2; } else if (meta->type == JMICRON_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; vol->v_mediasize = 0; } else if (meta->type == JMICRON_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; vol->v_mediasize = size * (mdi->mdio_total_disks - 1); } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_mediasize = 0; } vol->v_strip_size = 1024 << meta->stripe_shift; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = (off_t)meta->offset * 16 * 512; //ZZZ sd->sd_size = size; } 
g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; pd->pd_disk_id = meta->disks[disk_pos]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_jmicron_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_jmicron_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_jmicron_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct jmicron_raid_conf *pdmeta; struct g_raid_md_jmicron_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_jmicron_start_disk(disk)) g_raid_md_write_jmicron(md, NULL, NULL, NULL); } else { /* * If we haven't started yet - update common metadata * to get subdisks details, avoiding data from spare disks. 
*/ if (mdi->mdio_meta == NULL || jmicron_meta_find_disk(mdi->mdio_meta, mdi->mdio_meta->disk_id) == -3) { if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = jmicron_meta_copy(pdmeta); mdi->mdio_total_disks = jmicron_meta_total_disks(pdmeta); } mdi->mdio_meta->flags |= pdmeta->flags & JMICRON_F_BADSEC; mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d+%d up)", mdi->mdio_disks_present, mdi->mdio_total_disks, jmicron_meta_total_spare(mdi->mdio_meta)); /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks + jmicron_meta_total_spare(mdi->mdio_meta)) g_raid_md_jmicron_start(sc); } } static void g_raid_jmicron_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_jmicron(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_jmicron_object *mdi; char name[16]; mdi = (struct g_raid_md_jmicron_object *)md; mdi->mdio_config_id = arc4random(); snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_jmicron(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_jmicron_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct jmicron_raid_conf *meta; struct g_raid_md_jmicron_perdisk *pd; struct g_geom *geom; - int error, disk_pos, result, spare, len; + int disk_pos, result, spare, len; char name[16]; uint16_t vendor; 
G_RAID_DEBUG(1, "Tasting JMicron on %s", cp->provider->name); mdi = (struct g_raid_md_jmicron_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; vendor = 0xffff; if (g_access(cp, 1, 0, 0) != 0) return (G_RAID_MD_TASTE_FAIL); g_topology_unlock(); len = 2; if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = jmicron_meta_read(cp); g_topology_lock(); g_access(cp, -1, 0, 0); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x197b) { G_RAID_DEBUG(1, "No JMicron metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "JMicron vendor mismatch 0x%04x != 0x197b", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = jmicron_meta_find_disk(meta, meta->disk_id); if (disk_pos == -1) { G_RAID_DEBUG(1, "JMicron disk_id %08x not found", meta->disk_id); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_jmicron_print(meta); G_RAID_DEBUG(1, "JMicron disk position %d", disk_pos); spare = (disk_pos == -2) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_jmicron_object *)sc->sc_md; if (spare == 2) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_config_id == jmicron_meta_config_id(meta)) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. 
*/ result = G_RAID_MD_TASTE_NEW; mdi->mdio_config_id = jmicron_meta_config_id(meta); snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_jmicron_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-JMicron"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } rcp = g_new_consumer(geom); g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; } else { pd->pd_disk_pos = -1; pd->pd_disk_id = meta->disk_id; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; - /* Read kernel dumping information. 
*/ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", - rcp->provider->name, error); + g_raid_get_disk_info(disk); g_raid_md_jmicron_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_JMICRON); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_jmicron(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_jmicron_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Check if anything left except placeholders. 
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_jmicron(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_jmicron_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. 
*/ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); - /* Read kernel dumping information. */ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. 
*/ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. 
*/ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_jmicron_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) jmicron_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_jmicron(md, 
NULL, disk); continue; } pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ jmicron_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); - /* Read kernel dumping information. 
*/ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_jmicron(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_jmicron(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct jmicron_raid_conf *meta; int i, spares; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. 
*/ meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK | M_ZERO); strncpy(meta->signature, JMICRON_MAGIC, 2); meta->version = JMICRON_VERSION; jmicron_meta_put_name(meta, vol->v_name); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->type = JMICRON_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->type = JMICRON_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = JMICRON_T_RAID01; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = JMICRON_T_CONCAT; else meta->type = JMICRON_T_RAID5; meta->stripe_shift = fls(vol->v_strip_size / 2048); meta->flags = JMICRON_F_READY | JMICRON_F_BOOTABLE; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == NULL || sd->sd_disk->d_md_data == NULL) meta->disks[i] = 0xffffffff; else { pd = (struct g_raid_md_jmicron_perdisk *) sd->sd_disk->d_md_data; meta->disks[i] = pd->pd_disk_id; } if (sd->sd_state < G_RAID_SUBDISK_S_STALE) meta->flags |= JMICRON_F_BADSEC; if (vol->v_dirty) meta->flags |= JMICRON_F_UNSYNC; } /* Put spares to their slots. */ spares = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_SPARE) continue; meta->spare[spares] = pd->pd_disk_id; if (++spares >= 2) break; } /* We are done. Print meta data and store them to disks. 
*/ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_JMICRON); pd->pd_meta = NULL; } pd->pd_meta = jmicron_meta_copy(meta); pd->pd_meta->disk_id = pd->pd_disk_id; if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { pd->pd_meta->offset = (sd->sd_offset / 512) / 16; pd->pd_meta->disk_sectors_high = (sd->sd_size / 512) >> 16; pd->pd_meta->disk_sectors_low = (sd->sd_size / 512) & 0xffff; if (sd->sd_state < G_RAID_SUBDISK_S_STALE) pd->pd_meta->flags &= ~JMICRON_F_BADSEC; else if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) pd->pd_meta->flags |= JMICRON_F_UNSYNC; } G_RAID_DEBUG(1, "Writing JMicron metadata to %s", g_raid_get_diskname(disk)); g_raid_md_jmicron_print(pd->pd_meta); jmicron_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_jmicron(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_jmicron_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_jmicron_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); if (tdisk->d_consumer) jmicron_meta_erase(tdisk->d_consumer); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_jmicron(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. 
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (0); } static int g_raid_md_free_disk_jmicron(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_jmicron_perdisk *pd; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_JMICRON); pd->pd_meta = NULL; } free(pd, M_MD_JMICRON); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_jmicron(struct g_raid_md_object *md) { struct g_raid_md_jmicron_object *mdi; mdi = (struct g_raid_md_jmicron_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(jmicron, "JMicron"); Index: stable/8/sys/geom/raid/md_nvidia.c =================================================================== --- stable/8/sys/geom/raid/md_nvidia.c (revision 243678) +++ stable/8/sys/geom/raid/md_nvidia.c (revision 243679) @@ -1,1604 +1,1581 @@ /*- * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_NVIDIA, "md_nvidia_data", "GEOM_RAID NVIDIA metadata"); struct nvidia_raid_conf { uint8_t nvidia_id[8]; #define NVIDIA_MAGIC "NVIDIA " uint32_t config_size; uint32_t checksum; uint16_t version; uint8_t disk_number; uint8_t dummy_0; uint32_t total_sectors; uint32_t sector_size; uint8_t name[16]; uint8_t revision[4]; uint32_t disk_status; uint32_t magic_0; #define NVIDIA_MAGIC0 0x00640044 uint64_t volume_id[2]; uint8_t state; #define NVIDIA_S_IDLE 0 #define NVIDIA_S_INIT 2 #define NVIDIA_S_REBUILD 3 #define NVIDIA_S_UPGRADE 4 #define NVIDIA_S_SYNC 5 uint8_t array_width; uint8_t total_disks; uint8_t orig_array_width; uint16_t type; #define NVIDIA_T_RAID0 0x0080 #define NVIDIA_T_RAID1 0x0081 #define NVIDIA_T_RAID3 0x0083 #define NVIDIA_T_RAID5 0x0085 /* RLQ = 00/02? 
*/ #define NVIDIA_T_RAID5_SYM 0x0095 /* RLQ = 03 */ #define NVIDIA_T_RAID10 0x008a #define NVIDIA_T_RAID01 0x8180 #define NVIDIA_T_CONCAT 0x00ff uint16_t dummy_3; uint32_t strip_sectors; uint32_t strip_bytes; uint32_t strip_shift; uint32_t strip_mask; uint32_t stripe_sectors; uint32_t stripe_bytes; uint32_t rebuild_lba; uint32_t orig_type; uint32_t orig_total_sectors; uint32_t status; #define NVIDIA_S_BOOTABLE 0x00000001 #define NVIDIA_S_DEGRADED 0x00000002 uint32_t filler[98]; } __packed; struct g_raid_md_nvidia_perdisk { struct nvidia_raid_conf *pd_meta; int pd_disk_pos; off_t pd_disk_size; }; struct g_raid_md_nvidia_object { struct g_raid_md_object mdio_base; uint64_t mdio_volume_id[2]; struct nvidia_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_nvidia; static g_raid_md_taste_t g_raid_md_taste_nvidia; static g_raid_md_event_t g_raid_md_event_nvidia; static g_raid_md_ctl_t g_raid_md_ctl_nvidia; static g_raid_md_write_t g_raid_md_write_nvidia; static g_raid_md_fail_disk_t g_raid_md_fail_disk_nvidia; static g_raid_md_free_disk_t g_raid_md_free_disk_nvidia; static g_raid_md_free_t g_raid_md_free_nvidia; static kobj_method_t g_raid_md_nvidia_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_nvidia), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_nvidia), KOBJMETHOD(g_raid_md_event, g_raid_md_event_nvidia), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_nvidia), KOBJMETHOD(g_raid_md_write, g_raid_md_write_nvidia), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_nvidia), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_nvidia), KOBJMETHOD(g_raid_md_free, g_raid_md_free_nvidia), { 0, 0 } }; static struct g_raid_md_class g_raid_md_nvidia_class = { "NVIDIA", g_raid_md_nvidia_methods, sizeof(struct g_raid_md_nvidia_object), .mdc_enable 
= 1, .mdc_priority = 100 }; static int NVIDIANodeID = 1; static void g_raid_md_nvidia_print(struct nvidia_raid_conf *meta) { if (g_raid_debug < 1) return; printf("********* ATA NVIDIA RAID Metadata *********\n"); printf("nvidia_id <%.8s>\n", meta->nvidia_id); printf("config_size %u\n", meta->config_size); printf("checksum 0x%08x\n", meta->checksum); printf("version 0x%04x\n", meta->version); printf("disk_number %d\n", meta->disk_number); printf("dummy_0 0x%02x\n", meta->dummy_0); printf("total_sectors %u\n", meta->total_sectors); printf("sector_size %u\n", meta->sector_size); printf("name <%.16s>\n", meta->name); printf("revision 0x%02x%02x%02x%02x\n", meta->revision[0], meta->revision[1], meta->revision[2], meta->revision[3]); printf("disk_status 0x%08x\n", meta->disk_status); printf("magic_0 0x%08x\n", meta->magic_0); printf("volume_id 0x%016jx%016jx\n", meta->volume_id[1], meta->volume_id[0]); printf("state 0x%02x\n", meta->state); printf("array_width %u\n", meta->array_width); printf("total_disks %u\n", meta->total_disks); printf("orig_array_width %u\n", meta->orig_array_width); printf("type 0x%04x\n", meta->type); printf("dummy_3 0x%04x\n", meta->dummy_3); printf("strip_sectors %u\n", meta->strip_sectors); printf("strip_bytes %u\n", meta->strip_bytes); printf("strip_shift %u\n", meta->strip_shift); printf("strip_mask 0x%08x\n", meta->strip_mask); printf("stripe_sectors %u\n", meta->stripe_sectors); printf("stripe_bytes %u\n", meta->stripe_bytes); printf("rebuild_lba %u\n", meta->rebuild_lba); printf("orig_type 0x%04x\n", meta->orig_type); printf("orig_total_sectors %u\n", meta->orig_total_sectors); printf("status 0x%08x\n", meta->status); printf("=================================================\n"); } static struct nvidia_raid_conf * nvidia_meta_copy(struct nvidia_raid_conf *meta) { struct nvidia_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int 
nvidia_meta_translate_disk(struct nvidia_raid_conf *meta, int md_disk_pos) { int disk_pos; if (md_disk_pos >= 0 && meta->type == NVIDIA_T_RAID01) { disk_pos = (md_disk_pos / meta->array_width) + (md_disk_pos % meta->array_width) * meta->array_width; } else disk_pos = md_disk_pos; return (disk_pos); } static void nvidia_meta_get_name(struct nvidia_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void nvidia_meta_put_name(struct nvidia_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static struct nvidia_raid_conf * nvidia_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct nvidia_raid_conf *meta; char *buf; int error, i; uint32_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - 2 * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check if this is an NVIDIA RAID struct */ if (strncmp(meta->nvidia_id, NVIDIA_MAGIC, strlen(NVIDIA_MAGIC))) { G_RAID_DEBUG(1, "NVIDIA signature check failed on %s", pp->name); free(meta, M_MD_NVIDIA); return (NULL); } if (meta->config_size > 128 || meta->config_size < 30) { G_RAID_DEBUG(1, "NVIDIA metadata size looks wrong: %d", meta->config_size); free(meta, M_MD_NVIDIA); return (NULL); } /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < meta->config_size; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "NVIDIA checksum check failed on %s", pp->name); free(meta, M_MD_NVIDIA); return (NULL); } /* Check volume state. 
*/ if (meta->state != NVIDIA_S_IDLE && meta->state != NVIDIA_S_INIT && meta->state != NVIDIA_S_REBUILD && meta->state != NVIDIA_S_SYNC) { G_RAID_DEBUG(1, "NVIDIA unknown state on %s (0x%02x)", pp->name, meta->state); free(meta, M_MD_NVIDIA); return (NULL); } /* Check raid type. */ if (meta->type != NVIDIA_T_RAID0 && meta->type != NVIDIA_T_RAID1 && meta->type != NVIDIA_T_RAID3 && meta->type != NVIDIA_T_RAID5 && meta->type != NVIDIA_T_RAID5_SYM && meta->type != NVIDIA_T_RAID01 && meta->type != NVIDIA_T_CONCAT) { G_RAID_DEBUG(1, "NVIDIA unknown RAID level on %s (0x%02x)", pp->name, meta->type); free(meta, M_MD_NVIDIA); return (NULL); } return (meta); } static int nvidia_meta_write(struct g_consumer *cp, struct nvidia_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint32_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < meta->config_size; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); /* Write metadata. 
*/ error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_NVIDIA); return (error); } static int nvidia_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_NVIDIA); return (error); } static struct g_raid_disk * g_raid_md_nvidia_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_nvidia_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_nvidia_supported(int level, int qual, int disks, int force) { switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks < 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA && qual != G_RAID_VOLUME_RLQ_R5LS) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_nvidia_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct 
g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd, *oldpd; struct nvidia_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by it's serial. */ if (pd->pd_meta != NULL) { disk_pos = pd->pd_meta->disk_number; if (disk_pos >= meta->total_disks || mdi->mdio_started) disk_pos = -3; } else disk_pos = -3; /* For RAID0+1 we need to translate order. */ disk_pos = nvidia_meta_translate_disk(meta, disk_pos); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 2 * 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. 
*/ olddisk = g_raid_md_nvidia_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More then one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else// if (pd->pd_meta->disk_status == NVIDIA_S_CURRENT || //pd->pd_meta->disk_status == NVIDIA_S_REBUILD) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); // else // g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes, * in concat mode. Update from real disk size. */ if (meta->type == NVIDIA_T_CONCAT) sd->sd_size = pd->pd_disk_size - 0x800 * 512; if (resurrection) { /* New or ex-spare disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->state == NVIDIA_S_REBUILD && (pd->pd_meta->disk_status & 0x100)) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / meta->array_width * pd->pd_meta->sector_size; } else if (meta->state == NVIDIA_S_SYNC) { /* Resyncing/dirty disk. 
*/ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / meta->array_width * pd->pd_meta->sector_size; } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_nvidia_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_NVIDIA); } static void g_raid_md_nvidia_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; update = 0; do { /* Make sure we miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. 
*/ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_NVIDIA, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_nvidia_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_nvidia_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct nvidia_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; off_t size; int j, disk_pos; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ nvidia_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); vol->v_mediasize = (off_t)meta->total_sectors * 512; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == NVIDIA_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; size = vol->v_mediasize / mdi->mdio_total_disks; } else if (meta->type == NVIDIA_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; size = vol->v_mediasize; } else if (meta->type == NVIDIA_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; size = vol->v_mediasize / (mdi->mdio_total_disks / 2); } else if (meta->type == NVIDIA_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; size = 0; } else if (meta->type == NVIDIA_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else if (meta->type == NVIDIA_T_RAID5_SYM) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LS; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; size = 0; } vol->v_strip_size = meta->strip_sectors * 512; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < 
vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = 0; sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_nvidia_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_nvidia_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_nvidia_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct nvidia_raid_conf *pdmeta; struct g_raid_md_nvidia_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_nvidia_start_disk(disk)) g_raid_md_write_nvidia(md, NULL, NULL, NULL); } else { if (mdi->mdio_meta == NULL || mdi->mdio_meta->disk_number >= mdi->mdio_meta->total_disks) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = nvidia_meta_copy(pdmeta); mdi->mdio_total_disks = pdmeta->total_disks; 
mdi->mdio_disks_present = 1; } else if (pdmeta->disk_number < mdi->mdio_meta->total_disks) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_total_disks); } else G_RAID_DEBUG1(1, sc, "Spare disk"); /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks) g_raid_md_nvidia_start(sc); } } static void g_raid_nvidia_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_nvidia(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_nvidia_object *mdi; char name[32]; mdi = (struct g_raid_md_nvidia_object *)md; arc4rand(&mdi->mdio_volume_id, 16, 0); snprintf(name, sizeof(name), "NVIDIA-%d", atomic_fetchadd_int(&NVIDIANodeID, 1)); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_nvidia(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_nvidia_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct nvidia_raid_conf *meta; struct g_raid_md_nvidia_perdisk *pd; struct g_geom *geom; - int error, result, spare, len; + int result, spare, len; char name[32]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting NVIDIA on %s", cp->provider->name); mdi = (struct g_raid_md_nvidia_object *)md; pp = cp->provider; /* Read metadata from device. 
*/ meta = NULL; vendor = 0xffff; if (g_access(cp, 1, 0, 0) != 0) return (G_RAID_MD_TASTE_FAIL); g_topology_unlock(); len = 2; if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = nvidia_meta_read(cp); g_topology_lock(); g_access(cp, -1, 0, 0); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x10de) { G_RAID_DEBUG(1, "No NVIDIA metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "NVIDIA vendor mismatch 0x%04x != 0x10de", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Metadata valid. Print it. */ g_raid_md_nvidia_print(meta); G_RAID_DEBUG(1, "NVIDIA disk position %d", meta->disk_number); spare = 0;//(meta->type == NVIDIA_T_SPARE) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_nvidia_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (memcmp(&mdi1->mdio_volume_id, &meta->volume_id, 16) == 0) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. 
*/ result = G_RAID_MD_TASTE_NEW; memcpy(&mdi->mdio_volume_id, &meta->volume_id, 16); snprintf(name, sizeof(name), "NVIDIA-%d", atomic_fetchadd_int(&NVIDIANodeID, 1)); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_nvidia_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-NVIDIA"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } rcp = g_new_consumer(geom); g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; } else { pd->pd_disk_pos = -1; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; - /* Read kernel dumping information. */ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", - rcp->provider->name, error); + g_raid_get_disk_info(disk); g_raid_md_nvidia_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_NVIDIA); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_nvidia(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) { /* Bump volume ID to drop missing disks. 
*/ arc4rand(&mdi->mdio_volume_id, 16, 0); g_raid_md_nvidia_start(sc); } return (0); } return (-1); } pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } if (mdi->mdio_started) { /* Bump volume ID to prevent disk resurrection. */ if (pd->pd_disk_pos >= 0) arc4rand(&mdi->mdio_volume_id, 16, 0); /* Write updated metadata to all disks. */ g_raid_md_write_nvidia(md, NULL, NULL, NULL); } /* Check if anything left except placeholders. 
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_nvidia(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip, volsize; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LS"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_nvidia_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. 
*/ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); - /* Read kernel dumping information. */ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= 2 * sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. 
*/ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) volsize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) volsize = size; else if (level == G_RAID_VOLUME_RL_RAID5) volsize = size * (numdisks - 1); else { /* RAID1E */ volsize = ((size * numdisks) / strip / 2) * strip; } if (volsize > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; vol->v_mediasize = volsize; vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. 
*/ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_nvidia_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) nvidia_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_nvidia(md, NULL, 
disk); continue; } pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ nvidia_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); - /* Read kernel dumping information. 
*/ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state != G_RAID_DISK_S_SPARE && disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_nvidia(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_nvidia(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct nvidia_raid_conf *meta; int i, spares; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. 
*/ meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK | M_ZERO); if (mdi->mdio_meta) memcpy(meta, mdi->mdio_meta, sizeof(*meta)); memcpy(meta->nvidia_id, NVIDIA_MAGIC, sizeof(NVIDIA_MAGIC) - 1); meta->config_size = 30; meta->version = 0x0064; meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->sector_size = vol->v_sectorsize; nvidia_meta_put_name(meta, vol->v_name); meta->magic_0 = NVIDIA_MAGIC0; memcpy(&meta->volume_id, &mdi->mdio_volume_id, 16); meta->state = NVIDIA_S_IDLE; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->array_width = 1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->array_width = vol->v_disks_count / 2; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->array_width = vol->v_disks_count - 1; else meta->array_width = vol->v_disks_count; meta->total_disks = vol->v_disks_count; meta->orig_array_width = meta->array_width; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->type = NVIDIA_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->type = NVIDIA_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = NVIDIA_T_RAID01; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = NVIDIA_T_CONCAT; else if (vol->v_raid_level_qualifier == G_RAID_VOLUME_RLQ_R5LA) meta->type = NVIDIA_T_RAID5; else meta->type = NVIDIA_T_RAID5_SYM; meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; meta->strip_bytes = vol->v_strip_size; meta->strip_shift = ffs(meta->strip_sectors) - 1; meta->strip_mask = meta->strip_sectors - 1; meta->stripe_sectors = meta->strip_sectors * meta->orig_array_width; meta->stripe_bytes = meta->stripe_sectors * vol->v_sectorsize; meta->rebuild_lba = 0; meta->orig_type = meta->type; meta->orig_total_sectors = meta->total_sectors; meta->status = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if ((sd->sd_state == G_RAID_SUBDISK_S_STALE || sd->sd_state == 
G_RAID_SUBDISK_S_RESYNC || vol->v_dirty) && meta->state != NVIDIA_S_REBUILD) meta->state = NVIDIA_S_SYNC; else if (sd->sd_state == G_RAID_SUBDISK_S_NEW || sd->sd_state == G_RAID_SUBDISK_S_REBUILD) meta->state = NVIDIA_S_REBUILD; } /* We are done. Print meta data and store them to disks. */ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = meta; spares = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_NVIDIA); pd->pd_meta = NULL; } pd->pd_meta = nvidia_meta_copy(meta); if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { /* For RAID0+1 we need to translate order. */ pd->pd_meta->disk_number = nvidia_meta_translate_disk(meta, sd->sd_pos); if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { pd->pd_meta->disk_status = 0x100; pd->pd_meta->rebuild_lba = sd->sd_rebuild_pos / vol->v_sectorsize * meta->array_width; } } else pd->pd_meta->disk_number = meta->total_disks + spares++; G_RAID_DEBUG(1, "Writing NVIDIA metadata to %s", g_raid_get_diskname(disk)); g_raid_md_nvidia_print(pd->pd_meta); nvidia_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_nvidia(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_nvidia_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_nvidia_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* Erase metadata to prevent disks's later resurrection. */ if (tdisk->d_consumer) nvidia_meta_erase(tdisk->d_consumer); /* Change states. 
*/ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_nvidia(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (0); } static int g_raid_md_free_disk_nvidia(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_nvidia_perdisk *pd; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_NVIDIA); pd->pd_meta = NULL; } free(pd, M_MD_NVIDIA); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_nvidia(struct g_raid_md_object *md) { struct g_raid_md_nvidia_object *mdi; mdi = (struct g_raid_md_nvidia_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(nvidia, "NVIDIA"); Index: stable/8/sys/geom/raid/md_promise.c =================================================================== --- stable/8/sys/geom/raid/md_promise.c (revision 243678) +++ stable/8/sys/geom/raid/md_promise.c (revision 243679) @@ -1,1984 +1,1961 @@ /*- * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata"); #define PROMISE_MAX_DISKS 8 #define PROMISE_MAX_SUBDISKS 2 #define PROMISE_META_OFFSET 14 struct promise_raid_disk { uint8_t flags; /* Subdisk status. */ #define PROMISE_F_VALID 0x01 #define PROMISE_F_ONLINE 0x02 #define PROMISE_F_ASSIGNED 0x04 #define PROMISE_F_SPARE 0x08 #define PROMISE_F_DUPLICATE 0x10 #define PROMISE_F_REDIR 0x20 #define PROMISE_F_DOWN 0x40 #define PROMISE_F_READY 0x80 uint8_t number; /* Position in a volume. */ uint8_t channel; /* ATA channel number. */ uint8_t device; /* ATA device number. */ uint64_t id __packed; /* Subdisk ID. 
*/ } __packed; struct promise_raid_conf { char promise_id[24]; #define PROMISE_MAGIC "Promise Technology, Inc." #define FREEBSD_MAGIC "FreeBSD ATA driver RAID " uint32_t dummy_0; uint64_t magic_0; #define PROMISE_MAGIC0(x) (((uint64_t)(x.channel) << 48) | \ ((uint64_t)(x.device != 0) << 56)) uint16_t magic_1; uint32_t magic_2; uint8_t filler1[470]; uint32_t integrity; #define PROMISE_I_VALID 0x00000080 struct promise_raid_disk disk; /* This subdisk info. */ uint32_t disk_offset; /* Subdisk offset. */ uint32_t disk_sectors; /* Subdisk size */ uint32_t rebuild_lba; /* Rebuild position. */ uint16_t generation; /* Generation number. */ uint8_t status; /* Volume status. */ #define PROMISE_S_VALID 0x01 #define PROMISE_S_ONLINE 0x02 #define PROMISE_S_INITED 0x04 #define PROMISE_S_READY 0x08 #define PROMISE_S_DEGRADED 0x10 #define PROMISE_S_MARKED 0x20 #define PROMISE_S_MIGRATING 0x40 #define PROMISE_S_FUNCTIONAL 0x80 uint8_t type; /* Voluem type. */ #define PROMISE_T_RAID0 0x00 #define PROMISE_T_RAID1 0x01 #define PROMISE_T_RAID3 0x02 #define PROMISE_T_RAID5 0x04 #define PROMISE_T_SPAN 0x08 #define PROMISE_T_JBOD 0x10 uint8_t total_disks; /* Disks in this volume. */ uint8_t stripe_shift; /* Strip size. */ uint8_t array_width; /* Number of RAID0 stripes. */ uint8_t array_number; /* Global volume number. */ uint32_t total_sectors; /* Volume size. */ uint16_t cylinders; /* Volume geometry: C. */ uint8_t heads; /* Volume geometry: H. */ uint8_t sectors; /* Volume geometry: S. */ uint64_t volume_id __packed; /* Volume ID, */ struct promise_raid_disk disks[PROMISE_MAX_DISKS]; /* Subdisks in this volume. */ char name[32]; /* Volume label. */ uint32_t filler2[8]; uint32_t magic_3; /* Something related to rebuild. */ uint64_t rebuild_lba64; /* Per-volume rebuild position. 
*/ uint32_t magic_4; uint32_t magic_5; uint32_t total_sectors_high; uint32_t filler3[324]; uint32_t checksum; } __packed; struct g_raid_md_promise_perdisk { int pd_updated; int pd_subdisks; struct promise_raid_conf *pd_meta[PROMISE_MAX_SUBDISKS]; }; struct g_raid_md_promise_pervolume { struct promise_raid_conf *pv_meta; uint64_t pv_id; uint16_t pv_generation; int pv_disks_present; int pv_started; struct callout pv_start_co; /* STARTING state timer. */ }; static g_raid_md_create_t g_raid_md_create_promise; static g_raid_md_taste_t g_raid_md_taste_promise; static g_raid_md_event_t g_raid_md_event_promise; static g_raid_md_volume_event_t g_raid_md_volume_event_promise; static g_raid_md_ctl_t g_raid_md_ctl_promise; static g_raid_md_write_t g_raid_md_write_promise; static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise; static g_raid_md_free_disk_t g_raid_md_free_disk_promise; static g_raid_md_free_volume_t g_raid_md_free_volume_promise; static g_raid_md_free_t g_raid_md_free_promise; static kobj_method_t g_raid_md_promise_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_promise), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_promise), KOBJMETHOD(g_raid_md_event, g_raid_md_event_promise), KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_promise), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_promise), KOBJMETHOD(g_raid_md_write, g_raid_md_write_promise), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_promise), KOBJMETHOD(g_raid_md_free, g_raid_md_free_promise), { 0, 0 } }; static struct g_raid_md_class g_raid_md_promise_class = { "Promise", g_raid_md_promise_methods, sizeof(struct g_raid_md_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_promise_print(struct promise_raid_conf *meta) { int i; if (g_raid_debug < 1) return; printf("********* ATA Promise Metadata *********\n"); printf("promise_id 
<%.24s>\n", meta->promise_id); printf("disk %02x %02x %02x %02x %016jx\n", meta->disk.flags, meta->disk.number, meta->disk.channel, meta->disk.device, meta->disk.id); printf("disk_offset %u\n", meta->disk_offset); printf("disk_sectors %u\n", meta->disk_sectors); printf("rebuild_lba %u\n", meta->rebuild_lba); printf("generation %u\n", meta->generation); printf("status 0x%02x\n", meta->status); printf("type %u\n", meta->type); printf("total_disks %u\n", meta->total_disks); printf("stripe_shift %u\n", meta->stripe_shift); printf("array_width %u\n", meta->array_width); printf("array_number %u\n", meta->array_number); printf("total_sectors %u\n", meta->total_sectors); printf("cylinders %u\n", meta->cylinders); printf("heads %u\n", meta->heads); printf("sectors %u\n", meta->sectors); printf("volume_id 0x%016jx\n", meta->volume_id); printf("disks:\n"); for (i = 0; i < PROMISE_MAX_DISKS; i++ ) { printf(" %02x %02x %02x %02x %016jx\n", meta->disks[i].flags, meta->disks[i].number, meta->disks[i].channel, meta->disks[i].device, meta->disks[i].id); } printf("name <%.32s>\n", meta->name); printf("magic_3 0x%08x\n", meta->magic_3); printf("rebuild_lba64 %ju\n", meta->rebuild_lba64); printf("magic_4 0x%08x\n", meta->magic_4); printf("magic_5 0x%08x\n", meta->magic_5); printf("total_sectors_high 0x%08x\n", meta->total_sectors_high); printf("=================================================\n"); } static struct promise_raid_conf * promise_meta_copy(struct promise_raid_conf *meta) { struct promise_raid_conf *nmeta; nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK); memcpy(nmeta, meta, sizeof(*nmeta)); return (nmeta); } static int promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (meta->disks[pos].id == id) return (pos); } return (-1); } static int promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd, uint32_t sectors, uint32_t *off, uint32_t *size) { uint32_t coff, csize; int i, j; 
sectors -= 131072; *off = 0; *size = 0; coff = 0; csize = sectors; i = 0; while (1) { for (j = 0; j < nsd; j++) { if (metaarr[j]->disk_offset >= coff) { csize = MIN(csize, metaarr[j]->disk_offset - coff); } } if (csize > *size) { *off = coff; *size = csize; } if (i >= nsd) break; coff = metaarr[i]->disk_offset + metaarr[i]->disk_sectors; csize = sectors - coff; i++; }; return ((*size > 0) ? 1 : 0); } static int promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos) { int disk_pos, width; if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { width = vol->v_disks_count / 2; disk_pos = (md_disk_pos / width) + (md_disk_pos % width) * width; } else disk_pos = md_disk_pos; return (disk_pos); } static void promise_meta_get_name(struct promise_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 32); buf[32] = 0; for (i = 31; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void promise_meta_put_name(struct promise_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 32); memcpy(meta->name, buf, MIN(strlen(buf), 32)); } static int promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; int error, i, subdisks; uint32_t checksum, *ptr; pp = cp->provider; subdisks = 0; next: /* Read metadata block. 
*/ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisks * PROMISE_META_OFFSET), pp->sectorsize * 4, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (subdisks); } meta = (struct promise_raid_conf *)buf; /* Check if this is an Promise RAID struct */ if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) && strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) { if (subdisks == 0) G_RAID_DEBUG(1, "Promise signature check failed on %s", pp->name); g_free(buf); return (subdisks); } meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK); memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if ((meta->integrity & PROMISE_I_VALID) == 0) { G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if (meta->total_disks > PROMISE_MAX_DISKS) { G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)", pp->name, meta->total_disks); free(meta, M_MD_PROMISE); return (subdisks); } /* Save this part and look for next. 
*/ *metaarr = meta; metaarr++; subdisks++; if (subdisks < PROMISE_MAX_SUBDISKS) goto next; return (subdisks); } static int promise_meta_write(struct g_consumer *cp, struct promise_raid_conf **metaarr, int nsd) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; int error, i, subdisk, fake; uint32_t checksum, *ptr, off, size; pp = cp->provider; subdisk = 0; fake = 0; next: buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO); meta = NULL; if (subdisk < nsd) { meta = metaarr[subdisk]; } else if (!fake && promise_meta_unused_range(metaarr, nsd, cp->provider->mediasize / cp->provider->sectorsize, &off, &size)) { /* Optionally add record for unused space. */ meta = (struct promise_raid_conf *)buf; memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); meta->disk_offset = off; meta->disk_sectors = size; meta->rebuild_lba = UINT32_MAX; fake = 1; } if (meta != NULL) { /* Recalculate checksum for case if metadata were changed. 
*/ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; meta->checksum = checksum; memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta))); } error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, pp->sectorsize * 4); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_PROMISE); subdisk++; if (subdisk < PROMISE_MAX_SUBDISKS) goto next; return (error); } static int promise_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, subdisk; pp = cp->provider; buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO); for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) { error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, 4 * pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_PROMISE); return (error); } static int promise_meta_write_spare(struct g_consumer *cp) { struct promise_raid_conf *meta; int error; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); meta->disk_sectors = cp->provider->mediasize / cp->provider->sectorsize; meta->disk_sectors -= 131072; meta->rebuild_lba = UINT32_MAX; error = promise_meta_write(cp, &meta, 1); free(meta, M_MD_PROMISE); return (error); } static struct g_raid_volume * g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id) { struct g_raid_volume *vol; struct g_raid_md_promise_pervolume *pv; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (pv->pv_id == id) break; } return (vol); } static 
int g_raid_md_promise_purge_volumes(struct g_raid_softc *sc) { struct g_raid_volume *vol, *tvol; struct g_raid_md_promise_pervolume *pv; int i, res; res = 0; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) break; } if (i >= vol->v_disks_count) { g_raid_destroy_volume(vol); res = 1; } } return (res); } static int g_raid_md_promise_purge_disks(struct g_raid_softc *sc) { struct g_raid_disk *disk, *tdisk; struct g_raid_volume *vol; struct g_raid_md_promise_perdisk *pd; int i, j, res; res = 0; TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_state == G_RAID_DISK_S_SPARE) continue; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; /* Scan for deleted volumes. */ for (i = 0; i < pd->pd_subdisks; ) { vol = g_raid_md_promise_get_volume(sc, pd->pd_meta[i]->volume_id); if (vol != NULL && !vol->v_stopping) { i++; continue; } free(pd->pd_meta[i], M_MD_PROMISE); for (j = i; j < pd->pd_subdisks - 1; j++) pd->pd_meta[j] = pd->pd_meta[j + 1]; pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL; pd->pd_subdisks--; pd->pd_updated = 1; } /* If there is no metadata left - erase and delete disk. 
*/ if (pd->pd_subdisks == 0) { promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); res = 1; } } return (res); } static int g_raid_md_promise_supported(int level, int qual, int disks, int force) { if (disks > PROMISE_MAX_DISKS) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn, struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t size; int disk_pos, md_disk_pos, i, resurrection = 0; uint32_t eoff, esize; sc = disk->d_softc; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; pv = vol->v_md_data; meta = pv->pv_meta; if (sdn >= 0) { /* Find disk position in metadata by it's serial. */ md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id); /* For RAID0+1 we need to translate order. */ disk_pos = promise_meta_translate_disk(vol, md_disk_pos); } else { md_disk_pos = -1; disk_pos = -1; } if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s", g_raid_get_diskname(disk), vol->v_name); /* Failed stale disk is useless for us. 
*/ if (sdn >= 0 && pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we were given specific metadata subdisk - erase it. */ if (sdn >= 0) { free(pd->pd_meta[sdn], M_MD_PROMISE); for (i = sdn; i < pd->pd_subdisks - 1; i++) pd->pd_meta[i] = pd->pd_meta[i + 1]; pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL; pd->pd_subdisks--; } /* If we are in the start process, that's all for now. */ if (!pv->pv_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, disk->d_consumer->provider->mediasize / disk->d_consumer->provider->sectorsize, &eoff, &esize); if (esize == 0) { G_RAID_DEBUG1(1, sc, "No free space on disk %s", g_raid_get_diskname(disk)); goto nofit; } size = INT64_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_NONE) size = sd->sd_size; if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && (disk_pos < 0 || vol->v_subdisks[i].sd_state < sd->sd_state)) disk_pos = i; } if (disk_pos >= 0 && vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && (off_t)esize * 512 < size) { G_RAID_DEBUG1(1, sc, "Disk %s free space " "is too small (%ju < %ju)", g_raid_get_diskname(disk), (off_t)esize * 512, size); disk_pos = -1; } if (disk_pos >= 0) { if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) esize = size / 512; /* For RAID0+1 we need to translate order. 
*/ md_disk_pos = promise_meta_translate_disk(vol, disk_pos); } else { nofit: if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); } return (0); } G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", g_raid_get_diskname(disk), disk_pos, vol->v_name); resurrection = 1; } sd = &vol->v_subdisks[disk_pos]; if (resurrection && sd->sd_disk != NULL) { g_raid_change_disk_state(sd->sd_disk, G_RAID_DISK_S_STALE_FAILED); TAILQ_REMOVE(&sd->sd_disk->d_subdisks, sd, sd_next); } vol->v_subdisks[disk_pos].sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (resurrection) { sd->sd_offset = (off_t)eoff * 512; sd->sd_size = (off_t)esize * 512; } else { sd->sd_offset = (off_t)pd->pd_meta[sdn]->disk_offset * 512; sd->sd_size = (off_t)pd->pd_meta[sdn]->disk_sectors * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) { /* Failed disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta[sdn]->generation != meta->generation) sd->sd_rebuild_pos = 0; else { sd->sd_rebuild_pos = (off_t)pd->pd_meta[sdn]->rebuild_lba * 512; } } else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta[sdn]->generation != meta->generation || (meta->status & PROMISE_S_MARKED)) { /* Stale disk or dirty volume (unclean shutdown). 
*/ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); return (resurrection); } static void g_raid_md_promise_refill(struct g_raid_softc *sc) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; int update, updated, i, bad; md = sc->sc_md; restart: updated = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; /* Search for subdisk that needs replacement. */ bad = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) bad = 1; } if (!bad) continue; G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " "trying to refill.", vol->v_name); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { /* Skip failed. */ if (disk->d_state < G_RAID_DISK_S_SPARE) continue; /* Skip already used by this volume. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == disk) break; } if (i < vol->v_disks_count) continue; /* Try to use disk if it has empty extents. 
*/ pd = disk->d_md_data; if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) { update = g_raid_md_promise_start_disk(disk, -1, vol); } else update = 0; if (update) { updated = 1; g_raid_md_write_promise(md, vol, NULL, disk); break; } } } if (updated) goto restart; } static void g_raid_md_promise_start(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; int i; sc = vol->v_softc; md = sc->sc_md; pv = vol->v_md_data; meta = pv->pv_meta; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == PROMISE_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (meta->type == PROMISE_T_RAID1) { if (meta->array_width == 1) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (meta->type == PROMISE_T_RAID3) vol->v_raid_level = G_RAID_VOLUME_RL_RAID3; else if (meta->type == PROMISE_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; } else if (meta->type == PROMISE_T_SPAN) vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; else if (meta->type == PROMISE_T_JBOD) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ vol->v_disks_count = meta->total_disks; vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ if (meta->total_sectors_high < 256) /* If value looks sane. */ vol->v_mediasize |= ((off_t)meta->total_sectors_high << 32) * 512; //ZZZ vol->v_sectorsize = 512; //ZZZ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; sd->sd_offset = (off_t)meta->disk_offset * 512; //ZZZ sd->sd_size = (off_t)meta->disk_sectors * 512; //ZZZ } g_raid_start_volume(vol); /* Make all disks found till the moment take their places. 
*/ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i]->volume_id == meta->volume_id) g_raid_md_promise_start_disk(disk, i, vol); } } pv->pv_started = 1; callout_stop(&pv->pv_start_co); G_RAID_DEBUG1(0, sc, "Volume started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } static void g_raid_promise_go(void *arg) { struct g_raid_volume *vol; struct g_raid_softc *sc; struct g_raid_md_promise_pervolume *pv; vol = arg; pv = vol->v_md_data; sc = vol->v_softc; if (!pv->pv_started) { G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, G_RAID_EVENT_VOLUME); } } static void g_raid_md_promise_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct promise_raid_conf *pdmeta; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_raid_volume *vol; int i; char buf[33]; sc = disk->d_softc; md = sc->sc_md; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); g_raid_md_promise_refill(sc); return; } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. */ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) { promise_meta_get_name(pdmeta, buf); vol = g_raid_create_volume(sc, buf, pdmeta->array_number); pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); pv->pv_id = pdmeta->volume_id; vol->v_md_data = pv; callout_init(&pv->pv_start_co, 1); callout_reset(&pv->pv_start_co, g_raid_start_timeout * hz, g_raid_promise_go, vol); } else pv = vol->v_md_data; /* If we haven't started yet - check metadata freshness. 
*/ if (pv->pv_meta == NULL || !pv->pv_started) { if (pv->pv_meta == NULL || ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = promise_meta_copy(pdmeta); pv->pv_generation = pv->pv_meta->generation; pv->pv_disks_present = 1; } else if (pdmeta->generation == pv->pv_generation) { pv->pv_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", pv->pv_disks_present, pv->pv_meta->total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } } } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. */ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) continue; pv = vol->v_md_data; if (pv->pv_started) { if (g_raid_md_promise_start_disk(disk, i, vol)) g_raid_md_write_promise(md, vol, NULL, NULL); } else { /* If we collected all needed disks - start array. */ if (pv->pv_disks_present == pv->pv_meta->total_disks) g_raid_md_promise_start(vol); } } } static int g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_geom *geom; struct g_raid_softc *sc; /* Search for existing node. */ LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } if (geom != NULL) { *gp = geom; return (G_RAID_MD_TASTE_EXISTING); } /* Create new one if not found. 
*/ sc = g_raid_create_node(mp, "Promise", md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_softc *sc; struct g_raid_disk *disk; struct promise_raid_conf *meta, *metaarr[4]; struct g_raid_md_promise_perdisk *pd; struct g_geom *geom; - int error, i, j, result, len, subdisks; + int i, j, result, len, subdisks; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name); pp = cp->provider; /* Read metadata from device. */ meta = NULL; vendor = 0xffff; if (g_access(cp, 1, 0, 0) != 0) return (G_RAID_MD_TASTE_FAIL); g_topology_unlock(); len = 2; if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); subdisks = promise_meta_read(cp, metaarr); g_topology_lock(); g_access(cp, -1, 0, 0); if (subdisks == 0) { if (g_raid_aggressive_spare) { if (vendor == 0x105a || vendor == 0x1002) { G_RAID_DEBUG(1, "No Promise metadata, forcing spare."); goto search; } else { G_RAID_DEBUG(1, "Promise/ATI vendor mismatch " "0x%04x != 0x105a/0x1002", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Metadata valid. Print it. */ for (i = 0; i < subdisks; i++) g_raid_md_promise_print(metaarr[i]); /* Purge meaningless (empty/spare) records. */ for (i = 0; i < subdisks; ) { if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) { i++; continue; } free(metaarr[i], M_MD_PROMISE); for (j = i; j < subdisks - 1; j++) metaarr[i] = metaarr[j + 1]; metaarr[PROMISE_MAX_SUBDISKS - 1] = NULL; subdisks--; } search: /* Search for matching node. */ sc = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } /* Found matching node. 
*/ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; snprintf(name, sizeof(name), "Promise"); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; } rcp = g_new_consumer(geom); g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); pd->pd_subdisks = subdisks; for (i = 0; i < subdisks; i++) pd->pd_meta[i] = metaarr[i]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; - /* Read kernel dumping information. */ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", - rcp->provider->name, error); + g_raid_get_disk_info(disk); g_raid_md_promise_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); } static int g_raid_md_event_promise(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = md->mdo_softc; if (disk == NULL) return (-1); switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* Delete disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to all disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left. 
*/ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (0); } return (-2); } static int g_raid_md_volume_event_promise(struct g_raid_md_object *md, struct g_raid_volume *vol, u_int event) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; switch (event) { case G_RAID_VOLUME_E_STARTMD: if (!pv->pv_started) g_raid_md_promise_start(vol); return (0); } return (-2); } static int g_raid_md_ctl_promise(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS]; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; uint32_t offs[PROMISE_MAX_DISKS], esize; int numdisks, i, len, level, qual; int error; sc = md->mdo_softc; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_promise_supported(level, qual, numdisks, force ? 
*force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = INT64_MAX; sectorsize = 0; bzero(disks, sizeof(disks)); bzero(offs, sizeof(offs)); for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) continue; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk != NULL) { if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' is in a " "wrong state (%s).", diskname, g_raid_disk_state2str(disk->d_state)); error = -7; break; } pd = disk->d_md_data; if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) { gctl_error(req, "Disk '%s' already " "used by %d volumes.", diskname, pd->pd_subdisks); error = -7; break; } pp = disk->d_consumer->provider; disks[i] = disk; promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, pp->mediasize / pp->sectorsize, &offs[i], &esize); size = MIN(size, (off_t)esize * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); continue; } g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -8; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; disks[i] = disk; cp->private = disk; g_topology_unlock(); if (pp->mediasize / pp->sectorsize > UINT32_MAX) { gctl_error(req, "Disk '%s' is too big.", diskname); error = -8; break; } - /* Read kernel dumping information. 
*/ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); /* Reserve some space for metadata. */ size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); } if (error != 0) { for (i = 0; i < numdisks; i++) { if (disks[i] != NULL && disks[i]->d_state == G_RAID_DISK_S_NONE) g_raid_destroy_disk(disks[i]); } return (error); } if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1 || level == G_RAID_VOLUME_RL_SINGLE || level == G_RAID_VOLUME_RL_CONCAT) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... 
*/ pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0); pv->pv_generation = 0; pv->pv_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = disks[i]; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = (off_t)offs[i] * 512; sd->sd_size = size; if (disk == NULL) continue; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { gctl_error(req, "`add` command is not applicable, " "use `label` instead."); return (-99); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. 
*/ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. 
*/ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_promise_purge_disks(sc); g_raid_md_write_promise(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_promise(md, NULL, disk); continue; } /* Erase metadata on deleting disk and destroy it. */ promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); } g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to remaining disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. 
*/ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); if (pp->mediasize / pp->sectorsize > UINT32_MAX) { gctl_error(req, "Disk '%s' is too big.", diskname); g_raid_kill_consumer(sc, cp); error = -8; break; } pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; - /* Read kernel dumping information. */ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); promise_meta_write_spare(cp); g_raid_md_promise_refill(sc); } return (error); } return (-100); } static int g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t rebuild_lba64; int i, j, pos, rebuild; sc = md->mdo_softc; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Generate new per-volume metadata for affected volumes. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; /* Skip volumes not related to specified targets. 
*/ if (tvol != NULL && vol != tvol) continue; if (tsd != NULL && vol != tsd->sd_volume) continue; if (tdisk != NULL) { for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_disk == tdisk) break; } if (i >= vol->v_disks_count) continue; } pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; pv->pv_generation++; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); if (pv->pv_meta != NULL) memcpy(meta, pv->pv_meta, sizeof(*meta)); memcpy(meta->promise_id, PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->generation = pv->pv_generation; meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE | PROMISE_S_INITED | PROMISE_S_READY; if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) meta->status |= PROMISE_S_DEGRADED; if (vol->v_dirty) meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = PROMISE_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = PROMISE_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) meta->type = PROMISE_T_RAID3; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->type = PROMISE_T_RAID5; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) meta->type = PROMISE_T_SPAN; else meta->type = PROMISE_T_JBOD; meta->total_disks = vol->v_disks_count; meta->stripe_shift = ffs(vol->v_strip_size / 1024); meta->array_width = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->array_width /= 2; meta->array_number = vol->v_global_id; meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->total_sectors_high = (vol->v_mediasize / vol->v_sectorsize) >> 32; meta->cylinders = meta->total_sectors / (255 * 63) - 1; meta->heads = 254; meta->sectors = 63; meta->volume_id = pv->pv_id; rebuild_lba64 = UINT64_MAX; 
rebuild = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); meta->disks[pos].flags = PROMISE_F_VALID | PROMISE_F_ASSIGNED; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) { meta->disks[pos].flags |= 0; } else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) { meta->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) { meta->disks[pos].flags |= PROMISE_F_ONLINE | PROMISE_F_REDIR; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; rebuild = 1; } else { meta->disks[pos].flags |= PROMISE_F_ONLINE; if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) { meta->status |= PROMISE_S_MARKED; if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; } } if (pv->pv_meta != NULL) { meta->disks[pos].id = pv->pv_meta->disks[pos].id; } else { meta->disks[pos].number = i * 2; arc4rand(&meta->disks[pos].id, sizeof(meta->disks[pos].id), 0); } } promise_meta_put_name(meta, vol->v_name); /* Try to mimic AMD BIOS rebuild/resync behavior. */ if (rebuild_lba64 != UINT64_MAX) { if (rebuild) meta->magic_3 = 0x03040010UL; /* Rebuild? */ else meta->magic_3 = 0x03040008UL; /* Resync? */ /* Translate from per-disk to per-volume LBA. */ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { rebuild_lba64 *= meta->array_width; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) { rebuild_lba64 *= meta->array_width - 1; } else rebuild_lba64 = 0; } else meta->magic_3 = 0x03000000UL; meta->rebuild_lba64 = rebuild_lba64; meta->magic_4 = 0x04010101UL; /* Replace per-volume metadata with new. 
*/ if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = meta; /* Copy new metadata to the disks, adding or replacing old. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; disk = sd->sd_disk; if (disk == NULL) continue; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (j = 0; j < pd->pd_subdisks; j++) { if (pd->pd_meta[j]->volume_id == meta->volume_id) break; } if (j == pd->pd_subdisks) pd->pd_subdisks++; if (pd->pd_meta[j] != NULL) free(pd->pd_meta[j], M_MD_PROMISE); pd->pd_meta[j] = promise_meta_copy(meta); pd->pd_meta[j]->disk = meta->disks[pos]; pd->pd_meta[j]->disk.number = pos; pd->pd_meta[j]->disk_offset = sd->sd_offset / 512; pd->pd_meta[j]->disk_sectors = sd->sd_size / 512; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { pd->pd_meta[j]->rebuild_lba = sd->sd_rebuild_pos / 512; } else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD) pd->pd_meta[j]->rebuild_lba = 0; else pd->pd_meta[j]->rebuild_lba = UINT32_MAX; pd->pd_updated = 1; } } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (!pd->pd_updated) continue; G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(disk)); for (i = 0; i < pd->pd_subdisks; i++) g_raid_md_promise_print(pd->pd_meta[i]); promise_meta_write(disk->d_consumer, pd->pd_meta, pd->pd_subdisks); pd->pd_updated = 0; } return (0); } static int g_raid_md_fail_disk_promise(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_promise_perdisk *pd; struct g_raid_subdisk *sd; int i, pos; sc = md->mdo_softc; pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. 
*/ if (tdisk->d_state != G_RAID_DISK_S_ACTIVE) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL) G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(tdisk)); for (i = 0; i < pd->pd_subdisks; i++) { pd->pd_meta[i]->disk.flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; pos = pd->pd_meta[i]->disk.number; if (pos >= 0 && pos < PROMISE_MAX_DISKS) { pd->pd_meta[i]->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } g_raid_md_promise_print(pd->pd_meta[i]); } if (tdisk->d_consumer != NULL) promise_meta_write(tdisk->d_consumer, pd->pd_meta, pd->pd_subdisks); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. 
*/ g_raid_md_write_promise(md, NULL, NULL, tdisk); g_raid_md_promise_refill(sc); return (0); } static int g_raid_md_free_disk_promise(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_promise_perdisk *pd; int i; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i] != NULL) { free(pd->pd_meta[i], M_MD_PROMISE); pd->pd_meta[i] = NULL; } } free(pd, M_MD_PROMISE); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_promise(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; if (pv && pv->pv_meta != NULL) { free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = NULL; } if (pv && !pv->pv_started) { pv->pv_started = 1; callout_stop(&pv->pv_start_co); } free(pv, M_MD_PROMISE); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_promise(struct g_raid_md_object *md) { return (0); } G_RAID_MD_DECLARE(promise, "Promise"); Index: stable/8/sys/geom/raid/md_sii.c =================================================================== --- stable/8/sys/geom/raid/md_sii.c (revision 243678) +++ stable/8/sys/geom/raid/md_sii.c (revision 243679) @@ -1,1692 +1,1669 @@ /*- * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_SII, "md_sii_data", "GEOM_RAID SiI metadata"); struct sii_raid_conf { uint16_t ata_params_00_53[54]; uint64_t total_sectors; /* 54 - 57 */ uint16_t ata_params_58_81[72]; uint16_t product_id; /* 130 */ uint16_t vendor_id; /* 131 */ uint16_t version_minor; /* 132 */ uint16_t version_major; /* 133 */ uint8_t timestamp[6]; /* 134 - 136 */ uint16_t strip_sectors; /* 137 */ uint16_t dummy_2; uint8_t disk_number; /* 139 */ uint8_t type; #define SII_T_RAID0 0x00 #define SII_T_RAID1 0x01 #define SII_T_RAID01 0x02 #define SII_T_SPARE 0x03 #define SII_T_CONCAT 0x04 #define SII_T_RAID5 0x10 #define SII_T_RESERVED 0xfd #define SII_T_JBOD 0xff uint8_t raid0_disks; /* 140 */ uint8_t raid0_ident; uint8_t raid1_disks; /* 141 */ uint8_t raid1_ident; uint64_t rebuild_lba; /* 142 - 145 */ uint32_t generation; /* 146 - 147 */ uint8_t disk_status; /* 148 */ #define SII_S_CURRENT 0x01 #define SII_S_REBUILD 0x02 #define SII_S_DROPPED 0x03 #define SII_S_REMOVED 0x04 uint8_t raid_status; #define SII_S_ONLINE 0x01 #define 
SII_S_AVAILABLE 0x02 uint8_t raid_location; /* 149 */ uint8_t disk_location; uint8_t auto_rebuild; /* 150 */ #define SII_R_REBUILD 0x00 #define SII_R_NOREBUILD 0xff uint8_t dummy_3; uint8_t name[16]; /* 151 - 158 */ uint16_t checksum; /* 159 */ uint16_t ata_params_160_255[96]; } __packed; struct g_raid_md_sii_perdisk { struct sii_raid_conf *pd_meta; int pd_disk_pos; off_t pd_disk_size; }; struct g_raid_md_sii_object { struct g_raid_md_object mdio_base; uint8_t mdio_timestamp[6]; uint8_t mdio_location; uint32_t mdio_generation; struct sii_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_sii; static g_raid_md_taste_t g_raid_md_taste_sii; static g_raid_md_event_t g_raid_md_event_sii; static g_raid_md_ctl_t g_raid_md_ctl_sii; static g_raid_md_write_t g_raid_md_write_sii; static g_raid_md_fail_disk_t g_raid_md_fail_disk_sii; static g_raid_md_free_disk_t g_raid_md_free_disk_sii; static g_raid_md_free_t g_raid_md_free_sii; static kobj_method_t g_raid_md_sii_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_sii), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_sii), KOBJMETHOD(g_raid_md_event, g_raid_md_event_sii), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_sii), KOBJMETHOD(g_raid_md_write, g_raid_md_write_sii), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_sii), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_sii), KOBJMETHOD(g_raid_md_free, g_raid_md_free_sii), { 0, 0 } }; static struct g_raid_md_class g_raid_md_sii_class = { "SiI", g_raid_md_sii_methods, sizeof(struct g_raid_md_sii_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_sii_print(struct sii_raid_conf *meta) { if (g_raid_debug < 1) return; printf("********* ATA SiI RAID Metadata *********\n"); printf("total_sectors %llu\n", (long long 
unsigned)meta->total_sectors); printf("product_id 0x%04x\n", meta->product_id); printf("vendor_id 0x%04x\n", meta->vendor_id); printf("version_minor 0x%04x\n", meta->version_minor); printf("version_major 0x%04x\n", meta->version_major); printf("timestamp 0x%02x%02x%02x%02x%02x%02x\n", meta->timestamp[5], meta->timestamp[4], meta->timestamp[3], meta->timestamp[2], meta->timestamp[1], meta->timestamp[0]); printf("strip_sectors %d\n", meta->strip_sectors); printf("disk_number %d\n", meta->disk_number); printf("type 0x%02x\n", meta->type); printf("raid0_disks %d\n", meta->raid0_disks); printf("raid0_ident %d\n", meta->raid0_ident); printf("raid1_disks %d\n", meta->raid1_disks); printf("raid1_ident %d\n", meta->raid1_ident); printf("rebuild_lba %llu\n", (long long unsigned)meta->rebuild_lba); printf("generation %d\n", meta->generation); printf("disk_status %d\n", meta->disk_status); printf("raid_status %d\n", meta->raid_status); printf("raid_location %d\n", meta->raid_location); printf("disk_location %d\n", meta->disk_location); printf("auto_rebuild %d\n", meta->auto_rebuild); printf("name <%.16s>\n", meta->name); printf("checksum 0x%04x\n", meta->checksum); printf("=================================================\n"); } static struct sii_raid_conf * sii_meta_copy(struct sii_raid_conf *meta) { struct sii_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int sii_meta_total_disks(struct sii_raid_conf *meta) { switch (meta->type) { case SII_T_RAID0: case SII_T_RAID5: case SII_T_CONCAT: return (meta->raid0_disks); case SII_T_RAID1: return (meta->raid1_disks); case SII_T_RAID01: return (meta->raid0_disks * meta->raid1_disks); case SII_T_SPARE: case SII_T_JBOD: return (1); } return (0); } static int sii_meta_disk_pos(struct sii_raid_conf *meta, struct sii_raid_conf *pdmeta) { if (pdmeta->type == SII_T_SPARE) return (-3); if (memcmp(&meta->timestamp, &pdmeta->timestamp, 6) != 0) return (-1); 
switch (pdmeta->type) { case SII_T_RAID0: case SII_T_RAID1: case SII_T_RAID5: case SII_T_CONCAT: return (pdmeta->disk_number); case SII_T_RAID01: return (pdmeta->raid1_ident * pdmeta->raid1_disks + pdmeta->raid0_ident); case SII_T_JBOD: return (0); } return (-1); } static void sii_meta_get_name(struct sii_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void sii_meta_put_name(struct sii_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static struct sii_raid_conf * sii_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct sii_raid_conf *meta; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check vendor ID. */ if (meta->vendor_id != 0x1095) { G_RAID_DEBUG(1, "SiI vendor ID check failed on %s (0x%04x)", pp->name, meta->vendor_id); free(meta, M_MD_SII); return (NULL); } /* Check metadata major version. */ if (meta->version_major != 2) { G_RAID_DEBUG(1, "SiI version check failed on %s (%d.%d)", pp->name, meta->version_major, meta->version_minor); free(meta, M_MD_SII); return (NULL); } /* Check metadata checksum. */ for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i <= 159; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "SiI checksum check failed on %s", pp->name); free(meta, M_MD_SII); return (NULL); } /* Check raid type. 
*/ if (meta->type != SII_T_RAID0 && meta->type != SII_T_RAID1 && meta->type != SII_T_RAID01 && meta->type != SII_T_SPARE && meta->type != SII_T_RAID5 && meta->type != SII_T_CONCAT && meta->type != SII_T_JBOD) { G_RAID_DEBUG(1, "SiI unknown RAID level on %s (0x%02x)", pp->name, meta->type); free(meta, M_MD_SII); return (NULL); } return (meta); } static int sii_meta_write(struct g_consumer *cp, struct sii_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 159; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); /* Write 4 copies of metadata. */ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); break; } } free(buf, M_MD_SII); return (error); } static int sii_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, i; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); /* Write 4 copies of metadata. 
*/ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_SII); return (error); } static int sii_meta_write_spare(struct g_consumer *cp) { struct sii_raid_conf *meta; int error; meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); meta->total_sectors = cp->provider->mediasize / cp->provider->sectorsize - 0x800; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; meta->timestamp[0] = arc4random(); meta->timestamp[1] = arc4random(); meta->timestamp[2] = arc4random(); meta->timestamp[3] = arc4random(); meta->timestamp[4] = arc4random(); meta->timestamp[5] = arc4random(); meta->type = SII_T_SPARE; meta->generation = 1; meta->raid1_ident = 0xff; meta->raid_location = arc4random(); error = sii_meta_write(cp, meta); free(meta, M_MD_SII); return (error); } static struct g_raid_disk * g_raid_md_sii_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_sii_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_sii_supported(int level, int qual, int disks, int force) { if (disks > 8) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks < 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LS) return (0); break; default: 
return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_sii_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd, *oldpd; struct sii_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; olddisk = NULL; /* Find disk position in metadata by it's serial. */ if (pd->pd_meta != NULL) disk_pos = sii_meta_disk_pos(meta, pd->pd_meta); else disk_pos = -3; if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. 
*/ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (disk_pos == -3 || pd->pd_disk_pos == -3) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_sii_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More then one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. 
*/ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (pd->pd_meta->disk_status == SII_S_CURRENT || pd->pd_meta->disk_status == SII_S_REBUILD) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes, * in concat mode. Update from real disk size. */ if (meta->type == SII_T_CONCAT || meta->type == SII_T_JBOD) sd->sd_size = pd->pd_disk_size - 0x800 * 512; if (resurrection) { /* New or ex-spare disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta->disk_status == SII_S_REBUILD) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta->generation == meta->generation) sd->sd_rebuild_pos = pd->pd_meta->rebuild_lba * 512; else sd->sd_rebuild_pos = 0; } else if (pd->pd_meta->disk_status == SII_S_CURRENT) { if (pd->pd_meta->raid_status == SII_S_ONLINE || pd->pd_meta->generation != meta->generation) { /* Dirty or resyncing disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. 
*/ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_sii_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_SII); } static void g_raid_md_sii_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; update = 0; do { /* Make sure we miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_sii(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. 
*/ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_SII, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_sii_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_sii_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct sii_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *best; off_t size; int j, disk_pos; uint32_t gendiff, bestgendiff; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ sii_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); vol->v_mediasize = (off_t)meta->total_sectors * 512; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == SII_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; size = vol->v_mediasize / mdi->mdio_total_disks; } else if (meta->type == SII_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; size = vol->v_mediasize; } else if (meta->type == SII_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; size = vol->v_mediasize / (mdi->mdio_total_disks / 2); } else if (meta->type == SII_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; size = 0; } else if (meta->type == SII_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LS; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else if (meta->type == SII_T_JBOD) { vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; size = 0; } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; size = 0; } vol->v_strip_size = meta->strip_sectors * 512; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = 0; sd->sd_size = size; } g_raid_start_volume(vol); 
/* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* * Make all disks found till the moment take their places * in order of their generation numbers. */ do { best = NULL; bestgendiff = 0xffffffff; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_RAID_DISK_S_NONE) continue; pd = disk->d_md_data; if (pd->pd_meta == NULL) gendiff = 0xfffffffe; else gendiff = meta->generation - pd->pd_meta->generation; if (gendiff < bestgendiff) { best = disk; bestgendiff = gendiff; } } if (best != NULL) g_raid_md_sii_start_disk(best); } while (best != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. 
*/ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_sii_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct sii_raid_conf *pdmeta; struct g_raid_md_sii_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_sii_start_disk(disk)) g_raid_md_write_sii(md, NULL, NULL, NULL); } else { if (mdi->mdio_meta == NULL || ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = sii_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_total_disks = sii_meta_total_disks(pdmeta); mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } /* If we collected all needed disks - start array. 
*/ if (mdi->mdio_disks_present == mdi->mdio_total_disks) g_raid_md_sii_start(sc); } } static void g_raid_sii_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_sii_object *mdi; char name[32]; mdi = (struct g_raid_md_sii_object *)md; mdi->mdio_timestamp[5] = arc4random(); mdi->mdio_timestamp[4] = arc4random(); mdi->mdio_timestamp[3] = arc4random(); mdi->mdio_timestamp[2] = arc4random(); mdi->mdio_timestamp[1] = arc4random(); mdi->mdio_timestamp[0] = arc4random(); mdi->mdio_location = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_sii_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct sii_raid_conf *meta; struct g_raid_md_sii_perdisk *pd; struct g_geom *geom; - int error, disk_pos, result, spare, len; + int disk_pos, result, spare, len; char name[32]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting SiI on %s", cp->provider->name); mdi = (struct g_raid_md_sii_object *)md; pp = cp->provider; /* Read metadata from device. 
*/ meta = NULL; vendor = 0xffff; if (g_access(cp, 1, 0, 0) != 0) return (G_RAID_MD_TASTE_FAIL); g_topology_unlock(); len = 2; if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = sii_meta_read(cp); g_topology_lock(); g_access(cp, -1, 0, 0); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x1095) { G_RAID_DEBUG(1, "No SiI metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "SiI vendor mismatch 0x%04x != 0x1095", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = sii_meta_disk_pos(meta, meta); if (disk_pos == -1) { G_RAID_DEBUG(1, "SiI disk position not found"); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_sii_print(meta); G_RAID_DEBUG(1, "SiI disk position %d", disk_pos); spare = (meta->type == SII_T_SPARE) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_sii_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_location == meta->raid_location && memcmp(&mdi1->mdio_timestamp, &meta->timestamp, 6) == 0) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. 
*/ result = G_RAID_MD_TASTE_NEW; memcpy(&mdi->mdio_timestamp, &meta->timestamp, 6); mdi->mdio_location = meta->raid_location; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_sii_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-SiI"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } rcp = g_new_consumer(geom); g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; } else { pd->pd_disk_pos = -1; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; - /* Read kernel dumping information. 
*/ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", - rcp->provider->name, error); + g_raid_get_disk_info(disk); g_raid_md_sii_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_SII); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_sii(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_sii_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. 
*/ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_sii(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LS"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_sii_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. 
*/ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); - /* Read kernel dumping information. */ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= 0x800 * sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. 
*/ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. 
*/ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) sii_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_sii(md, NULL, disk); continue; 
} pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ sii_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); - /* Read kernel dumping information. 
*/ - disk->d_kd.offset = 0; - disk->d_kd.length = OFF_MAX; - len = sizeof(disk->d_kd); - g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); - if (disk->d_kd.di.dumper == NULL) - G_RAID_DEBUG1(2, sc, - "Dumping not supported by %s.", - cp->provider->name); + g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { sii_meta_write_spare(cp); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_sii(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_sii(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct sii_raid_conf *meta; int i; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. 
*/ meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); if (mdi->mdio_meta) memcpy(meta, mdi->mdio_meta, sizeof(*meta)); meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; memcpy(&meta->timestamp, &mdi->mdio_timestamp, 6); meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) { meta->type = SII_T_RAID0; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { meta->type = SII_T_RAID1; meta->raid0_disks = 0xff; meta->raid1_disks = vol->v_disks_count; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { meta->type = SII_T_RAID01; meta->raid0_disks = vol->v_disks_count / 2; meta->raid1_disks = 2; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) { meta->type = SII_T_JBOD; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else { meta->type = SII_T_RAID5; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } meta->generation = mdi->mdio_generation; meta->raid_status = vol->v_dirty ? SII_S_ONLINE : SII_S_AVAILABLE; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_STALE || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) meta->raid_status = SII_S_ONLINE; } meta->raid_location = mdi->mdio_location; sii_meta_put_name(meta, vol->v_name); /* We are done. Print meta data and store them to disks. 
*/ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_SII); pd->pd_meta = NULL; } pd->pd_meta = sii_meta_copy(meta); if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { if (sd->sd_state < G_RAID_SUBDISK_S_NEW) pd->pd_meta->disk_status = SII_S_DROPPED; else if (sd->sd_state < G_RAID_SUBDISK_S_STALE) { pd->pd_meta->disk_status = SII_S_REBUILD; pd->pd_meta->rebuild_lba = sd->sd_rebuild_pos / vol->v_sectorsize; } else pd->pd_meta->disk_status = SII_S_CURRENT; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0xff; pd->pd_meta->raid1_ident = 0; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { pd->pd_meta->disk_number = sd->sd_pos / meta->raid1_disks; pd->pd_meta->raid0_ident = sd->sd_pos % meta->raid1_disks; pd->pd_meta->raid1_ident = sd->sd_pos / meta->raid1_disks; } else { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0; pd->pd_meta->raid1_ident = 0xff; } } G_RAID_DEBUG(1, "Writing SiI metadata to %s", g_raid_get_diskname(disk)); g_raid_md_sii_print(pd->pd_meta); sii_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_sii(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_sii_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_sii_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. 
*/
	if (tdisk->d_consumer) {
		if (pd->pd_meta) {
			/* Stamp the disk's own metadata copy as removed. */
			pd->pd_meta->disk_status = SII_S_REMOVED;
			sii_meta_write(tdisk->d_consumer, pd->pd_meta);
		} else
			sii_meta_erase(tdisk->d_consumer);
	}

	/* Change states. */
	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
		g_raid_change_subdisk_state(sd,
		    G_RAID_SUBDISK_S_FAILED);
		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
		    G_RAID_EVENT_SUBDISK);
	}

	/* Write updated metadata to remaining disks. */
	g_raid_md_write_sii(md, NULL, NULL, tdisk);

	/* Check if anything left except placeholders. */
	if (g_raid_ndisks(sc, -1) ==
	    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
		g_raid_destroy_node(sc, 0);
	else
		g_raid_md_sii_refill(sc);
	return (0);
}

/*
 * Release the SiI per-disk data hanging off disk->d_md_data: free the
 * cached metadata copy (pd_meta) if there is one, free the per-disk
 * structure itself and clear the disk's pointer to it.  `md` is not
 * used.  Always returns 0.
 */
static int
g_raid_md_free_disk_sii(struct g_raid_md_object *md, struct g_raid_disk *disk)
{
	struct g_raid_md_sii_perdisk *pd;

	pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data;
	if (pd->pd_meta != NULL) {
		free(pd->pd_meta, M_MD_SII);
		pd->pd_meta = NULL;
	}
	free(pd, M_MD_SII);
	disk->d_md_data = NULL;
	return (0);
}

/*
 * Release the SiI metadata object state.  If the array was never started,
 * stop the start-timeout callout and drop the root mount hold; then free
 * the cached array metadata.  Always returns 0.
 */
static int
g_raid_md_free_sii(struct g_raid_md_object *md)
{
	struct g_raid_md_sii_object *mdi;

	mdi = (struct g_raid_md_sii_object *)md;
	if (!mdi->mdio_started) {
		/*
		 * NOTE(review): this store is a no-op -- the branch is
		 * entered only when mdio_started is already zero.
		 */
		mdi->mdio_started = 0;
		callout_stop(&mdi->mdio_start_co);
		G_RAID_DEBUG1(1, md->mdo_softc,
		    "root_mount_rel %p", mdi->mdio_rootmount);
		root_mount_rel(mdi->mdio_rootmount);
		mdi->mdio_rootmount = NULL;
	}
	if (mdi->mdio_meta != NULL) {
		free(mdi->mdio_meta, M_MD_SII);
		mdi->mdio_meta = NULL;
	}
	return (0);
}

G_RAID_MD_DECLARE(sii, "SiI");
Index: stable/8/sys/geom/raid/tr_concat.c
===================================================================
--- stable/8/sys/geom/raid/tr_concat.c (revision 243678)
+++ stable/8/sys/geom/raid/tr_concat.c (revision 243679)
@@ -1,344 +1,345 @@
/*-
 * Copyright (c) 2010 Alexander Motin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_TR_CONCAT, "tr_concat_data", "GEOM_RAID CONCAT data"); struct g_raid_tr_concat_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopped; }; static g_raid_tr_taste_t g_raid_tr_taste_concat; static g_raid_tr_event_t g_raid_tr_event_concat; static g_raid_tr_start_t g_raid_tr_start_concat; static g_raid_tr_stop_t g_raid_tr_stop_concat; static g_raid_tr_iostart_t g_raid_tr_iostart_concat; static g_raid_tr_iodone_t g_raid_tr_iodone_concat; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_concat; static g_raid_tr_free_t g_raid_tr_free_concat; static kobj_method_t g_raid_tr_concat_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_concat), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_concat), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_concat), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_concat), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_concat), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_concat), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_concat), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_concat), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_concat_class = { "CONCAT", g_raid_tr_concat_methods, sizeof(struct g_raid_tr_concat_object), .trc_enable = 1, .trc_priority = 50 }; static int g_raid_tr_taste_concat(struct g_raid_tr_object *tr, struct g_raid_volume *volume) { struct g_raid_tr_concat_object *trs; trs = (struct g_raid_tr_concat_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_SINGLE && tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_CONCAT && !(tr->tro_volume->v_disks_count == 1 && tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_UNKNOWN)) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_concat(struct g_raid_volume *vol) 
{
	struct g_raid_tr_concat_object *trs;
	struct g_raid_softc *sc;
	off_t size;
	u_int s;
	int i, n, f;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_concat_object *)vol->v_tr;
	/*
	 * Pick the target volume state: the stopped/starting flags win;
	 * otherwise every subdisk must be ACTIVE or FAILED for the volume
	 * to be OPTIMAL (no failures) or SUBOPTIMAL, else it is BROKEN.
	 */
	if (trs->trso_stopped)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
		f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED);
		if (n + f == vol->v_disks_count) {
			if (f == 0)
				s = G_RAID_VOLUME_S_OPTIMAL;
			else
				s = G_RAID_VOLUME_S_SUBOPTIMAL;
		} else
			s = G_RAID_VOLUME_S_BROKEN;
	}
	if (s != vol->v_state) {

		/*
		 * Some metadata modules may not know CONCAT volume
		 * mediasize until all disks connected. Recalculate.
		 */
		if (G_RAID_VOLUME_S_ALIVE(s) &&
		    !G_RAID_VOLUME_S_ALIVE(vol->v_state)) {
			size = 0;
			/* Sum the sizes of all subdisks not in state NONE. */
			for (i = 0; i < vol->v_disks_count; i++) {
				if (vol->v_subdisks[i].sd_state !=
				    G_RAID_SUBDISK_S_NONE)
					size += vol->v_subdisks[i].sd_size;
			}
			vol->v_mediasize = size;
		}

		/* Announce the transition, apply it, then persist it. */
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopped)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	return (0);
}

/*
 * Subdisk event handler for the CONCAT transformation.  A subdisk in any
 * state other than NONE, FAILED or ACTIVE is promoted to ACTIVE; if that
 * changed its state and the transformation is neither starting nor
 * stopped, metadata is rewritten.  The volume state is then recomputed.
 * The `event` code itself is not inspected.  Always returns 0.
 */
static int
g_raid_tr_event_concat(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{
	struct g_raid_tr_concat_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	int state;

	trs = (struct g_raid_tr_concat_object *)tr;
	vol = tr->tro_volume;
	sc = vol->v_softc;

	/* Remember the state on entry so a promotion can be detected. */
	state = sd->sd_state;
	if (state != G_RAID_SUBDISK_S_NONE &&
	    state != G_RAID_SUBDISK_S_FAILED &&
	    state != G_RAID_SUBDISK_S_ACTIVE) {
		G_RAID_DEBUG1(1, sc,
		    "Promote subdisk %s:%d from %s to ACTIVE.",
		    vol->v_name, sd->sd_pos,
		    g_raid_subdisk_state2str(sd->sd_state));
		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	}
	if (state != sd->sd_state &&
	    !trs->trso_starting && !trs->trso_stopped)
		g_raid_write_metadata(sc, vol, sd, NULL);
	g_raid_tr_update_state_concat(vol);
	return (0);
}

static int
g_raid_tr_start_concat(struct
g_raid_tr_object *tr)
{
	struct g_raid_tr_concat_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_concat_object *)tr;
	vol = tr->tro_volume;
	/* Leave the "starting" phase and recompute the volume state. */
	trs->trso_starting = 0;
	g_raid_tr_update_state_concat(vol);
	return (0);
}

/*
 * Stop the CONCAT transformation: clear the starting flag, set the
 * stopped flag and recompute the volume state.  Always returns 0.
 */
static int
g_raid_tr_stop_concat(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_concat_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_concat_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	trs->trso_stopped = 1;
	g_raid_tr_update_state_concat(vol);
	return (0);
}

/*
 * Start an I/O request on a CONCAT volume.  The request is completed with
 * EIO unless the volume is OPTIMAL or SUBOPTIMAL; BIO_FLUSH is handed to
 * g_raid_tr_flush_common().  Otherwise the request is cut at subdisk
 * boundaries, one clone of `bp` per subdisk touched, and the clones are
 * dispatched only after all of them have been allocated.
 */
static void
g_raid_tr_iostart_concat(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, length, remain;
	u_int no;

	vol = tr->tro_volume;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) {
		g_raid_iodone(bp, EIO);
		return;
	}
	if (bp->bio_cmd == BIO_FLUSH) {
		g_raid_tr_flush_common(tr, bp);
		return;
	}

	offset = bp->bio_offset;
	remain = bp->bio_length;
	addr = bp->bio_data;
	no = 0;
	/* Skip whole subdisks until `offset` lands inside subdisk `no`. */
	while (no < vol->v_disks_count &&
	    offset >= vol->v_subdisks[no].sd_size) {
		offset -= vol->v_subdisks[no].sd_size;
		no++;
	}
	KASSERT(no < vol->v_disks_count,
	    ("Request starts after volume end (%ju)", bp->bio_offset));
	bioq_init(&queue);
	do {
		sd = &vol->v_subdisks[no];
		/* Take what is left of this subdisk, or of the request. */
		length = MIN(sd->sd_size - offset, remain);
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset;
		cbp->bio_data = addr;
		cbp->bio_length = length;
		/* Park the target subdisk in the clone until dispatch. */
		cbp->bio_caller1 = sd;
		bioq_insert_tail(&queue, cbp);
		remain -= length;
		/*
		 * The '-'/'+' prefixed lines below are change markers kept
		 * from the revision diff this text was taken from.
		 */
-		addr += length;
+		if (bp->bio_cmd != BIO_DELETE)
+			addr += length;
		offset = 0;
		no++;
		KASSERT(no < vol->v_disks_count || remain == 0,
		    ("Request ends after volume end (%ju, %ju)",
			bp->bio_offset, bp->bio_length));
	} while (remain > 0);
	/* Every clone was allocated: hand each one to its subdisk. */
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure: for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static int g_raid_tr_kerneldump_concat(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t boffset, size_t blength) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; char *addr; off_t offset, length, remain; int error, no; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) return (ENXIO); offset = boffset; remain = blength; addr = virtual; no = 0; while (no < vol->v_disks_count && offset >= vol->v_subdisks[no].sd_size) { offset -= vol->v_subdisks[no].sd_size; no++; } KASSERT(no < vol->v_disks_count, ("Request starts after volume end (%ju)", boffset)); do { sd = &vol->v_subdisks[no]; length = MIN(sd->sd_size - offset, remain); error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no], addr, 0, offset, length); if (error != 0) return (error); remain -= length; addr += length; offset = 0; no++; KASSERT(no < vol->v_disks_count || remain == 0, ("Request ends after volume end (%ju, %zu)", boffset, blength)); } while (remain > 0); return (0); } static void g_raid_tr_iodone_concat(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd,struct bio *bp) { struct bio *pbp; pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, bp->bio_error); } } static int g_raid_tr_free_concat(struct g_raid_tr_object *tr) { return (0); } G_RAID_TR_DECLARE(concat, "CONCAT"); Index: stable/8/sys/geom/raid/tr_raid0.c =================================================================== --- stable/8/sys/geom/raid/tr_raid0.c (revision 243678) +++ stable/8/sys/geom/raid/tr_raid0.c (revision 243679) @@ -1,327 +1,328 @@ /*- * Copyright (c) 2010 Alexander Motin * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_TR_RAID0, "tr_raid0_data", "GEOM_RAID RAID0 data"); struct g_raid_tr_raid0_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopped; }; static g_raid_tr_taste_t g_raid_tr_taste_raid0; static g_raid_tr_event_t g_raid_tr_event_raid0; static g_raid_tr_start_t g_raid_tr_start_raid0; static g_raid_tr_stop_t g_raid_tr_stop_raid0; static g_raid_tr_iostart_t g_raid_tr_iostart_raid0; static g_raid_tr_iodone_t g_raid_tr_iodone_raid0; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid0; static g_raid_tr_free_t g_raid_tr_free_raid0; static kobj_method_t g_raid_tr_raid0_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid0), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid0), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid0), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid0), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid0), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid0), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid0), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid0), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid0_class = { "RAID0", g_raid_tr_raid0_methods, sizeof(struct g_raid_tr_raid0_object), .trc_enable = 1, .trc_priority = 100 }; static int g_raid_tr_taste_raid0(struct g_raid_tr_object *tr, struct g_raid_volume *volume) { struct g_raid_tr_raid0_object *trs; trs = (struct g_raid_tr_raid0_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID0 || tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid0(struct g_raid_volume *vol) { struct g_raid_tr_raid0_object *trs; struct g_raid_softc *sc; u_int s; int n, f; sc = vol->v_softc; trs = (struct 
g_raid_tr_raid0_object *)vol->v_tr;
	/*
	 * Pick the target volume state: the stopped/starting flags win;
	 * otherwise every subdisk must be ACTIVE or FAILED for the volume
	 * to be OPTIMAL (no failures) or SUBOPTIMAL, else it is BROKEN.
	 */
	if (trs->trso_stopped)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
		f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED);
		if (n + f == vol->v_disks_count) {
			if (f == 0)
				s = G_RAID_VOLUME_S_OPTIMAL;
			else
				s = G_RAID_VOLUME_S_SUBOPTIMAL;
		} else
			s = G_RAID_VOLUME_S_BROKEN;
	}
	if (s != vol->v_state) {
		/* Announce the transition, apply it, then persist it. */
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopped)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	return (0);
}

/*
 * Subdisk event handler for the RAID0 transformation.  A subdisk in any
 * state other than NONE, FAILED or ACTIVE is promoted to ACTIVE; if that
 * changed its state and the transformation is neither starting nor
 * stopped, metadata is rewritten.  The volume state is then recomputed.
 * The `event` code itself is not inspected.  Always returns 0.
 */
static int
g_raid_tr_event_raid0(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{
	struct g_raid_tr_raid0_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	int state;

	trs = (struct g_raid_tr_raid0_object *)tr;
	vol = tr->tro_volume;
	sc = vol->v_softc;

	/* Remember the state on entry so a promotion can be detected. */
	state = sd->sd_state;
	if (state != G_RAID_SUBDISK_S_NONE &&
	    state != G_RAID_SUBDISK_S_FAILED &&
	    state != G_RAID_SUBDISK_S_ACTIVE) {
		G_RAID_DEBUG1(1, sc,
		    "Promote subdisk %s:%d from %s to ACTIVE.",
		    vol->v_name, sd->sd_pos,
		    g_raid_subdisk_state2str(sd->sd_state));
		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	}
	if (state != sd->sd_state &&
	    !trs->trso_starting && !trs->trso_stopped)
		g_raid_write_metadata(sc, vol, sd, NULL);
	g_raid_tr_update_state_raid0(vol);
	return (0);
}

/*
 * Start the RAID0 transformation: clear the starting flag and recompute
 * the volume state.  Always returns 0.
 */
static int
g_raid_tr_start_raid0(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid0_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid0_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid0(vol);
	return (0);
}

/*
 * Stop the RAID0 transformation: clear the starting flag and set the
 * stopped flag before the volume state is recomputed.
 */
static int
g_raid_tr_stop_raid0(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid0_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid0_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	trs->trso_stopped = 1;
	g_raid_tr_update_state_raid0(vol);
	return (0);
}

/*
 * Start an I/O request on a RAID0 volume.  The request is completed with
 * EIO unless the volume is OPTIMAL or SUBOPTIMAL; BIO_FLUSH is handed to
 * g_raid_tr_flush_common().  Otherwise the request is cut at strip
 * boundaries, one clone of `bp` per strip touched, rotating through the
 * subdisks; the clones are dispatched only after all of them have been
 * allocated.  If a clone cannot be allocated, the queued clones are
 * destroyed and the request fails with ENOMEM (or the error already
 * recorded in `bp`).
 */
static void
g_raid_tr_iostart_raid0(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, nstripe, remain;
	u_int no, strip_size;

	vol = tr->tro_volume;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) {
		g_raid_iodone(bp, EIO);
		return;
	}
	if (bp->bio_cmd == BIO_FLUSH) {
		g_raid_tr_flush_common(tr, bp);
		return;
	}
	addr = bp->bio_data;
	strip_size = vol->v_strip_size;

	/* Stripe number. */
	nstripe = bp->bio_offset / strip_size;
	/* Start position in stripe. */
	start = bp->bio_offset % strip_size;
	/* Disk number. */
	no = nstripe % vol->v_disks_count;
	/* Stripe start position in disk. */
	offset = (nstripe / vol->v_disks_count) * strip_size;
	/* Length of data to operate. */
	remain = bp->bio_length;

	bioq_init(&queue);
	do {
		/* Take what is left of this strip, or of the request. */
		length = MIN(strip_size - start, remain);
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_data = addr;
		cbp->bio_length = length;
		/* Park the target subdisk in the clone until dispatch. */
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		/* After the last disk, wrap to disk 0 on the next strip row. */
		if (++no >= vol->v_disks_count) {
			no = 0;
			offset += strip_size;
		}
		remain -= length;
		/*
		 * The '-'/'+' prefixed lines below are change markers kept
		 * from the revision diff this text was taken from.
		 */
-		addr += length;
+		if (bp->bio_cmd != BIO_DELETE)
+			addr += length;
		start = 0;
	} while (remain > 0);
	/* Every clone was allocated: hand each one to its subdisk. */
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	/* A clone could not be allocated: discard the ones queued so far. */
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		g_destroy_bio(cbp);
	}
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static int
g_raid_tr_kerneldump_raid0(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	char *addr;
	off_t offset, start, length, nstripe,
remain; u_int no, strip_size; int error; vol = tr->tro_volume; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) return (ENXIO); addr = virtual; strip_size = vol->v_strip_size; /* Stripe number. */ nstripe = boffset / strip_size; /* Start position in stripe. */ start = boffset % strip_size; /* Disk number. */ no = nstripe % vol->v_disks_count; /* Stripe tart position in disk. */ offset = (nstripe / vol->v_disks_count) * strip_size; /* Length of data to operate. */ remain = blength; do { length = MIN(strip_size - start, remain); error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no], addr, 0, offset + start, length); if (error != 0) return (error); if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } remain -= length; addr += length; start = 0; } while (remain > 0); return (0); } static void g_raid_tr_iodone_raid0(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd,struct bio *bp) { struct bio *pbp; pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, bp->bio_error); } } static int g_raid_tr_free_raid0(struct g_raid_tr_object *tr) { return (0); } G_RAID_TR_DECLARE(raid0, "RAID0"); Index: stable/8/sys/geom/raid/tr_raid1.c =================================================================== --- stable/8/sys/geom/raid/tr_raid1.c (revision 243678) +++ stable/8/sys/geom/raid/tr_raid1.c (revision 243679) @@ -1,998 +1,996 @@ /*- * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" SYSCTL_DECL(_kern_geom_raid_raid1); #define RAID1_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB; TUNABLE_INT("kern.geom.raid.raid1.rebuild_slab_size", &g_raid1_rebuild_slab); SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RW, &g_raid1_rebuild_slab, 0, "Amount of the disk to rebuild each read/write cycle of the rebuild."); #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO; TUNABLE_INT("kern.geom.raid.raid1.rebuild_fair_io", &g_raid1_rebuild_fair_io); SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RW, &g_raid1_rebuild_fair_io, 0, "Fraction of the I/O bandwidth to use when disk busy for rebuild."); #define RAID1_REBUILD_CLUSTER_IDLE 100 
static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE; TUNABLE_INT("kern.geom.raid.raid1.rebuild_cluster_idle", &g_raid1_rebuild_cluster_idle); SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW, &g_raid1_rebuild_cluster_idle, 0, "Number of slabs to do each time we trigger a rebuild cycle"); #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE; TUNABLE_INT("kern.geom.raid.raid1.rebuild_meta_update", &g_raid1_rebuild_meta_update); SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RW, &g_raid1_rebuild_meta_update, 0, "When to update the meta data."); static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data"); /* Values of trso_type: the kind of background recovery currently running. */ #define TR_RAID1_NONE 0 #define TR_RAID1_REBUILD 1 #define TR_RAID1_RESYNC 2 /* Bits of trso_flags. DOING_SOME: a rebuild slab I/O is in flight. LOCKED: the slab range is locked with g_raid_lock_range(). ABORT: an abort was requested while a slab I/O was in flight. */ #define TR_RAID1_F_DOING_SOME 0x1 #define TR_RAID1_F_LOCKED 0x2 #define TR_RAID1_F_ABORT 0x4 /* State of the RAID1 transformation object attached to a volume. */ struct g_raid_tr_raid1_object { struct g_raid_tr_object trso_base; int trso_starting; /* set by taste, cleared by start and stop */ int trso_stopping; /* set by stop */ int trso_type; /* one of TR_RAID1_NONE/REBUILD/RESYNC */ int trso_recover_slabs; /* slabs before rest */ int trso_fair_io; /* regular requests left before a rebuild round is squeezed in */ int trso_meta_update; /* slabs left before metadata is rewritten */ int trso_flags; /* TR_RAID1_F_* bits */ struct g_raid_subdisk *trso_failed_sd; /* like per volume */ void *trso_buffer; /* Buffer space */ struct bio trso_bio; /* bio reused for every rebuild read and write */ }; /* Transformation methods implemented by this module. */ static g_raid_tr_taste_t g_raid_tr_taste_raid1; static g_raid_tr_event_t g_raid_tr_event_raid1; static g_raid_tr_start_t g_raid_tr_start_raid1; static g_raid_tr_stop_t g_raid_tr_stop_raid1; static g_raid_tr_iostart_t g_raid_tr_iostart_raid1; static g_raid_tr_iodone_t g_raid_tr_iodone_raid1; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1; static g_raid_tr_locked_t g_raid_tr_locked_raid1; static g_raid_tr_idle_t g_raid_tr_idle_raid1; static g_raid_tr_free_t g_raid_tr_free_raid1; /* kobj method table binding the g_raid_tr interface to the functions above. */ static kobj_method_t g_raid_tr_raid1_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1), KOBJMETHOD(g_raid_tr_start, 
g_raid_tr_start_raid1), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1), KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1), KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1), { 0, 0 } }; /* Class descriptor: class name, method table, per-object size, then the enable flag and priority. */ static struct g_raid_tr_class g_raid_tr_raid1_class = { "RAID1", g_raid_tr_raid1_methods, sizeof(struct g_raid_tr_raid1_object), .trc_enable = 1, .trc_priority = 100 }; static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr); static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd); /* Taste method: accept the volume only if its level is RAID1 and its qualifier is R1SM or R1MM. On acceptance mark the object as starting and return G_RAID_TR_TASTE_SUCCEED, otherwise return G_RAID_TR_TASTE_FAIL. The volume is read through tr->tro_volume; the vol argument is not used. */ static int g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 || (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM && tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM)) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } /* Recompute the volume state from the subdisk states. When not starting or stopping: promote the best subdisk to ACTIVE if none is, classify the volume, and let the rebuild logic react. On a state change send an UP or DOWN event, record the new state and, unless starting or stopping, write metadata. sd is the subdisk the triggering event refers to, or NULL. Always returns 0. */ static int g_raid_tr_update_state_raid1(struct g_raid_volume *vol, struct g_raid_subdisk *sd) { struct g_raid_tr_raid1_object *trs; struct g_raid_softc *sc; struct g_raid_subdisk *tsd, *bestsd; u_int s; int i, na, ns; sc = vol->v_softc; trs = (struct g_raid_tr_raid1_object *)vol->v_tr; /* STOPPED is reported only once no rebuild slab I/O is in flight. */ if (trs->trso_stopping && (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { /* Make sure we have at least one ACTIVE disk. */ na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); if (na == 0) { /* * Critical situation! We have no active disk at all! * Choose the best disk we have to make it active. 
*/ /* Best means the highest sd_state; between equal REBUILD or RESYNC states the larger sd_rebuild_pos wins. */ bestsd = &vol->v_subdisks[0]; for (i = 1; i < vol->v_disks_count; i++) { tsd = &vol->v_subdisks[i]; if (tsd->sd_state > bestsd->sd_state) bestsd = tsd; else if (tsd->sd_state == bestsd->sd_state && (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD || tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) && tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = tsd; } if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) { /* We found reasonable candidate. */ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, bestsd->sd_pos, g_raid_subdisk_state2str(bestsd->sd_state)); g_raid_change_subdisk_state(bestsd, G_RAID_SUBDISK_S_ACTIVE); g_raid_write_metadata(sc, vol, bestsd, bestsd->sd_disk); } } /* Classify the volume: all ACTIVE is OPTIMAL; all ACTIVE, STALE or RESYNC is SUBOPTIMAL; at least one ACTIVE is DEGRADED; none is BROKEN. */ na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); if (na == vol->v_disks_count) s = G_RAID_VOLUME_S_OPTIMAL; else if (na + ns == vol->v_disks_count) s = G_RAID_VOLUME_S_SUBOPTIMAL; else if (na > 0) s = G_RAID_VOLUME_S_DEGRADED; else s = G_RAID_VOLUME_S_BROKEN; g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd); } /* Publish a state change; metadata is not written while starting or stopping. */ if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopping) g_raid_write_metadata(sc, vol, NULL, NULL); } return (0); } /* Fail a subdisk through g_raid_fail_disk(), except when it is the only ACTIVE subdisk of its volume, in which case nothing is done. */ static void g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { /* * We don't fail the last disk in the pack, since it still has decent * data on it and that's better than failing the disk if it is the root * file system. * * XXX should this be controlled via a tunable? It makes sense for * the volume that has / on it. I can't think of a case where we'd * want the volume to go away on this kind of event. 
*/ if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 && g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd) return; g_raid_fail_disk(sc, sd, disk); } /* Start one rebuild round unless one is already in flight. Prepares trso_bio as a SYNC read of the next slab (at most g_raid1_rebuild_slab bytes, starting at sd_rebuild_pos of the subdisk being rebuilt) from an ACTIVE subdisk into trso_buffer, then locks that range; the lock callback starts the I/O. If no ACTIVE subdisk exists the rebuild is aborted. */ static void g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd, *good_sd; struct bio *bp; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_flags & TR_RAID1_F_DOING_SOME) return; sd = trs->trso_failed_sd; good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE); if (good_sd == NULL) { g_raid_tr_raid1_rebuild_abort(tr); return; } bp = &trs->trso_bio; memset(bp, 0, sizeof(*bp)); bp->bio_offset = sd->sd_rebuild_pos; bp->bio_length = MIN(g_raid1_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); bp->bio_data = trs->trso_buffer; bp->bio_cmd = BIO_READ; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; /* The locked callback reads the target subdisk from bio_caller1. */ bp->bio_caller1 = good_sd; trs->trso_flags |= TR_RAID1_F_DOING_SOME; trs->trso_flags |= TR_RAID1_F_LOCKED; g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */ bp->bio_offset, bp->bio_length, NULL, bp); } /* Common teardown for a completed or aborted rebuild: write metadata for the rebuilt subdisk, free the slab buffer, reset the rebuild bookkeeping and re-evaluate the volume state. */ static void g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; vol = trs->trso_base.tro_volume; sd = trs->trso_failed_sd; g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); free(trs->trso_buffer, M_TR_RAID1); trs->trso_buffer = NULL; trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; trs->trso_type = TR_RAID1_NONE; trs->trso_recover_slabs = 0; trs->trso_failed_sd = NULL; g_raid_tr_update_state_raid1(vol, NULL); } /* The rebuild reached the end of the subdisk: log it, mark the subdisk ACTIVE, reset its rebuild position and run the common teardown. */ static void g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd; trs = (struct g_raid_tr_raid1_object *)tr; sd = trs->trso_failed_sd; G_RAID_DEBUG1(0, tr->tro_volume->v_softc, "Subdisk %s:%d-%s rebuild completed.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); sd->sd_rebuild_pos = 0; g_raid_tr_raid1_rebuild_done(trs); } /* Abort the running rebuild. If a slab I/O is in flight only the ABORT flag is set; the I/O completion handler clears DOING_SOME and calls back here. Otherwise unlock the slab range if it is still locked and run the common teardown. NOTE(review): the unlock length is recomputed from the current g_raid1_rebuild_slab value rather than remembered from lock time -- confirm the tunable cannot change in between. */ static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd; struct g_raid_volume *vol; off_t len; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; sd = trs->trso_failed_sd; if (trs->trso_flags & TR_RAID1_F_DOING_SOME) { G_RAID_DEBUG1(1, vol->v_softc, "Subdisk %s:%d-%s rebuild is aborting.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags |= TR_RAID1_F_ABORT; } else { G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild aborted.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags &= ~TR_RAID1_F_ABORT; if (trs->trso_flags & TR_RAID1_F_LOCKED) { trs->trso_flags &= ~TR_RAID1_F_LOCKED; len = MIN(g_raid1_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); g_raid_unlock_range(tr->tro_volume, sd->sd_rebuild_pos, len); } g_raid_tr_raid1_rebuild_done(trs); } } /* Begin a rebuild if none is running. Requires an ACTIVE subdisk as the source. The target is the first match of: a RESYNC subdisk, a REBUILD subdisk, a STALE subdisk (switched to RESYNC from position 0), an UNINITIALIZED or NEW subdisk (switched to REBUILD from position 0). Allocates the slab buffer and starts the first round. trso_type is set to TR_RAID1_REBUILD for every kind of target. */ static void g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; struct g_raid_subdisk *sd, *fsd; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_failed_sd) { G_RAID_DEBUG1(1, vol->v_softc, "Already rebuild in start rebuild. pos %jd\n", (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); return; } sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE); if (sd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No active disk to rebuild. 
night night."); return; } fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); if (fsd == NULL) fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); if (fsd == NULL) { fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); if (fsd != NULL) { fsd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_RESYNC); g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); } else { fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (fsd == NULL) fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_NEW); if (fsd != NULL) { fsd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(fsd, G_RAID_SUBDISK_S_REBUILD); g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); } } } if (fsd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No failed disk to rebuild. night night."); return; } trs->trso_failed_sd = fsd; /* NOTE(review): the %jd argument below lacks the (intmax_t) cast used earlier in this function -- confirm the types agree on all targets. */ G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild start at %jd.", fsd->sd_volume->v_name, fsd->sd_pos, fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]", trs->trso_failed_sd->sd_rebuild_pos); trs->trso_type = TR_RAID1_REBUILD; trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK); trs->trso_meta_update = g_raid1_rebuild_meta_update; g_raid_tr_raid1_rebuild_some(tr); } /* Decide, from trso_type and the subdisk counts, whether a rebuild should be started or the running one aborted. sd is the subdisk the triggering event refers to, or NULL. */ static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; int na, nr; /* * If we're stopping, don't do anything. If we don't have at least one * good disk and one bad disk, we don't do anything. And if there's a * 'good disk' stored in the trs, then we're in progress and we punt. * If we make it past all these checks, we need to rebuild. 
*/ vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_stopping) return; /* na counts ACTIVE subdisks, nr counts REBUILD plus RESYNC subdisks. */ na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); switch(trs->trso_type) { /* Nothing running: start a rebuild if there is an ACTIVE source and at least one REBUILD, RESYNC, NEW, STALE or UNINITIALIZED subdisk. */ case TR_RAID1_NONE: if (na == 0) return; if (nr == 0) { nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (nr == 0) return; } g_raid_tr_raid1_rebuild_start(tr); break; /* Rebuild running: abort it when no source or no target is left, or when the event concerns the subdisk being rebuilt. */ case TR_RAID1_REBUILD: if (na == 0 || nr == 0 || trs->trso_failed_sd == sd) g_raid_tr_raid1_rebuild_abort(tr); break; case TR_RAID1_RESYNC: break; } } /* Event method: re-evaluate the volume state for the affected subdisk. The event code itself is not inspected. Always returns 0. */ static int g_raid_tr_event_raid1(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { g_raid_tr_update_state_raid1(tr->tro_volume, sd); return (0); } /* Start method: leave the starting phase and compute the volume state. Always returns 0. */ static int g_raid_tr_start_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid1(vol, NULL); return (0); } /* Stop method: enter the stopping phase and re-evaluate the volume state. Always returns 0. */ static int g_raid_tr_stop_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopping = 1; g_raid_tr_update_state_raid1(vol, NULL); return (0); } /* * Select the disk to read from. Take into account: subdisk state, running * error recovery, average disk load, head position and possible cache hits. */ #define ABS(x) (((x) >= 0) ? 
(x) : (-(x))) /* Returns the eligible subdisk with the lowest computed priority value, or NULL when none is eligible. mask is a bitmask of subdisk indexes that must not be chosen. */ static struct g_raid_subdisk * g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp, u_int mask) { struct g_raid_subdisk *sd, *best; int i, prio, bestprio; best = NULL; bestprio = INT_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; /* Eligible: ACTIVE, or REBUILD/RESYNC with the whole request below sd_rebuild_pos. */ if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD && sd->sd_state != G_RAID_SUBDISK_S_RESYNC) || bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos)) continue; if ((mask & (1 << i)) != 0) continue; /* Start from the load figure, then penalize subdisks under error recovery (capped at 255) and subdisks whose state is below ACTIVE. */ prio = G_RAID_SUBDISK_LOAD(sd); prio += min(sd->sd_recovery, 255) << 22; prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16; /* If disk head is precisely in position - highly prefer it. */ if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset) prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) < G_RAID_SUBDISK_TRACK_SIZE) prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; if (prio < bestprio) { best = sd; bestprio = prio; } } return (best); } /* Read path: clone the request and send the clone to the selected subdisk; complete the request with ENOMEM if the clone cannot be allocated. NOTE(review): a NULL selection is only caught by the KASSERT -- confirm callers guarantee an eligible subdisk. */ static void g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_subdisk *sd; struct bio *cbp; sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0); KASSERT(sd != NULL, ("No active disks in volume %s.", tr->tro_volume->v_name)); cbp = g_clone_bio(bp); if (cbp == NULL) { g_raid_iodone(bp, ENOMEM); return; } g_raid_subdisk_iostart(sd, cbp); } /* Write path: send a clone of the request to every subdisk that should receive it. */ static void g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; int i; vol = tr->tro_volume; /* * Allocate all bios before sending any request, so we can return * ENOMEM in nice and clean way. 
*/ bioq_init(&queue); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_REBUILD: /* * When rebuilding, only part of this subdisk is * writable, the rest will be written as part of * that process. */ if (bp->bio_offset >= sd->sd_rebuild_pos) continue; break; case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: /* * Resyncing still writes on the theory that the * resync'd disk is very close and writing it will * keep it that way better if we keep up while * resyncing. */ break; default: continue; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; /* Remember the target subdisk on the clone until dispatch. */ cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); } /* Every clone was allocated: dispatch them. NOTE(review): if no subdisk was eligible nothing is sent and bp is not completed here -- confirm this cannot happen in the volume states that reach this function. */ for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; /* A clone could not be allocated: destroy the queued clones and complete the request with its existing error or ENOMEM. */ failure: for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } /* I/O start method. Requests are failed with EIO unless the volume is OPTIMAL, SUBOPTIMAL or DEGRADED. While a rebuild is pending, regular requests also drive the fair-I/O countdown that squeezes in rebuild rounds. The request is then dispatched on bio_cmd. */ static void g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_tr_raid1_object *trs; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1_object *)tr; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && vol->v_state != G_RAID_VOLUME_S_DEGRADED) { g_raid_iodone(bp, EIO); return; } /* * If we're rebuilding, squeeze in rebuild activity every so often, * even when the disk is busy. Be sure to only count real I/O * to the disk. All 'SPECIAL' I/O is traffic generated to the disk * by this module. */ if (trs->trso_failed_sd != NULL && !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { /* Make this new or running now round short. 
*/ trs->trso_recover_slabs = 0; if (--trs->trso_fair_io <= 0) { trs->trso_fair_io = g_raid1_rebuild_fair_io; g_raid_tr_raid1_rebuild_some(tr); } } /* NOTE(review): the + and - markers inside this switch are diff residue. The added line sends BIO_DELETE down the write path; the removed lines completed it with EIO. */ switch (bp->bio_cmd) { case BIO_READ: g_raid_tr_iostart_raid1_read(tr, bp); break; case BIO_WRITE: + case BIO_DELETE: g_raid_tr_iostart_raid1_write(tr, bp); break; - case BIO_DELETE: - g_raid_iodone(bp, EIO); - break; case BIO_FLUSH: g_raid_tr_flush_common(tr, bp); break; default: KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", bp->bio_cmd, vol->v_name)); break; } } /* I/O completion method. Bios flagged G_RAID_BIO_FLAG_SYNC belong to the rebuild read/write cycle and are handled first; every other bio is a child of a regular request and is accounted against its parent. sd is the subdisk the bio completed on. */ static void g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *cbp; struct g_raid_subdisk *nsd; struct g_raid_volume *vol; struct bio *pbp; struct g_raid_tr_raid1_object *trs; uintptr_t *mask; int error, do_write; trs = (struct g_raid_tr_raid1_object *)tr; vol = tr->tro_volume; if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { /* * This operation is part of a rebuild or resync operation. * See what work just got done, then schedule the next bit of * work, if any. Rebuild/resync is done a little bit at a * time. Either when a timeout happens, or after we get a * bunch of I/Os to the disk (to make sure an active system * will complete in a sane amount of time). * * We are setup to do differing amounts of work for each of * these cases. so long as the slabs is smallish (less than * 50 or so, I'd guess, but that's just a WAG), we shouldn't * have any bio starvation issues. For active disks, we do * 5MB of data, for inactive ones, we do 50MB. * * NOTE(review): the 5MB and 50MB figures do not obviously match * the default tunables in this file (1MB slab, 100 slabs per * idle burst) -- confirm. */ if (trs->trso_type == TR_RAID1_REBUILD) { if (bp->bio_cmd == BIO_READ) { /* Immediately abort rebuild, if requested. */ if (trs->trso_flags & TR_RAID1_F_ABORT) { trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } /* On read error, skip and cross fingers. 
*/ if (bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Read error during rebuild (%d), " "possible data loss!", bp->bio_error); goto rebuild_round_done; } /* * The read operation finished, queue the * write and get out. The same bio is turned * into a write to the subdisk being rebuilt. */ G_RAID_LOGREQ(4, bp, "rebuild read done. %d", bp->bio_error); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; G_RAID_LOGREQ(4, bp, "Queueing rebuild write."); g_raid_subdisk_iostart(trs->trso_failed_sd, bp); } else { /* * The write operation just finished. Do * another. We keep cloning the master bio * since it has the right buffers allocated to * it. * NOTE(review): no clone is made in this file; * the single trso_bio is reused -- confirm the * wording above is just historical. */ G_RAID_LOGREQ(4, bp, "rebuild write done. Error %d", bp->bio_error); nsd = trs->trso_failed_sd; /* A write error fails the target disk (unless an abort was already requested); either way the rebuild is aborted. */ if (bp->bio_error != 0 || trs->trso_flags & TR_RAID1_F_ABORT) { if ((trs->trso_flags & TR_RAID1_F_ABORT) == 0) { g_raid_tr_raid1_fail_disk(sd->sd_softc, nsd, nsd->sd_disk); } trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } /* Reached after a successful write, or directly from the read-error path above (the slab is then skipped): unlock the slab and advance the rebuild position. */ rebuild_round_done: nsd = trs->trso_failed_sd; trs->trso_flags &= ~TR_RAID1_F_LOCKED; g_raid_unlock_range(sd->sd_volume, bp->bio_offset, bp->bio_length); nsd->sd_rebuild_pos += bp->bio_length; if (nsd->sd_rebuild_pos >= nsd->sd_size) { g_raid_tr_raid1_rebuild_finish(tr); return; } /* Abort rebuild if we are stopping */ if (trs->trso_stopping) { trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; g_raid_tr_raid1_rebuild_abort(tr); return; } /* Persist progress every g_raid1_rebuild_meta_update slabs. */ if (--trs->trso_meta_update <= 0) { g_raid_write_metadata(vol->v_softc, vol, nsd, nsd->sd_disk); trs->trso_meta_update = g_raid1_rebuild_meta_update; } trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; /* Continue with the next slab only while the budget of this burst lasts. */ if (--trs->trso_recover_slabs <= 0) return; g_raid_tr_raid1_rebuild_some(tr); } } else if (trs->trso_type == TR_RAID1_RESYNC) { /* * read good sd, read bad sd in parallel. when both * done, compare the buffers. write good to the bad * if different. do the next bit of work. 
*/ panic("Somehow, we think we're doing a resync"); } return; } /* Regular request: count this child as completed on its parent. */ pbp = bp->bio_parent; pbp->bio_inbed++; if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { /* * Read failed on first drive. Retry the read error on * another disk drive, if available, before erroring out the * read. */ sd->sd_disk->d_read_errs++; G_RAID_LOGREQ(0, bp, "Read error (%d), %d read errors total", bp->bio_error, sd->sd_disk->d_read_errs); /* * If there are too many read errors, we move to degraded. * XXX Do we want to FAIL the drive (eg, make the user redo * everything to get it back in sync), or just degrade the * drive, which kicks off a resync? */ do_write = 1; if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) { g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); if (pbp->bio_children == 1) do_write = 0; } /* * Find the other disk, and try to do the I/O to it. * The bio_driver2 field of the parent is reused as a bitmask * of subdisk positions that have already been tried. */ mask = (uintptr_t *)(&pbp->bio_driver2); if (pbp->bio_children == 1) { /* Save original subdisk. */ pbp->bio_driver1 = do_write ? sd : NULL; *mask = 0; } *mask |= 1 << sd->sd_pos; nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask); if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) { g_destroy_bio(bp); G_RAID_LOGREQ(2, cbp, "Retrying read from %d", nsd->sd_pos); /* On the first retry, when a remap write is wanted, lock the range first; otherwise start the retry directly. */ if (pbp->bio_children == 2 && do_write) { sd->sd_recovery++; cbp->bio_caller1 = nsd; pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, cbp->bio_offset, cbp->bio_length, pbp, cbp); } else { g_raid_subdisk_iostart(nsd, cbp); } return; } /* * We can't retry. Return the original error by falling * through. This will happen when there's only one good disk. * We don't need to fail the raid, since its actual state is * based on the state of the subdisks. 
*/ G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); } if (bp->bio_cmd == BIO_READ && bp->bio_error == 0 && pbp->bio_children > 1 && pbp->bio_driver1 != NULL) { /* * If it was a read, and bio_children is >1, then we just * recovered the data from the second drive. We should try to * write that data to the first drive if sector remapping is * enabled. A write should put the data in a new place on the * disk, remapping the bad sector. Do we need to do that by * queueing a request to the main worker thread? It doesn't * affect the return code of this current read, and can be * done at our leisure. However, to make the code simpler, it * is done synchronously. */ G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); cbp = g_clone_bio(pbp); if (cbp != NULL) { g_destroy_bio(bp); cbp->bio_cmd = BIO_WRITE; cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; G_RAID_LOGREQ(2, cbp, "Attempting bad sector remap on failing drive."); /* bio_driver1 holds the subdisk whose read failed first. */ g_raid_subdisk_iostart(pbp->bio_driver1, cbp); return; } } if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) { /* * We're done with a recovery, mark the range as unlocked. * For any write errors, we aggressively fail the disk since * there was both a READ and a WRITE error at this location. * Both types of errors generally indicate the drive is on * the verge of total failure anyway. Better to stop trusting * it now. However, we need to reset error to 0 in that case * because we're not failing the original I/O which succeeded. 
*/ if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { G_RAID_LOGREQ(0, bp, "Remap write failed: " "failing subdisk."); g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); bp->bio_error = 0; } /* Undo the sd_recovery increment made when the retry was queued. */ if (pbp->bio_driver1 != NULL) { ((struct g_raid_subdisk *)pbp->bio_driver1) ->sd_recovery--; } G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); g_raid_unlock_range(sd->sd_volume, bp->bio_offset, bp->bio_length); } /* Non-read parents: the status of the first completed child is recorded, and a recorded error is replaced by the status of any later child, so the parent ends with an error only if every child failed. NOTE(review): the - and + markers below are diff residue; the added condition fails the subdisk only when the parent command is BIO_WRITE. */ if (pbp->bio_cmd != BIO_READ) { if (pbp->bio_inbed == 1 || pbp->bio_error != 0) pbp->bio_error = bp->bio_error; - if (bp->bio_error != 0) { + if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); } error = pbp->bio_error; } else error = bp->bio_error; g_destroy_bio(bp); /* Complete the parent once all of its children have come back. */ if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, error); } } /* Kernel dump method: pass the dump region to every subdisk that is eligible for writes, using the same per-state rules as the regular write path. Returns 0 if at least one subdisk accepted the data, otherwise the result of the last attempt (0 if no subdisk was eligible). */ static int g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; int error, i, ok; vol = tr->tro_volume; error = 0; ok = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_REBUILD: /* * When rebuilding, only part of this subdisk is * writable, the rest will be written as part of * that process. */ if (offset >= sd->sd_rebuild_pos) continue; break; case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: /* * Resyncing still writes on the theory that the * resync'd disk is very close and writing it will * keep it that way better if we keep up while * resyncing. */ break; default: continue; } error = g_raid_subdisk_kerneldump(sd, virtual, physical, offset, length); if (error == 0) ok++; } return (ok > 0 ? 
0 : error); } /* Locked method: argp is the bio that was handed to g_raid_lock_range(); start it on the subdisk stored in its bio_caller1. Always returns 0. */ static int g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp) { struct bio *bp; struct g_raid_subdisk *sd; bp = (struct bio *)argp; sd = (struct g_raid_subdisk *)bp->bio_caller1; g_raid_subdisk_iostart(sd, bp); return (0); } /* Idle method: refill the fair-I/O countdown and the slab budget from the tunables and, if a rebuild is in progress, run a rebuild round. Always returns 0. */ static int g_raid_tr_idle_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; trs->trso_fair_io = g_raid1_rebuild_fair_io; trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle; if (trs->trso_type == TR_RAID1_REBUILD) g_raid_tr_raid1_rebuild_some(tr); return (0); } /* Free method: release the rebuild buffer if one is still allocated. Always returns 0. */ static int g_raid_tr_free_raid1(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1_object *trs; trs = (struct g_raid_tr_raid1_object *)tr; if (trs->trso_buffer != NULL) { free(trs->trso_buffer, M_TR_RAID1); trs->trso_buffer = NULL; } return (0); } G_RAID_TR_DECLARE(raid1, "RAID1"); Index: stable/8/sys/geom/raid/tr_raid1e.c =================================================================== --- stable/8/sys/geom/raid/tr_raid1e.c (revision 243678) +++ stable/8/sys/geom/raid/tr_raid1e.c (revision 243679) @@ -1,1238 +1,1237 @@ /*- * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_tr_if.h" /* Multiplier applied to the strip number when virtual strips are mapped to disks (see V2P and P2V) -- presumably the number of copies kept of each strip. */ #define N 2 /* RAID1E rebuild tunables. Each one is registered both as a loader tunable (TUNABLE_INT) and as a read-write sysctl under the _kern_geom_raid_raid1e node. NOTE(review): the variables are declared int but registered with SYSCTL_UINT -- confirm the intended signedness. */ SYSCTL_DECL(_kern_geom_raid_raid1e); #define RAID1E_REBUILD_SLAB (1 << 20) /* One transaction in a rebuild */ static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB; TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size", &g_raid1e_rebuild_slab); SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW, &g_raid1e_rebuild_slab, 0, "Amount of the disk to rebuild each read/write cycle of the rebuild."); #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO; TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io", &g_raid1e_rebuild_fair_io); SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW, &g_raid1e_rebuild_fair_io, 0, "Fraction of the I/O bandwidth to use when disk busy for rebuild."); #define RAID1E_REBUILD_CLUSTER_IDLE 100 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE; TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle", &g_raid1e_rebuild_cluster_idle); SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW, &g_raid1e_rebuild_cluster_idle, 0, "Number of slabs to do each time we trigger a rebuild cycle"); #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ 
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE; TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update", &g_raid1e_rebuild_meta_update); SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW, &g_raid1e_rebuild_meta_update, 0, "When to update the meta data."); static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data"); #define TR_RAID1E_NONE 0 #define TR_RAID1E_REBUILD 1 #define TR_RAID1E_RESYNC 2 #define TR_RAID1E_F_DOING_SOME 0x1 #define TR_RAID1E_F_LOCKED 0x2 #define TR_RAID1E_F_ABORT 0x4 struct g_raid_tr_raid1e_object { struct g_raid_tr_object trso_base; int trso_starting; int trso_stopping; int trso_type; int trso_recover_slabs; /* slabs before rest */ int trso_fair_io; int trso_meta_update; int trso_flags; struct g_raid_subdisk *trso_failed_sd; /* like per volume */ void *trso_buffer; /* Buffer space */ off_t trso_lock_pos; /* Locked range start. */ off_t trso_lock_len; /* Locked range length. */ struct bio trso_bio; }; static g_raid_tr_taste_t g_raid_tr_taste_raid1e; static g_raid_tr_event_t g_raid_tr_event_raid1e; static g_raid_tr_start_t g_raid_tr_start_raid1e; static g_raid_tr_stop_t g_raid_tr_stop_raid1e; static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e; static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e; static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e; static g_raid_tr_locked_t g_raid_tr_locked_raid1e; static g_raid_tr_idle_t g_raid_tr_idle_raid1e; static g_raid_tr_free_t g_raid_tr_free_raid1e; static kobj_method_t g_raid_tr_raid1e_methods[] = { KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1e), KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1e), KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1e), KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1e), KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1e), KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1e), KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e), KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1e), 
KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1e), KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1e), { 0, 0 } }; static struct g_raid_tr_class g_raid_tr_raid1e_class = { "RAID1E", g_raid_tr_raid1e_methods, sizeof(struct g_raid_tr_raid1e_object), .trc_enable = 1, .trc_priority = 200 }; static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr); static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd); static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, int no, off_t off, off_t len, u_int mask); static inline void V2P(struct g_raid_volume *vol, off_t virt, int *disk, off_t *offset, off_t *start) { off_t nstrip; u_int strip_size; strip_size = vol->v_strip_size; /* Strip number. */ nstrip = virt / strip_size; /* Start position in strip. */ *start = virt % strip_size; /* Disk number. */ *disk = (nstrip * N) % vol->v_disks_count; /* Strip start position in disk. */ *offset = ((nstrip * N) / vol->v_disks_count) * strip_size; } static inline void P2V(struct g_raid_volume *vol, int disk, off_t offset, off_t *virt, int *copy) { off_t nstrip, start; u_int strip_size; strip_size = vol->v_strip_size; /* Start position in strip. */ start = offset % strip_size; /* Physical strip number. */ nstrip = (offset / strip_size) * vol->v_disks_count + disk; /* Number of physical strip (copy) inside virtual strip. */ *copy = nstrip % N; /* Offset in virtual space. 
*/ *virt = (nstrip / N) * strip_size + start; } static int g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol) { struct g_raid_tr_raid1e_object *trs; trs = (struct g_raid_tr_raid1e_object *)tr; if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E || tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA) return (G_RAID_TR_TASTE_FAIL); trs->trso_starting = 1; return (G_RAID_TR_TASTE_SUCCEED); } static int g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *bestsd, *worstsd; int i, j, state, sstate; sc = vol->v_softc; state = G_RAID_VOLUME_S_OPTIMAL; for (i = 0; i < vol->v_disks_count / N; i++) { bestsd = &vol->v_subdisks[i * N]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[i * N + j]; if (sd->sd_state > bestsd->sd_state) bestsd = sd; else if (sd->sd_state == bestsd->sd_state && (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = sd; } if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED && bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { /* We found reasonable candidate. 
*/ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to ACTIVE.", vol->v_name, bestsd->sd_pos, g_raid_subdisk_state2str(bestsd->sd_state)); g_raid_change_subdisk_state(bestsd, G_RAID_SUBDISK_S_ACTIVE); g_raid_write_metadata(sc, vol, bestsd, bestsd->sd_disk); } worstsd = &vol->v_subdisks[i * N]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[i * N + j]; if (sd->sd_state < worstsd->sd_state) worstsd = sd; } if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_OPTIMAL; else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_SUBOPTIMAL; else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_DEGRADED; else sstate = G_RAID_VOLUME_S_BROKEN; if (sstate < state) state = sstate; } return (state); } static int g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *bestsd, *worstsd; int i, j, state, sstate; sc = vol->v_softc; if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) == vol->v_disks_count) return (G_RAID_VOLUME_S_OPTIMAL); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) { /* We found reasonable candidate. 
*/ G_RAID_DEBUG1(1, sc, "Promote subdisk %s:%d from %s to STALE.", vol->v_name, sd->sd_pos, g_raid_subdisk_state2str(sd->sd_state)); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); g_raid_write_metadata(sc, vol, sd, sd->sd_disk); } } state = G_RAID_VOLUME_S_OPTIMAL; for (i = 0; i < vol->v_disks_count; i++) { bestsd = &vol->v_subdisks[i]; worstsd = &vol->v_subdisks[i]; for (j = 1; j < N; j++) { sd = &vol->v_subdisks[(i + j) % vol->v_disks_count]; if (sd->sd_state > bestsd->sd_state) bestsd = sd; else if (sd->sd_state == bestsd->sd_state && (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) bestsd = sd; if (sd->sd_state < worstsd->sd_state) worstsd = sd; } if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) sstate = G_RAID_VOLUME_S_OPTIMAL; else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_SUBOPTIMAL; else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE) sstate = G_RAID_VOLUME_S_DEGRADED; else sstate = G_RAID_VOLUME_S_BROKEN; if (sstate < state) state = sstate; } return (state); } static int g_raid_tr_update_state_raid1e(struct g_raid_volume *vol, struct g_raid_subdisk *sd) { struct g_raid_tr_raid1e_object *trs; struct g_raid_softc *sc; u_int s; sc = vol->v_softc; trs = (struct g_raid_tr_raid1e_object *)vol->v_tr; if (trs->trso_stopping && (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0) s = G_RAID_VOLUME_S_STOPPED; else if (trs->trso_starting) s = G_RAID_VOLUME_S_STARTING; else { if ((vol->v_disks_count % N) == 0) s = g_raid_tr_update_state_raid1e_even(vol); else s = g_raid_tr_update_state_raid1e_odd(vol); } if (s != vol->v_state) { g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 
G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); g_raid_change_volume_state(vol, s); if (!trs->trso_starting && !trs->trso_stopping) g_raid_write_metadata(sc, vol, NULL, NULL); } if (!trs->trso_starting && !trs->trso_stopping) g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd); return (0); } static void g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { struct g_raid_volume *vol; vol = sd->sd_volume; /* * We don't fail the last disk in the pack, since it still has decent * data on it and that's better than failing the disk if it is the root * file system. * * XXX should this be controlled via a tunable? It makes sense for * the volume that has / on it. I can't think of a case where we'd * want the volume to go away on this kind of event. */ if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) < vol->v_disks_count) && (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED)) return; g_raid_fail_disk(sc, sd, disk); } static void g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; vol = trs->trso_base.tro_volume; sd = trs->trso_failed_sd; g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); free(trs->trso_buffer, M_TR_RAID1E); trs->trso_buffer = NULL; trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; trs->trso_type = TR_RAID1E_NONE; trs->trso_recover_slabs = 0; trs->trso_failed_sd = NULL; g_raid_tr_update_state_raid1e(vol, NULL); } static void g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; trs = (struct g_raid_tr_raid1e_object *)tr; sd = trs->trso_failed_sd; G_RAID_DEBUG1(0, tr->tro_volume->v_softc, "Subdisk %s:%d-%s rebuild completed.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); sd->sd_rebuild_pos = 0; g_raid_tr_raid1e_rebuild_done(trs); } static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; struct g_raid_volume *vol; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; sd = trs->trso_failed_sd; if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) { G_RAID_DEBUG1(1, vol->v_softc, "Subdisk %s:%d-%s rebuild is aborting.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags |= TR_RAID1E_F_ABORT; } else { G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild aborted.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); trs->trso_flags &= ~TR_RAID1E_F_ABORT; if (trs->trso_flags & TR_RAID1E_F_LOCKED) { trs->trso_flags &= ~TR_RAID1E_F_LOCKED; g_raid_unlock_range(tr->tro_volume, trs->trso_lock_pos, trs->trso_lock_len); } g_raid_tr_raid1e_rebuild_done(trs); } } static void g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio *bp; off_t len, virtual, vend, offset, start; int disk, copy, best; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) return; vol = tr->tro_volume; sc = vol->v_softc; sd = trs->trso_failed_sd; while (1) { if (sd->sd_rebuild_pos >= sd->sd_size) { g_raid_tr_raid1e_rebuild_finish(tr); return; } /* Get virtual offset from physical rebuild position. */ P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy); /* Get physical offset back to get first stripe position. */ V2P(vol, virtual, &disk, &offset, &start); /* Calculate contiguous data length.
*/ len = MIN(g_raid1e_rebuild_slab, sd->sd_size - sd->sd_rebuild_pos); if ((vol->v_disks_count % N) != 0) len = MIN(len, vol->v_strip_size - start); /* Find disk with most accurate data. */ best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset + start, len, 0); if (best < 0) { /* There is no valid disk. */ g_raid_tr_raid1e_rebuild_abort(tr); return; } else if (best != copy) { /* Some other disk has better data. */ break; } /* We have the most accurate data. Skip the range. */ G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju", sd->sd_rebuild_pos, sd->sd_rebuild_pos + len); sd->sd_rebuild_pos += len; } bp = &trs->trso_bio; memset(bp, 0, sizeof(*bp)); bp->bio_offset = offset + start + ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0); bp->bio_length = len; bp->bio_data = trs->trso_buffer; bp->bio_cmd = BIO_READ; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count]; G_RAID_LOGREQ(3, bp, "Queueing rebuild read"); /* * If we are crossing stripe boundary, correct affected virtual * range we should lock. */ if (start + len > vol->v_strip_size) { P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy); len = vend - virtual; } trs->trso_flags |= TR_RAID1E_F_DOING_SOME; trs->trso_flags |= TR_RAID1E_F_LOCKED; trs->trso_lock_pos = virtual; trs->trso_lock_len = len; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp); } static void g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; struct g_raid_subdisk *sd; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_failed_sd) { G_RAID_DEBUG1(1, vol->v_softc, "Already rebuild in start rebuild.
pos %jd\n", (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); return; } sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); if (sd == NULL) sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); if (sd == NULL) { sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); if (sd != NULL) { sd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); g_raid_write_metadata(vol->v_softc, vol, sd, NULL); } else { sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (sd == NULL) sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_NEW); if (sd != NULL) { sd->sd_rebuild_pos = 0; g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); g_raid_write_metadata(vol->v_softc, vol, sd, NULL); } } } if (sd == NULL) { G_RAID_DEBUG1(1, vol->v_softc, "No failed disk to rebuild. night night."); return; } trs->trso_failed_sd = sd; G_RAID_DEBUG1(0, vol->v_softc, "Subdisk %s:%d-%s rebuild start at %jd.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]", trs->trso_failed_sd->sd_rebuild_pos); trs->trso_type = TR_RAID1E_REBUILD; trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK); trs->trso_meta_update = g_raid1e_rebuild_meta_update; g_raid_tr_raid1e_rebuild_some(tr); } static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; int nr; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_stopping) return; nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); switch(trs->trso_type) { case TR_RAID1E_NONE: if (vol->v_state < G_RAID_VOLUME_S_DEGRADED) return; if (nr == 0) { nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); if (nr == 0) return; } g_raid_tr_raid1e_rebuild_start(tr); break; case TR_RAID1E_REBUILD: if 
(vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 || trs->trso_failed_sd == sd) g_raid_tr_raid1e_rebuild_abort(tr); break; case TR_RAID1E_RESYNC: break; } } static int g_raid_tr_event_raid1e(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, u_int event) { g_raid_tr_update_state_raid1e(tr->tro_volume, sd); return (0); } static int g_raid_tr_start_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; g_raid_tr_update_state_raid1e(vol, NULL); return (0); } static int g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; trs->trso_starting = 0; trs->trso_stopping = 1; g_raid_tr_update_state_raid1e(vol, NULL); return (0); } /* * Select the disk to read from. Take into account: subdisk state, running * error recovery, average disk load, head position and possible cache hits. */ #define ABS(x) (((x) >= 0) ? (x) : (-(x))) static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, int no, off_t off, off_t len, u_int mask) { struct g_raid_subdisk *sd; off_t offset; int i, best, prio, bestprio; best = -1; bestprio = INT_MAX; for (i = 0; i < N; i++) { sd = &vol->v_subdisks[(no + i) % vol->v_disks_count]; offset = off; if (no + i >= vol->v_disks_count) offset += vol->v_strip_size; prio = G_RAID_SUBDISK_LOAD(sd); if ((mask & (1 << sd->sd_pos)) != 0) continue; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: break; case G_RAID_SUBDISK_S_RESYNC: if (offset + off < sd->sd_rebuild_pos) break; /* FALLTHROUGH */ case G_RAID_SUBDISK_S_STALE: prio += i << 24; break; case G_RAID_SUBDISK_S_REBUILD: if (offset + off < sd->sd_rebuild_pos) break; /* FALLTHROUGH */ default: continue; } prio += min(sd->sd_recovery, 255) << 16; /* If disk head is precisely in position - highly prefer it. 
*/ if (G_RAID_SUBDISK_POS(sd) == offset) prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; else /* If disk head is close to position - prefer it. */ if (ABS(G_RAID_SUBDISK_POS(sd) - offset) < G_RAID_SUBDISK_TRACK_SIZE) prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; if (prio < bestprio) { bestprio = prio; best = i; } } return (best); } static void g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int best; vol = tr->tro_volume; addr = bp->bio_data; strip_size = vol->v_strip_size; V2P(vol, bp->bio_offset, &no, &offset, &start); remain = bp->bio_length; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); best = g_raid_tr_raid1e_select_read_disk(vol, no, offset, length, 0); KASSERT(best >= 0, ("No readable disk in volume %s!", vol->v_name)); no += best; if (no >= vol->v_disks_count) { no -= vol->v_disks_count; offset += strip_size; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_data = addr; cbp->bio_length = length; cbp->bio_caller1 = &vol->v_subdisks[no]; bioq_insert_tail(&queue, cbp); no += N - best; if (no >= vol->v_disks_count) { no -= vol->v_disks_count; offset += strip_size; } remain -= length; addr += length; start = 0; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; 
char *addr; off_t offset, start, length, remain; u_int no, strip_size; int i; vol = tr->tro_volume; addr = bp->bio_data; strip_size = vol->v_strip_size; V2P(vol, bp->bio_offset, &no, &offset, &start); remain = bp->bio_length; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); for (i = 0; i < N; i++) { sd = &vol->v_subdisks[no]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: break; case G_RAID_SUBDISK_S_REBUILD: if (offset + start >= sd->sd_rebuild_pos) goto nextdisk; break; default: goto nextdisk; } cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_offset = offset + start; cbp->bio_data = addr; cbp->bio_length = length; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); nextdisk: if (++no >= vol->v_disks_count) { no = 0; offset += strip_size; } } remain -= length; - addr += length; + if (bp->bio_cmd != BIO_DELETE) + addr += length; start = 0; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_tr_raid1e_object *trs; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && vol->v_state != G_RAID_VOLUME_S_DEGRADED) { g_raid_iodone(bp, EIO); return; } /* * If we're rebuilding, squeeze in rebuild activity every so often, * even when the disk is busy. Be sure to only count real I/O * to the disk. All 'SPECIAL' I/O is traffic generated to the disk * by this module. 
*/ if (trs->trso_failed_sd != NULL && !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { /* Make this new or running now round short. */ trs->trso_recover_slabs = 0; if (--trs->trso_fair_io <= 0) { trs->trso_fair_io = g_raid1e_rebuild_fair_io; g_raid_tr_raid1e_rebuild_some(tr); } } switch (bp->bio_cmd) { case BIO_READ: g_raid_tr_iostart_raid1e_read(tr, bp); break; case BIO_WRITE: + case BIO_DELETE: g_raid_tr_iostart_raid1e_write(tr, bp); break; - case BIO_DELETE: - g_raid_iodone(bp, EIO); - break; case BIO_FLUSH: g_raid_tr_flush_common(tr, bp); break; default: KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", bp->bio_cmd, vol->v_name)); break; } } static void g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr, struct g_raid_subdisk *sd, struct bio *bp) { struct bio *cbp; struct g_raid_subdisk *nsd; struct g_raid_volume *vol; struct bio *pbp; struct g_raid_tr_raid1e_object *trs; off_t virtual, offset, start; uintptr_t mask; int error, do_write, copy, disk, best; trs = (struct g_raid_tr_raid1e_object *)tr; vol = tr->tro_volume; if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { if (trs->trso_type == TR_RAID1E_REBUILD) { nsd = trs->trso_failed_sd; if (bp->bio_cmd == BIO_READ) { /* Immediately abort rebuild, if requested. */ if (trs->trso_flags & TR_RAID1E_F_ABORT) { trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } /* On read error, skip and cross fingers. */ if (bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Read error during rebuild (%d), " "possible data loss!", bp->bio_error); goto rebuild_round_done; } /* * The read operation finished, queue the * write and get out. */ G_RAID_LOGREQ(3, bp, "Rebuild read done: %d", bp->bio_error); bp->bio_cmd = BIO_WRITE; bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; bp->bio_offset = nsd->sd_rebuild_pos; G_RAID_LOGREQ(3, bp, "Queueing rebuild write."); g_raid_subdisk_iostart(nsd, bp); } else { /* * The write operation just finished. Do * another. 
We keep cloning the master bio * since it has the right buffers allocated to * it. */ G_RAID_LOGREQ(3, bp, "Rebuild write done: %d", bp->bio_error); if (bp->bio_error != 0 || trs->trso_flags & TR_RAID1E_F_ABORT) { if ((trs->trso_flags & TR_RAID1E_F_ABORT) == 0) { g_raid_tr_raid1e_fail_disk(sd->sd_softc, nsd, nsd->sd_disk); } trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } rebuild_round_done: trs->trso_flags &= ~TR_RAID1E_F_LOCKED; g_raid_unlock_range(tr->tro_volume, trs->trso_lock_pos, trs->trso_lock_len); nsd->sd_rebuild_pos += bp->bio_length; if (nsd->sd_rebuild_pos >= nsd->sd_size) { g_raid_tr_raid1e_rebuild_finish(tr); return; } /* Abort rebuild if we are stopping */ if (trs->trso_stopping) { trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; g_raid_tr_raid1e_rebuild_abort(tr); return; } if (--trs->trso_meta_update <= 0) { g_raid_write_metadata(vol->v_softc, vol, nsd, nsd->sd_disk); trs->trso_meta_update = g_raid1e_rebuild_meta_update; /* Compensate short rebuild I/Os. */ if ((vol->v_disks_count % N) != 0 && vol->v_strip_size < g_raid1e_rebuild_slab) { trs->trso_meta_update *= g_raid1e_rebuild_slab; trs->trso_meta_update /= vol->v_strip_size; } } trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; if (--trs->trso_recover_slabs <= 0) return; /* Run next rebuild iteration. */ g_raid_tr_raid1e_rebuild_some(tr); } } else if (trs->trso_type == TR_RAID1E_RESYNC) { /* * read good sd, read bad sd in parallel. when both * done, compare the buffers. write good to the bad * if different. do the next bit of work. */ panic("Somehow, we think we're doing a resync"); } return; } pbp = bp->bio_parent; pbp->bio_inbed++; mask = (intptr_t)bp->bio_caller2; if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { /* * Read failed on first drive. Retry the read error on * another disk drive, if available, before erroring out the * read. 
*/ sd->sd_disk->d_read_errs++; G_RAID_LOGREQ(0, bp, "Read error (%d), %d read errors total", bp->bio_error, sd->sd_disk->d_read_errs); /* * If there are too many read errors, we move to degraded. * XXX Do we want to FAIL the drive (e.g., make the user redo * everything to get it back in sync), or just degrade the * drive, which kicks off a resync? */ do_write = 0; if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); else if (mask == 0) do_write = 1; /* Restore what we were doing. */ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy); V2P(vol, virtual, &disk, &offset, &start); /* Find the other disk, and try to do the I/O to it. */ mask |= 1 << copy; best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset, start, mask); if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { disk += best; if (disk >= vol->v_disks_count) { disk -= vol->v_disks_count; offset += vol->v_strip_size; } cbp->bio_offset = offset + start; cbp->bio_length = bp->bio_length; cbp->bio_data = bp->bio_data; g_destroy_bio(bp); nsd = &vol->v_subdisks[disk]; G_RAID_LOGREQ(2, cbp, "Retrying read from %d", nsd->sd_pos); if (do_write) mask |= 1 << 31; if ((mask & (1 << 31)) != 0) sd->sd_recovery++; cbp->bio_caller2 = (void *)mask; if (do_write) { cbp->bio_caller1 = nsd; /* Lock callback starts I/O */ g_raid_lock_range(sd->sd_volume, virtual, cbp->bio_length, pbp, cbp); } else { g_raid_subdisk_iostart(nsd, cbp); } return; } /* * We can't retry. Return the original error by falling * through. This will happen when there's only one good disk. * We don't need to fail the raid, since its actual state is * based on the state of the subdisks. */ G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); } if (bp->bio_cmd == BIO_READ && bp->bio_error == 0 && (mask & (1 << 31)) != 0) { G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); /* Restore what we were doing.
*/ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy); V2P(vol, virtual, &disk, &offset, &start); /* Find best disk to write. */ best = g_raid_tr_raid1e_select_read_disk(vol, disk, offset, start, ~mask); if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { disk += best; if (disk >= vol->v_disks_count) { disk -= vol->v_disks_count; offset += vol->v_strip_size; } cbp->bio_offset = offset + start; cbp->bio_length = bp->bio_length; cbp->bio_data = bp->bio_data; cbp->bio_cmd = BIO_WRITE; cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; cbp->bio_caller2 = (void *)mask; g_destroy_bio(bp); G_RAID_LOGREQ(2, cbp, "Attempting bad sector remap on failing drive."); g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp); return; } } if ((mask & (1 << 31)) != 0) { /* * We're done with a recovery, mark the range as unlocked. * For any write errors, we aggressively fail the disk since * there was both a READ and a WRITE error at this location. * Both types of errors generally indicate the drive is on * the verge of total failure anyway. Better to stop trusting * it now. However, we need to reset error to 0 in that case * because we're not failing the original I/O which succeeded. */ /* Restore what we were doing.
*/ P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy); V2P(vol, virtual, &disk, &offset, &start); for (copy = 0; copy < N; copy++) { if ((mask & (1 << copy) ) != 0) vol->v_subdisks[(disk + copy) % vol->v_disks_count].sd_recovery--; } if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { G_RAID_LOGREQ(0, bp, "Remap write failed: " "failing subdisk."); g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); bp->bio_error = 0; } G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length); } if (pbp->bio_cmd != BIO_READ) { if (pbp->bio_inbed == 1 || pbp->bio_error != 0) pbp->bio_error = bp->bio_error; - if (bp->bio_error != 0) { + if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); } error = pbp->bio_error; } else error = bp->bio_error; g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_raid_iodone(pbp, error); } } static int g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t boffset, size_t blength) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; char *addr; off_t offset, start, length, remain; u_int no, strip_size; int i, error; vol = tr->tro_volume; addr = virtual; strip_size = vol->v_strip_size; V2P(vol, boffset, &no, &offset, &start); remain = blength; bioq_init(&queue); while (remain > 0) { length = MIN(strip_size - start, remain); for (i = 0; i < N; i++) { sd = &vol->v_subdisks[no]; switch (sd->sd_state) { case G_RAID_SUBDISK_S_ACTIVE: case G_RAID_SUBDISK_S_STALE: case G_RAID_SUBDISK_S_RESYNC: break; case G_RAID_SUBDISK_S_REBUILD: if (offset + start >= sd->sd_rebuild_pos) goto nextdisk; break; default: goto nextdisk; } error = g_raid_subdisk_kerneldump(sd, addr, 0, offset + start, length); if (error != 0) return (error); nextdisk: if (++no >=
vol->v_disks_count) { no = 0; offset += strip_size; } } remain -= length; addr += length; start = 0; } return (0); } static int g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp) { struct bio *bp; struct g_raid_subdisk *sd; bp = (struct bio *)argp; sd = (struct g_raid_subdisk *)bp->bio_caller1; g_raid_subdisk_iostart(sd, bp); return (0); } static int g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; struct g_raid_volume *vol; vol = tr->tro_volume; trs = (struct g_raid_tr_raid1e_object *)tr; trs->trso_fair_io = g_raid1e_rebuild_fair_io; trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle; /* Compensate short rebuild I/Os. */ if ((vol->v_disks_count % N) != 0 && vol->v_strip_size < g_raid1e_rebuild_slab) { trs->trso_recover_slabs *= g_raid1e_rebuild_slab; trs->trso_recover_slabs /= vol->v_strip_size; } if (trs->trso_type == TR_RAID1E_REBUILD) g_raid_tr_raid1e_rebuild_some(tr); return (0); } static int g_raid_tr_free_raid1e(struct g_raid_tr_object *tr) { struct g_raid_tr_raid1e_object *trs; trs = (struct g_raid_tr_raid1e_object *)tr; if (trs->trso_buffer != NULL) { free(trs->trso_buffer, M_TR_RAID1E); trs->trso_buffer = NULL; } return (0); } G_RAID_TR_DECLARE(raid1e, "RAID1E"); Index: stable/8/sys/geom =================================================================== --- stable/8/sys/geom (revision 243678) +++ stable/8/sys/geom (revision 243679) Property changes on: stable/8/sys/geom ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/geom:r242323,242328 Index: stable/8/sys =================================================================== --- stable/8/sys (revision 243678) +++ stable/8/sys (revision 243679) Property changes on: stable/8/sys ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys:r242323,242328