diff --git a/sys/geom/bde/g_bde.h b/sys/geom/bde/g_bde.h index 3f36bb4281ae..2b9cc0532ef9 100644 --- a/sys/geom/bde/g_bde.h +++ b/sys/geom/bde/g_bde.h @@ -1,212 +1,217 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_GEOM_BDE_G_BDE_H_ #define _SYS_GEOM_BDE_G_BDE_H_ 1 /* * These are quite, but not entirely unlike constants. * * They are not commented in details here, to prevent unadvisable * experimentation. Please consult the code where they are used before you * even think about modifying these. */ #define G_BDE_MKEYLEN (2048/8) #define G_BDE_SKEYBITS 128 #define G_BDE_SKEYLEN (G_BDE_SKEYBITS/8) #define G_BDE_KKEYBITS 128 #define G_BDE_KKEYLEN (G_BDE_KKEYBITS/8) #define G_BDE_MAXKEYS 4 #define G_BDE_LOCKSIZE 384 #define NLOCK_FIELDS 13 /* This just needs to be "large enough" */ #define G_BDE_KEYBYTES 304 +/* This file is being included by userspace. */ +#ifndef __diagused +#define __diagused +#endif + struct g_bde_work; struct g_bde_softc; struct g_bde_sector { struct g_bde_work *owner; struct g_bde_softc *softc; off_t offset; u_int size; u_int ref; void *data; TAILQ_ENTRY(g_bde_sector) list; u_char valid; u_char malloc; enum {JUNK, IO, VALID} state; int error; time_t used; }; struct g_bde_work { struct mtx mutex; off_t offset; off_t length; void *data; struct bio *bp; struct g_bde_softc *softc; off_t so; off_t kso; u_int ko; struct g_bde_sector *sp; struct g_bde_sector *ksp; TAILQ_ENTRY(g_bde_work) list; enum {SETUP, WAIT, FINISH} state; int error; }; /* * The decrypted contents of the lock sectors. Notice that this is not * the same as the on-disk layout. The on-disk layout is dynamic and * dependent on the pass-phrase. */ struct g_bde_key { uint64_t sector0; /* Physical byte offset of 1st byte used */ uint64_t sectorN; /* Physical byte offset of 1st byte not used */ uint64_t keyoffset; /* Number of bytes the disk image is skewed. 
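 *
 * A note on the __diagused shim added above: the AES_* wrappers at the
 * bottom of this header keep their "error" locals only for KASSERT()
 * checks, so in userspace builds (and in kernels without INVARIANTS,
 * where KASSERT() expands to nothing) the stores would otherwise draw
 * set-but-unused warnings.  A minimal sketch of the pattern, using a
 * hypothetical check_positive() helper:
 *
 *	static __inline void
 *	check_positive(int value)
 *	{
 *		int ok __diagused;
 *
 *		ok = (value > 0);
 *		KASSERT(ok, ("value %d is not positive", value));
 *	}
 *
 * In the kernel proper __diagused expands to __unused unless
 * INVARIANTS or WITNESS is defined; the empty fallback above is only
 * for userspace consumers of this header.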
 */
	uint64_t	lsector[G_BDE_MAXKEYS];	/* Physical byte offsets of lock sectors */
	uint32_t	sectorsize;	/* Our "logical" sector size */
	uint32_t	flags;
#define	GBDE_F_SECT0	1
	uint8_t		salt[16];	/* Used to frustrate the kkey generation */
	uint8_t		spare[32];	/* For future use, random contents */
	uint8_t		mkey[G_BDE_MKEYLEN];	/* Our masterkey. */

	/* Non-stored help-fields */
	uint64_t	zone_width;	/* On-disk width of zone */
	uint64_t	zone_cont;	/* Payload width of zone */
	uint64_t	media_width;	/* Non-magic width of zone */
	u_int		keys_per_sector;
};

struct g_bde_softc {
	off_t		mediasize;
	u_int		sectorsize;
	uint64_t	zone_cont;
	struct g_geom	*geom;
	struct g_consumer *consumer;
	TAILQ_HEAD(, g_bde_sector)	freelist;
	TAILQ_HEAD(, g_bde_work)	worklist;
	struct mtx	worklist_mutex;
	struct proc	*thread;
	struct g_bde_key key;
	int		dead;
	u_int		nwork;
	u_int		nsect;
	u_int		ncache;
	u_char		sha2[SHA512_DIGEST_LENGTH];
};

/* g_bde_crypt.c */
void g_bde_crypt_delete(struct g_bde_work *wp);
void g_bde_crypt_read(struct g_bde_work *wp);
void g_bde_crypt_write(struct g_bde_work *wp);

/* g_bde_key.c */
void g_bde_zap_key(struct g_bde_softc *sc);
int g_bde_get_key(struct g_bde_softc *sc, void *ptr, int len);
int g_bde_init_keybytes(struct g_bde_softc *sc, char *passp, int len);

/* g_bde_lock.c */
int g_bde_encode_lock(u_char *sha2, struct g_bde_key *gl, u_char *ptr);
int g_bde_decode_lock(struct g_bde_softc *sc, struct g_bde_key *gl, u_char *ptr);
int g_bde_keyloc_encrypt(u_char *sha2, uint64_t v0, uint64_t v1, void *output);
int g_bde_keyloc_decrypt(u_char *sha2, void *input, uint64_t *output);
int g_bde_decrypt_lock(struct g_bde_softc *sc, u_char *keymat, u_char *meta,
    off_t mediasize, u_int sectorsize, u_int *nkey);
void g_bde_hash_pass(struct g_bde_softc *sc, const void *input, u_int len);

/* g_bde_math.c */
uint64_t g_bde_max_sector(struct g_bde_key *lp);
void g_bde_map_sector(struct g_bde_work *wp);

/* g_bde_work.c */
void g_bde_start1(struct bio *bp);
void g_bde_worker(void *arg);

/*
 * These four functions wrap the raw Rijndael functions and make sure we
 * explode if something fails which shouldn't.
 */

static __inline void
AES_init(cipherInstance *ci)
{
-	int error;
+	int error __diagused;

	error = rijndael_cipherInit(ci, MODE_CBC, NULL);
	KASSERT(error > 0, ("rijndael_cipherInit %d", error));
}

static __inline void
AES_makekey(keyInstance *ki, int dir, u_int len, const void *key)
{
-	int error;
+	int error __diagused;

	error = rijndael_makeKey(ki, dir, len, key);
	KASSERT(error > 0, ("rijndael_makeKey %d", error));
}

static __inline void
AES_encrypt(cipherInstance *ci, keyInstance *ki, const void *in,
    void *out, u_int len)
{
-	int error;
+	int error __diagused;

	error = rijndael_blockEncrypt(ci, ki, in, len * 8, out);
	KASSERT(error > 0, ("rijndael_blockEncrypt %d", error));
}

static __inline void
AES_decrypt(cipherInstance *ci, keyInstance *ki, const void *in,
    void *out, u_int len)
{
-	int error;
+	int error __diagused;

	error = rijndael_blockDecrypt(ci, ki, in, len * 8, out);
	KASSERT(error > 0, ("rijndael_blockDecrypt %d", error));
}

#endif /* _SYS_GEOM_BDE_G_BDE_H_ */

diff --git a/sys/geom/journal/g_journal.c b/sys/geom/journal/g_journal.c
index 954d0dbf2c6b..60c2aeb4fad6 100644
--- a/sys/geom/journal/g_journal.c
+++ b/sys/geom/journal/g_journal.c
@@ -1,3022 +1,3022 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef GJ_MEMDEBUG #include #include #endif #include #include #include #include #include FEATURE(geom_journal, "GEOM journaling support"); /* * On-disk journal format: * * JH - Journal header * RH - Record header * * %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ... 
* %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% * */ CTASSERT(sizeof(struct g_journal_header) <= 512); CTASSERT(sizeof(struct g_journal_record_header) <= 512); static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data"); static struct mtx g_journal_cache_mtx; MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF); const struct g_journal_desc *g_journal_filesystems[] = { &g_journal_ufs, NULL }; SYSCTL_DECL(_kern_geom); int g_journal_debug = 0; static u_int g_journal_switch_time = 10; static u_int g_journal_force_switch = 70; static u_int g_journal_parallel_flushes = 16; static u_int g_journal_parallel_copies = 16; static u_int g_journal_accept_immediately = 64; static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES; static u_int g_journal_do_optimize = 1; static SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_JOURNAL stuff"); SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RWTUN, &g_journal_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW, &g_journal_switch_time, 0, "Switch journals every N seconds"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW, &g_journal_force_switch, 0, "Force switch when journal is N% full"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW, &g_journal_parallel_flushes, 0, "Number of flush I/O requests to send in parallel"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW, &g_journal_accept_immediately, 0, "Number of I/O requests accepted immediately"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW, &g_journal_parallel_copies, 0, "Number of copy I/O requests to send in parallel"); static int g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS) { u_int entries; int error; entries = g_journal_record_entries; error = sysctl_handle_int(oidp, &entries, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES) return (EINVAL); g_journal_record_entries = entries; return (0); } SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, g_journal_record_entries_sysctl, "I", "Maximum number of entires in one journal record"); SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW, &g_journal_do_optimize, 0, "Try to combine bios on flush and copy"); static u_long g_journal_cache_used = 0; static u_long g_journal_cache_limit = 64 * 1024 * 1024; static u_int g_journal_cache_divisor = 2; static u_int g_journal_cache_switch = 90; static u_int g_journal_cache_misses = 0; static u_int g_journal_cache_alloc_failures = 0; static u_long g_journal_cache_low = 0; static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_JOURNAL cache"); SYSCTL_ULONG(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD, &g_journal_cache_used, 0, "Number of allocated bytes"); static int g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS) { u_long limit; int error; limit = g_journal_cache_limit; error = sysctl_handle_long(oidp, &limit, 0, req); if (error != 0 || req->newptr == NULL) return (error); g_journal_cache_limit = limit; g_journal_cache_low = (limit / 100) * g_journal_cache_switch; return (0); } SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, g_journal_cache_limit_sysctl, "I", "Maximum number of allocated bytes"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, 
divisor, CTLFLAG_RDTUN, &g_journal_cache_divisor, 0, "(kmem_size / kern.geom.journal.cache.divisor) == cache size"); static int g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS) { u_int cswitch; int error; cswitch = g_journal_cache_switch; error = sysctl_handle_int(oidp, &cswitch, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (cswitch > 100) return (EINVAL); g_journal_cache_switch = cswitch; g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch; return (0); } SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, g_journal_cache_switch_sysctl, "I", "Force switch when we hit this percent of cache use"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW, &g_journal_cache_misses, 0, "Number of cache misses"); SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW, &g_journal_cache_alloc_failures, 0, "Memory allocation failures"); static u_long g_journal_stats_bytes_skipped = 0; static u_long g_journal_stats_combined_ios = 0; static u_long g_journal_stats_switches = 0; static u_long g_journal_stats_wait_for_copy = 0; static u_long g_journal_stats_journal_full = 0; static u_long g_journal_stats_low_mem = 0; static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_JOURNAL statistics"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW, &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW, &g_journal_stats_combined_ios, 0, "Number of combined I/O requests"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW, &g_journal_stats_switches, 0, "Number of journal switches"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW, &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch"); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW, &g_journal_stats_journal_full, 0, "Number of times journal was almost full."); SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW, &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called."); static g_taste_t g_journal_taste; static g_ctl_req_t g_journal_config; static g_dumpconf_t g_journal_dumpconf; static g_init_t g_journal_init; static g_fini_t g_journal_fini; struct g_class g_journal_class = { .name = G_JOURNAL_CLASS_NAME, .version = G_VERSION, .taste = g_journal_taste, .ctlreq = g_journal_config, .dumpconf = g_journal_dumpconf, .init = g_journal_init, .fini = g_journal_fini }; static int g_journal_destroy(struct g_journal_softc *sc); static void g_journal_metadata_update(struct g_journal_softc *sc); static void g_journal_start_switcher(struct g_class *mp); static void g_journal_stop_switcher(void); static void g_journal_switch_wait(struct g_journal_softc *sc); #define GJ_SWITCHER_WORKING 0 #define GJ_SWITCHER_DIE 1 #define GJ_SWITCHER_DIED 2 static struct proc *g_journal_switcher_proc = NULL; static int g_journal_switcher_state = GJ_SWITCHER_WORKING; static int g_journal_switcher_wokenup = 0; static int g_journal_sync_requested = 0; #ifdef GJ_MEMDEBUG struct meminfo { size_t mi_size; struct stack mi_stack; }; #endif /* * We use our own malloc/realloc/free functions, so we can collect statistics * and force journal switch when we're running out of cache. 
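 *
 * A worked example with the defaults above (limit = 64 MB,
 * kern.geom.journal.cache.switch = 90):
 *
 *	g_journal_cache_low = (67108864 / 100) * 90 = 60397920
 *
 * so once about 57.6 MB of cache is in use, gj_malloc() below wakes the
 * switcher thread, and an M_NOWAIT allocation that would push the total
 * past the full limit fails and is counted in
 * kern.geom.journal.cache.alloc_failures.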
*/ static void * gj_malloc(size_t size, int flags) { void *p; #ifdef GJ_MEMDEBUG struct meminfo *mi; #endif mtx_lock(&g_journal_cache_mtx); if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup && g_journal_cache_used + size > g_journal_cache_low) { GJ_DEBUG(1, "No cache, waking up the switcher."); g_journal_switcher_wokenup = 1; wakeup(&g_journal_switcher_state); } if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 && g_journal_cache_used + size > g_journal_cache_limit) { mtx_unlock(&g_journal_cache_mtx); g_journal_cache_alloc_failures++; return (NULL); } g_journal_cache_used += size; mtx_unlock(&g_journal_cache_mtx); flags &= ~M_NOWAIT; #ifndef GJ_MEMDEBUG p = malloc(size, M_JOURNAL, flags | M_WAITOK); #else mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK); p = (u_char *)mi + sizeof(*mi); mi->mi_size = size; stack_save(&mi->mi_stack); #endif return (p); } static void gj_free(void *p, size_t size) { #ifdef GJ_MEMDEBUG struct meminfo *mi; #endif KASSERT(p != NULL, ("p=NULL")); KASSERT(size > 0, ("size=0")); mtx_lock(&g_journal_cache_mtx); KASSERT(g_journal_cache_used >= size, ("Freeing too much?")); g_journal_cache_used -= size; mtx_unlock(&g_journal_cache_mtx); #ifdef GJ_MEMDEBUG mi = p = (void *)((u_char *)p - sizeof(*mi)); if (mi->mi_size != size) { printf("GJOURNAL: Size mismatch! %zu != %zu\n", size, mi->mi_size); printf("GJOURNAL: Alloc backtrace:\n"); stack_print(&mi->mi_stack); printf("GJOURNAL: Free backtrace:\n"); kdb_backtrace(); } #endif free(p, M_JOURNAL); } static void * gj_realloc(void *p, size_t size, size_t oldsize) { void *np; #ifndef GJ_MEMDEBUG mtx_lock(&g_journal_cache_mtx); g_journal_cache_used -= oldsize; g_journal_cache_used += size; mtx_unlock(&g_journal_cache_mtx); np = realloc(p, size, M_JOURNAL, M_WAITOK); #else np = gj_malloc(size, M_WAITOK); bcopy(p, np, MIN(oldsize, size)); gj_free(p, oldsize); #endif return (np); } static void g_journal_check_overflow(struct g_journal_softc *sc) { off_t length, used; if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset && sc->sc_journal_offset >= sc->sc_inactive.jj_offset) || (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset && sc->sc_journal_offset >= sc->sc_inactive.jj_offset && sc->sc_journal_offset < sc->sc_active.jj_offset)) { panic("Journal overflow " "(id = %u joffset=%jd active=%jd inactive=%jd)", (unsigned)sc->sc_id, (intmax_t)sc->sc_journal_offset, (intmax_t)sc->sc_active.jj_offset, (intmax_t)sc->sc_inactive.jj_offset); } if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) { length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset; used = sc->sc_journal_offset - sc->sc_active.jj_offset; } else { length = sc->sc_jend - sc->sc_active.jj_offset; length += sc->sc_inactive.jj_offset - sc->sc_jstart; if (sc->sc_journal_offset >= sc->sc_active.jj_offset) used = sc->sc_journal_offset - sc->sc_active.jj_offset; else { used = sc->sc_jend - sc->sc_active.jj_offset; used += sc->sc_journal_offset - sc->sc_jstart; } } /* Already woken up? */ if (g_journal_switcher_wokenup) return; /* * If the active journal takes more than g_journal_force_switch precent * of free journal space, we force journal switch. 
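 *
 * A worked example of the wrap-around case, assuming sc_jstart = 0,
 * sc_jend = 1000, an active journal at offset 800, the inactive one at
 * 300 and sc_journal_offset = 100:
 *
 *	length = (1000 - 800) + (300 - 0) = 500
 *	used   = (1000 - 800) + (100 - 0) = 300
 *
 * i.e. the active journal occupies 60% of the free journal space, so
 * with the default kern.geom.journal.force_switch of 70 the switch
 * would be forced once roughly 50 more bytes are journalled.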
*/ KASSERT(length > 0, ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd", (intmax_t)length, (intmax_t)used, (intmax_t)sc->sc_active.jj_offset, (intmax_t)sc->sc_inactive.jj_offset, (intmax_t)sc->sc_journal_offset)); if ((used * 100) / length > g_journal_force_switch) { g_journal_stats_journal_full++; GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.", sc->sc_name, (used * 100) / length); mtx_lock(&g_journal_cache_mtx); g_journal_switcher_wokenup = 1; wakeup(&g_journal_switcher_state); mtx_unlock(&g_journal_cache_mtx); } } static void g_journal_orphan(struct g_consumer *cp) { struct g_journal_softc *sc; char name[256]; int error; g_topology_assert(); sc = cp->geom->softc; strlcpy(name, cp->provider->name, sizeof(name)); GJ_DEBUG(0, "Lost provider %s.", name); if (sc == NULL) return; error = g_journal_destroy(sc); if (error == 0) GJ_DEBUG(0, "Journal %s destroyed.", name); else { GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). " "Destroy it manually after last close.", sc->sc_name, error); } } static int g_journal_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_journal_softc *sc; int dcw; g_topology_assert(); GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); dcw = pp->acw + acw; sc = pp->geom->softc; if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); else return (ENXIO); } if (pp->acw == 0 && dcw > 0) { GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name); sc->sc_flags &= ~GJF_DEVICE_CLEAN; g_topology_unlock(); g_journal_metadata_update(sc); g_topology_lock(); } /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) { GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); sc->sc_flags |= GJF_DEVICE_CLEAN; g_topology_unlock(); g_journal_metadata_update(sc); g_topology_lock(); } */ return (0); } static void g_journal_header_encode(struct g_journal_header *hdr, u_char *data) { bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC)); data += sizeof(GJ_HEADER_MAGIC); le32enc(data, hdr->jh_journal_id); data += 4; le32enc(data, hdr->jh_journal_next_id); } static int g_journal_header_decode(const u_char *data, struct g_journal_header *hdr) { bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic)); data += sizeof(hdr->jh_magic); if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0) return (EINVAL); hdr->jh_journal_id = le32dec(data); data += 4; hdr->jh_journal_next_id = le32dec(data); return (0); } static void g_journal_flush_cache(struct g_journal_softc *sc) { struct bintime bt; int error; if (sc->sc_bio_flush == 0) return; GJ_TIMER_START(1, &bt); if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) { error = g_io_flush(sc->sc_jconsumer); GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.", sc->sc_jconsumer->provider->name, error); } if (sc->sc_bio_flush & GJ_FLUSH_DATA) { /* * TODO: This could be called in parallel with the * previous call. */ error = g_io_flush(sc->sc_dconsumer); GJ_DEBUG(error == 0 ? 
2 : 0, "Flush cache of %s: error=%d.", sc->sc_dconsumer->provider->name, error); } GJ_TIMER_STOP(1, &bt, "Cache flush time"); } static int g_journal_write_header(struct g_journal_softc *sc) { struct g_journal_header hdr; struct g_consumer *cp; u_char *buf; int error; cp = sc->sc_jconsumer; buf = gj_malloc(cp->provider->sectorsize, M_WAITOK); strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic)); hdr.jh_journal_id = sc->sc_journal_id; hdr.jh_journal_next_id = sc->sc_journal_next_id; g_journal_header_encode(&hdr, buf); error = g_write_data(cp, sc->sc_journal_offset, buf, cp->provider->sectorsize); /* if (error == 0) */ sc->sc_journal_offset += cp->provider->sectorsize; gj_free(buf, cp->provider->sectorsize); return (error); } /* * Every journal record has a header and data following it. * Functions below are used to decode the header before storing it to * little endian and to encode it after reading to system endianness. */ static void g_journal_record_header_encode(struct g_journal_record_header *hdr, u_char *data) { struct g_journal_entry *ent; u_int i; bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC)); data += sizeof(GJ_RECORD_HEADER_MAGIC); le32enc(data, hdr->jrh_journal_id); data += 8; le16enc(data, hdr->jrh_nentries); data += 2; bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum)); data += 8; for (i = 0; i < hdr->jrh_nentries; i++) { ent = &hdr->jrh_entries[i]; le64enc(data, ent->je_joffset); data += 8; le64enc(data, ent->je_offset); data += 8; le64enc(data, ent->je_length); data += 8; } } static int g_journal_record_header_decode(const u_char *data, struct g_journal_record_header *hdr) { struct g_journal_entry *ent; u_int i; bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic)); data += sizeof(hdr->jrh_magic); if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0) return (EINVAL); hdr->jrh_journal_id = le32dec(data); data += 8; hdr->jrh_nentries = le16dec(data); data += 2; if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES) return (EINVAL); bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum)); data += 8; for (i = 0; i < hdr->jrh_nentries; i++) { ent = &hdr->jrh_entries[i]; ent->je_joffset = le64dec(data); data += 8; ent->je_offset = le64dec(data); data += 8; ent->je_length = le64dec(data); data += 8; } return (0); } /* * Function reads metadata from a provider (via the given consumer), decodes * it to system endianness and verifies its correctness. */ static int g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); /* Metadata is stored in last sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) { GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).", cp->provider->name, error); return (error); } /* Decode metadata. */ error = journal_metadata_decode(buf, md); g_free(buf); /* Is this is gjournal provider at all? */ if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0) return (EINVAL); /* * Are we able to handle this version of metadata? * We only maintain backward compatibility. */ if (md->md_version > G_JOURNAL_VERSION) { GJ_DEBUG(0, "Kernel module is too old to handle metadata from %s.", cp->provider->name); return (EINVAL); } /* Is checksum correct? 
*/ if (error != 0) { GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.", cp->provider->name); return (error); } return (0); } /* * Two functions below are responsible for updating metadata. * Only metadata on the data provider is updated (we need to update * information about active journal in there). */ static void g_journal_metadata_done(struct bio *bp) { /* * There is not much we can do on error except informing about it. */ if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).", bp->bio_error); } else { GJ_LOGREQ(2, bp, "Metadata updated."); } gj_free(bp->bio_data, bp->bio_length); g_destroy_bio(bp); } static void g_journal_metadata_update(struct g_journal_softc *sc) { struct g_journal_metadata md; struct g_consumer *cp; struct bio *bp; u_char *sector; cp = sc->sc_dconsumer; sector = gj_malloc(cp->provider->sectorsize, M_WAITOK); strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic)); md.md_version = G_JOURNAL_VERSION; md.md_id = sc->sc_id; md.md_type = sc->sc_orig_type; md.md_jstart = sc->sc_jstart; md.md_jend = sc->sc_jend; md.md_joffset = sc->sc_inactive.jj_offset; md.md_jid = sc->sc_journal_previous_id; md.md_flags = 0; if (sc->sc_flags & GJF_DEVICE_CLEAN) md.md_flags |= GJ_FLAG_CLEAN; if (sc->sc_flags & GJF_DEVICE_HARDCODED) strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider)); else bzero(md.md_provider, sizeof(md.md_provider)); md.md_provsize = cp->provider->mediasize; journal_metadata_encode(&md, sector); /* * Flush the cache, so we know all data are on disk. * We write here informations like "journal is consistent", so we need * to be sure it is. Without BIO_FLUSH here, we can end up in situation * where metadata is stored on disk, but not all data. */ g_journal_flush_cache(sc); bp = g_alloc_bio(); bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize; bp->bio_length = cp->provider->sectorsize; bp->bio_data = sector; bp->bio_cmd = BIO_WRITE; if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) { bp->bio_done = g_journal_metadata_done; g_io_request(bp, cp); } else { bp->bio_done = NULL; g_io_request(bp, cp); biowait(bp, "gjmdu"); g_journal_metadata_done(bp); } /* * Be sure metadata reached the disk. */ g_journal_flush_cache(sc); } /* * This is where the I/O request comes from the GEOM. 
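 *
 * One case below deserves an illustration: the "GJOURNAL::provider"
 * attribute is how a file system can discover that it sits on top of a
 * gjournal provider.  A rough consumer-side sketch (assumed usage, cp
 * being a g_consumer attached to the .journal provider, error handling
 * omitted):
 *
 *	char name[64];
 *	int len;
 *
 *	len = sizeof(name);
 *	if (g_io_getattr("GJOURNAL::provider", cp, &len, name) == 0)
 *		printf("backed by gjournal provider %s\n", name);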
*/ static void g_journal_start(struct bio *bp) { struct g_journal_softc *sc; sc = bp->bio_to->geom->softc; GJ_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: mtx_lock(&sc->sc_mtx); bioq_insert_tail(&sc->sc_regular_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_mtx); return; case BIO_GETATTR: if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) { strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length); bp->bio_completed = strlen(bp->bio_to->name) + 1; g_io_deliver(bp, 0); return; } /* FALLTHROUGH */ case BIO_SPEEDUP: case BIO_DELETE: default: g_io_deliver(bp, EOPNOTSUPP); return; } } static void g_journal_std_done(struct bio *bp) { struct g_journal_softc *sc; sc = bp->bio_from->geom->softc; mtx_lock(&sc->sc_mtx); bioq_insert_tail(&sc->sc_back_queue, bp); wakeup(sc); mtx_unlock(&sc->sc_mtx); } static struct bio * g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data, int flags) { struct bio *bp; bp = g_alloc_bio(); bp->bio_offset = start; bp->bio_joffset = joffset; bp->bio_length = end - start; bp->bio_cmd = BIO_WRITE; bp->bio_done = g_journal_std_done; if (data == NULL) bp->bio_data = NULL; else { bp->bio_data = gj_malloc(bp->bio_length, flags); if (bp->bio_data != NULL) bcopy(data, bp->bio_data, bp->bio_length); } return (bp); } #define g_journal_insert_bio(head, bp, flags) \ g_journal_insert((head), (bp)->bio_offset, \ (bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \ (bp)->bio_data, flags) /* * The function below does a lot more than just inserting bio to the queue. * It keeps the queue sorted by offset and ensures that there are no doubled * data (it combines bios where ranges overlap). * * The function returns the number of bios inserted (as bio can be splitted). */ static int g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset, u_char *data, int flags) { struct bio *nbp, *cbp, *pbp; off_t cstart, cend; u_char *tmpdata; int n; GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend, joffset); n = 0; pbp = NULL; GJQ_FOREACH(*head, cbp) { cstart = cbp->bio_offset; cend = cbp->bio_offset + cbp->bio_length; if (nstart >= cend) { /* * +-------------+ * | | * | current | +-------------+ * | bio | | | * | | | new | * +-------------+ | bio | * | | * +-------------+ */ GJ_DEBUG(3, "INSERT(%p): 1", *head); } else if (nend <= cstart) { /* * +-------------+ * | | * +-------------+ | current | * | | | bio | * | new | | | * | bio | +-------------+ * | | * +-------------+ */ nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = cbp; n++; GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp, pbp); goto end; } else if (nstart <= cstart && nend >= cend) { /* * +-------------+ +-------------+ * | current bio | | current bio | * +---+-------------+---+ +-------------+---+ * | | | | | | | * | | | | | | | * | +-------------+ | +-------------+ | * | new bio | | new bio | * +---------------------+ +-----------------+ * * +-------------+ +-------------+ * | current bio | | current bio | * +---+-------------+ +-------------+ * | | | | | * | | | | | * | +-------------+ +-------------+ * | new bio | | new bio | * +-----------------+ +-------------+ */ g_journal_stats_bytes_skipped += cbp->bio_length; cbp->bio_offset = nstart; cbp->bio_joffset = joffset; cbp->bio_length = cend - nstart; if (cbp->bio_data != NULL) { gj_free(cbp->bio_data, cend - cstart); cbp->bio_data = NULL; } if (data != NULL) { cbp->bio_data = 
gj_malloc(cbp->bio_length, flags); if (cbp->bio_data != NULL) { bcopy(data, cbp->bio_data, cbp->bio_length); } data += cend - nstart; } joffset += cend - nstart; nstart = cend; GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp); } else if (nstart > cstart && nend >= cend) { /* * +-----------------+ +-------------+ * | current bio | | current bio | * | +-------------+ | +---------+---+ * | | | | | | | * | | | | | | | * +---+-------------+ +---+---------+ | * | new bio | | new bio | * +-------------+ +-------------+ */ g_journal_stats_bytes_skipped += cend - nstart; nbp = g_journal_new_bio(nstart, cend, joffset, data, flags); nbp->bio_next = cbp->bio_next; cbp->bio_next = nbp; cbp->bio_length = nstart - cstart; if (cbp->bio_data != NULL) { cbp->bio_data = gj_realloc(cbp->bio_data, cbp->bio_length, cend - cstart); } if (data != NULL) data += cend - nstart; joffset += cend - nstart; nstart = cend; n++; GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp); } else if (nstart > cstart && nend < cend) { /* * +---------------------+ * | current bio | * | +-------------+ | * | | | | * | | | | * +---+-------------+---+ * | new bio | * +-------------+ */ g_journal_stats_bytes_skipped += nend - nstart; nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); nbp->bio_next = cbp->bio_next; cbp->bio_next = nbp; if (cbp->bio_data == NULL) tmpdata = NULL; else tmpdata = cbp->bio_data + nend - cstart; nbp = g_journal_new_bio(nend, cend, cbp->bio_joffset + nend - cstart, tmpdata, flags); nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next; ((struct bio *)cbp->bio_next)->bio_next = nbp; cbp->bio_length = nstart - cstart; if (cbp->bio_data != NULL) { cbp->bio_data = gj_realloc(cbp->bio_data, cbp->bio_length, cend - cstart); } n += 2; GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp); goto end; } else if (nstart <= cstart && nend < cend) { /* * +-----------------+ +-------------+ * | current bio | | current bio | * +-------------+ | +---+---------+ | * | | | | | | | * | | | | | | | * +-------------+---+ | +---------+---+ * | new bio | | new bio | * +-------------+ +-------------+ */ g_journal_stats_bytes_skipped += nend - nstart; nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = cbp; cbp->bio_offset = nend; cbp->bio_length = cend - nend; cbp->bio_joffset += nend - cstart; tmpdata = cbp->bio_data; if (tmpdata != NULL) { cbp->bio_data = gj_malloc(cbp->bio_length, flags); if (cbp->bio_data != NULL) { bcopy(tmpdata + nend - cstart, cbp->bio_data, cbp->bio_length); } gj_free(tmpdata, cend - cstart); } n++; GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp); goto end; } if (nstart == nend) goto end; pbp = cbp; } nbp = g_journal_new_bio(nstart, nend, joffset, data, flags); if (pbp == NULL) *head = nbp; else pbp->bio_next = nbp; nbp->bio_next = NULL; n++; GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp); end: if (g_journal_debug >= 3) { GJQ_FOREACH(*head, cbp) { GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp, (intmax_t)cbp->bio_offset, (intmax_t)cbp->bio_length, (intmax_t)cbp->bio_joffset, cbp->bio_data); } GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n); } return (n); } /* * The function combines neighbour bios trying to squeeze as much data as * possible into one bio. * * The function returns the number of bios combined (negative value). */ static int g_journal_optimize(struct bio *head) { struct bio *cbp, *pbp; int n; n = 0; pbp = NULL; GJQ_FOREACH(head, cbp) { /* Skip bios which has to be read first. 
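 *
 * For example, a 16384-byte bio at offset 0 followed by an 8192-byte
 * neighbour at offset 16384 is merged below into a single 24576-byte
 * bio: the first buffer is gj_realloc()ed, the neighbour's payload is
 * copied behind it and the neighbour is destroyed, which the return
 * value records as -1.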
*/ if (cbp->bio_data == NULL) { pbp = NULL; continue; } /* There is no previous bio yet. */ if (pbp == NULL) { pbp = cbp; continue; } /* Is this a neighbour bio? */ if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) { /* Be sure that bios queue is sorted. */ KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset, ("poffset=%jd plength=%jd coffset=%jd", (intmax_t)pbp->bio_offset, (intmax_t)pbp->bio_length, (intmax_t)cbp->bio_offset)); pbp = cbp; continue; } /* Be sure we don't end up with too big bio. */ if (pbp->bio_length + cbp->bio_length > maxphys) { pbp = cbp; continue; } /* Ok, we can join bios. */ GJ_LOGREQ(4, pbp, "Join: "); GJ_LOGREQ(4, cbp, "and: "); pbp->bio_data = gj_realloc(pbp->bio_data, pbp->bio_length + cbp->bio_length, pbp->bio_length); bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length, cbp->bio_length); gj_free(cbp->bio_data, cbp->bio_length); pbp->bio_length += cbp->bio_length; pbp->bio_next = cbp->bio_next; g_destroy_bio(cbp); cbp = pbp; g_journal_stats_combined_ios++; n--; GJ_LOGREQ(4, pbp, "Got: "); } return (n); } /* * TODO: Update comment. * These are functions responsible for copying one portion of data from journal * to the destination provider. * The order goes like this: * 1. Read the header, which contains informations about data blocks * following it. * 2. Read the data blocks from the journal. * 3. Write the data blocks on the data provider. * * g_journal_copy_start() * g_journal_copy_done() - got finished write request, logs potential errors. */ /* * When there is no data in cache, this function is used to read it. */ static void g_journal_read_first(struct g_journal_softc *sc, struct bio *bp) { struct bio *cbp; /* * We were short in memory, so data was freed. * In that case we need to read it back from journal. */ cbp = g_alloc_bio(); cbp->bio_cflags = bp->bio_cflags; cbp->bio_parent = bp; cbp->bio_offset = bp->bio_joffset; cbp->bio_length = bp->bio_length; cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK); cbp->bio_cmd = BIO_READ; cbp->bio_done = g_journal_std_done; GJ_LOGREQ(4, cbp, "READ FIRST"); g_io_request(cbp, sc->sc_jconsumer); g_journal_cache_misses++; } static void g_journal_copy_send(struct g_journal_softc *sc) { struct bio *bioq, *bp, *lbp; bioq = lbp = NULL; mtx_lock(&sc->sc_mtx); for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) { bp = GJQ_FIRST(sc->sc_inactive.jj_queue); if (bp == NULL) break; GJQ_REMOVE(sc->sc_inactive.jj_queue, bp); sc->sc_copy_in_progress++; GJQ_INSERT_AFTER(bioq, bp, lbp); lbp = bp; } mtx_unlock(&sc->sc_mtx); if (g_journal_do_optimize) sc->sc_copy_in_progress += g_journal_optimize(bioq); while ((bp = GJQ_FIRST(bioq)) != NULL) { GJQ_REMOVE(bioq, bp); GJQ_INSERT_HEAD(sc->sc_copy_queue, bp); bp->bio_cflags = GJ_BIO_COPY; if (bp->bio_data == NULL) g_journal_read_first(sc, bp); else { bp->bio_joffset = 0; GJ_LOGREQ(4, bp, "SEND"); g_io_request(bp, sc->sc_dconsumer); } } } static void g_journal_copy_start(struct g_journal_softc *sc) { /* * Remember in metadata that we're starting to copy journaled data * to the data provider. * In case of power failure, we will copy these data once again on boot. */ if (!sc->sc_journal_copying) { sc->sc_journal_copying = 1; GJ_DEBUG(1, "Starting copy of journal."); g_journal_metadata_update(sc); } g_journal_copy_send(sc); } /* * Data block has been read from the journal provider. 
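 *
 * To recap the GJ_BIO_COPY life cycle: g_journal_read_first() above
 * issues a READ of the journaled block to sc_jconsumer; the handler
 * below moves the freshly read data to the parent bio and re-issues it
 * as a WRITE to sc_dconsumer; g_journal_copy_write_done() finally drops
 * sc_copy_in_progress and frees the buffer.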
*/ static int g_journal_copy_read_done(struct bio *bp) { struct g_journal_softc *sc; struct g_consumer *cp; struct bio *pbp; KASSERT(bp->bio_cflags == GJ_BIO_COPY, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); sc = bp->bio_from->geom->softc; pbp = bp->bio_parent; if (bp->bio_error != 0) { GJ_DEBUG(0, "Error while reading data from %s (error=%d).", bp->bio_to->name, bp->bio_error); /* * We will not be able to deliver WRITE request as well. */ gj_free(bp->bio_data, bp->bio_length); g_destroy_bio(pbp); g_destroy_bio(bp); sc->sc_copy_in_progress--; return (1); } pbp->bio_data = bp->bio_data; cp = sc->sc_dconsumer; g_io_request(pbp, cp); GJ_LOGREQ(4, bp, "READ DONE"); g_destroy_bio(bp); return (0); } /* * Data block has been written to the data provider. */ static void g_journal_copy_write_done(struct bio *bp) { struct g_journal_softc *sc; KASSERT(bp->bio_cflags == GJ_BIO_COPY, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); sc = bp->bio_from->geom->softc; sc->sc_copy_in_progress--; if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)", bp->bio_error); } GJQ_REMOVE(sc->sc_copy_queue, bp); gj_free(bp->bio_data, bp->bio_length); GJ_LOGREQ(4, bp, "DONE"); g_destroy_bio(bp); if (sc->sc_copy_in_progress == 0) { /* * This was the last write request for this journal. */ GJ_DEBUG(1, "Data has been copied."); sc->sc_journal_copying = 0; } } static void g_journal_flush_done(struct bio *bp); /* * Flush one record onto active journal provider. */ static void g_journal_flush(struct g_journal_softc *sc) { struct g_journal_record_header hdr; struct g_journal_entry *ent; struct g_provider *pp; struct bio **bioq; struct bio *bp, *fbp, *pbp; off_t joffset; u_char *data, hash[16]; MD5_CTX ctx; u_int i; if (sc->sc_current_count == 0) return; pp = sc->sc_jprovider; GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc); joffset = sc->sc_journal_offset; GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.", sc->sc_current_count, pp->name, (intmax_t)joffset); /* * Store 'journal id', so we know to which journal this record belongs. */ hdr.jrh_journal_id = sc->sc_journal_id; /* Could be less than g_journal_record_entries if called due timeout. */ hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries); strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic)); bioq = &sc->sc_active.jj_queue; GJQ_LAST(sc->sc_flush_queue, pbp); fbp = g_alloc_bio(); fbp->bio_parent = NULL; fbp->bio_cflags = GJ_BIO_JOURNAL; fbp->bio_offset = -1; fbp->bio_joffset = joffset; fbp->bio_length = pp->sectorsize; fbp->bio_cmd = BIO_WRITE; fbp->bio_done = g_journal_std_done; GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp); pbp = fbp; fbp->bio_to = pp; GJ_LOGREQ(4, fbp, "FLUSH_OUT"); joffset += pp->sectorsize; sc->sc_flush_count++; if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Init(&ctx); for (i = 0; i < hdr.jrh_nentries; i++) { bp = sc->sc_current_queue; KASSERT(bp != NULL, ("NULL bp")); bp->bio_to = pp; GJ_LOGREQ(4, bp, "FLUSHED"); sc->sc_current_queue = bp->bio_next; bp->bio_next = NULL; sc->sc_current_count--; /* Add to the header. 
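 *
 * The resulting on-disk layout for, say, three entries is the record
 * header sector immediately followed by the payloads:
 *
 *	[RH][data0][data1][data2]
 *
 * je_joffset of the first entry is the header's joffset plus one
 * sector, and every subsequent je_joffset advances by the previous
 * entry's je_length.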
*/ ent = &hdr.jrh_entries[i]; ent->je_offset = bp->bio_offset; ent->je_joffset = joffset; ent->je_length = bp->bio_length; data = bp->bio_data; if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Update(&ctx, data, ent->je_length); g_reset_bio(bp); bp->bio_cflags = GJ_BIO_JOURNAL; bp->bio_offset = ent->je_offset; bp->bio_joffset = ent->je_joffset; bp->bio_length = ent->je_length; bp->bio_data = data; bp->bio_cmd = BIO_WRITE; bp->bio_done = g_journal_std_done; GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp); pbp = bp; bp->bio_to = pp; GJ_LOGREQ(4, bp, "FLUSH_OUT"); joffset += bp->bio_length; sc->sc_flush_count++; /* * Add request to the active sc_journal_queue queue. * This is our cache. After journal switch we don't have to * read the data from the inactive journal, because we keep * it in memory. */ g_journal_insert(bioq, ent->je_offset, ent->je_offset + ent->je_length, ent->je_joffset, data, M_NOWAIT); } /* * After all requests, store valid header. */ data = gj_malloc(pp->sectorsize, M_WAITOK); if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { MD5Final(hash, &ctx); bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum)); } g_journal_record_header_encode(&hdr, data); fbp->bio_data = data; sc->sc_journal_offset = joffset; g_journal_check_overflow(sc); } /* * Flush request finished. */ static void g_journal_flush_done(struct bio *bp) { struct g_journal_softc *sc; struct g_consumer *cp; KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL)); cp = bp->bio_from; sc = cp->geom->softc; sc->sc_flush_in_progress--; if (bp->bio_error != 0) { GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)", bp->bio_error); } gj_free(bp->bio_data, bp->bio_length); GJ_LOGREQ(4, bp, "DONE"); g_destroy_bio(bp); } static void g_journal_release_delayed(struct g_journal_softc *sc); static void g_journal_flush_send(struct g_journal_softc *sc) { struct g_consumer *cp; struct bio *bioq, *bp, *lbp; cp = sc->sc_jconsumer; bioq = lbp = NULL; while (sc->sc_flush_in_progress < g_journal_parallel_flushes) { /* Send one flush requests to the active journal. */ bp = GJQ_FIRST(sc->sc_flush_queue); if (bp != NULL) { GJQ_REMOVE(sc->sc_flush_queue, bp); sc->sc_flush_count--; bp->bio_offset = bp->bio_joffset; bp->bio_joffset = 0; sc->sc_flush_in_progress++; GJQ_INSERT_AFTER(bioq, bp, lbp); lbp = bp; } /* Try to release delayed requests. */ g_journal_release_delayed(sc); /* If there are no requests to flush, leave. */ if (GJQ_FIRST(sc->sc_flush_queue) == NULL) break; } if (g_journal_do_optimize) sc->sc_flush_in_progress += g_journal_optimize(bioq); while ((bp = GJQ_FIRST(bioq)) != NULL) { GJQ_REMOVE(bioq, bp); GJ_LOGREQ(3, bp, "Flush request send"); g_io_request(bp, cp); } } static void g_journal_add_current(struct g_journal_softc *sc, struct bio *bp) { int n; GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count); n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK); sc->sc_current_count += n; n = g_journal_optimize(sc->sc_current_queue); sc->sc_current_count += n; /* * For requests which are added to the current queue we deliver * response immediately. */ bp->bio_completed = bp->bio_length; g_io_deliver(bp, 0); if (sc->sc_current_count >= g_journal_record_entries) { /* * Let's flush one record onto active journal provider. */ g_journal_flush(sc); } } static void g_journal_release_delayed(struct g_journal_softc *sc) { struct bio *bp; for (;;) { /* The flush queue is full, exit. 
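 *
 * With the default kern.geom.journal.accept_immediately of 64 this
 * works as follows: writes are acknowledged right away until 64 flush
 * bios are pending, later ones queue up in sc_delayed_queue, and each
 * call here moves delayed bios back as the flush queue drains below
 * that limit.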
*/ if (sc->sc_flush_count >= g_journal_accept_immediately) return; bp = bioq_takefirst(&sc->sc_delayed_queue); if (bp == NULL) return; sc->sc_delayed_count--; g_journal_add_current(sc, bp); } } /* * Add I/O request to the current queue. If we have enough requests for one * journal record we flush them onto active journal provider. */ static void g_journal_add_request(struct g_journal_softc *sc, struct bio *bp) { /* * The flush queue is full, we need to delay the request. */ if (sc->sc_delayed_count > 0 || sc->sc_flush_count >= g_journal_accept_immediately) { GJ_LOGREQ(4, bp, "DELAYED"); bioq_insert_tail(&sc->sc_delayed_queue, bp); sc->sc_delayed_count++; return; } KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue), ("DELAYED queue not empty.")); g_journal_add_current(sc, bp); } static void g_journal_read_done(struct bio *bp); /* * Try to find requested data in cache. */ static struct bio * g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart, off_t oend) { off_t cstart, cend; struct bio *bp; GJQ_FOREACH(head, bp) { if (bp->bio_offset == -1) continue; cstart = MAX(ostart, bp->bio_offset); cend = MIN(oend, bp->bio_offset + bp->bio_length); if (cend <= ostart) continue; else if (cstart >= oend) { if (!sorted) continue; else { bp = NULL; break; } } if (bp->bio_data == NULL) break; GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend, bp); bcopy(bp->bio_data + cstart - bp->bio_offset, pbp->bio_data + cstart - pbp->bio_offset, cend - cstart); pbp->bio_completed += cend - cstart; if (pbp->bio_completed == pbp->bio_length) { /* * Cool, the whole request was in cache, deliver happy * message. */ g_io_deliver(pbp, 0); return (pbp); } break; } return (bp); } /* * This function is used for collecting data on read. * The complexity is because parts of the data can be stored in four different * places: * - in memory - the data not yet send to the active journal provider * - in the active journal * - in the inactive journal * - in the data provider */ static void g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart, off_t oend) { struct bio *bp, *nbp, *head; off_t cstart, cend; u_int i, sorted = 0; GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend); cstart = cend = -1; bp = NULL; head = NULL; for (i = 1; i <= 5; i++) { switch (i) { case 1: /* Not-yet-send data. */ head = sc->sc_current_queue; sorted = 1; break; case 2: /* Skip flush queue as they are also in active queue */ continue; case 3: /* Active journal. */ head = sc->sc_active.jj_queue; sorted = 1; break; case 4: /* Inactive journal. */ /* * XXX: Here could be a race with g_journal_lowmem(). */ head = sc->sc_inactive.jj_queue; sorted = 1; break; case 5: /* In-flight to the data provider. */ head = sc->sc_copy_queue; sorted = 0; break; default: panic("gjournal %s: i=%d", __func__, i); } bp = g_journal_read_find(head, sorted, pbp, ostart, oend); if (bp == pbp) { /* Got the whole request. 
*/ GJ_DEBUG(2, "Got the whole request from %u.", i); return; } else if (bp != NULL) { cstart = MAX(ostart, bp->bio_offset); cend = MIN(oend, bp->bio_offset + bp->bio_length); GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).", i, (intmax_t)cstart, (intmax_t)cend); break; } } if (bp != NULL) { if (bp->bio_data == NULL) { nbp = g_duplicate_bio(pbp); nbp->bio_cflags = GJ_BIO_READ; nbp->bio_data = pbp->bio_data + cstart - pbp->bio_offset; nbp->bio_offset = bp->bio_joffset + cstart - bp->bio_offset; nbp->bio_length = cend - cstart; nbp->bio_done = g_journal_read_done; g_io_request(nbp, sc->sc_jconsumer); } /* * If we don't have the whole request yet, call g_journal_read() * recursively. */ if (ostart < cstart) g_journal_read(sc, pbp, ostart, cstart); if (oend > cend) g_journal_read(sc, pbp, cend, oend); } else { /* * No data in memory, no data in journal. * Its time for asking data provider. */ GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend); nbp = g_duplicate_bio(pbp); nbp->bio_cflags = GJ_BIO_READ; nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset; nbp->bio_offset = ostart; nbp->bio_length = oend - ostart; nbp->bio_done = g_journal_read_done; g_io_request(nbp, sc->sc_dconsumer); /* We have the whole request, return here. */ return; } } /* * Function responsible for handling finished READ requests. * Actually, g_std_done() could be used here, the only difference is that we * log error. */ static void g_journal_read_done(struct bio *bp) { struct bio *pbp; KASSERT(bp->bio_cflags == GJ_BIO_READ, ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ)); pbp = bp->bio_parent; pbp->bio_inbed++; pbp->bio_completed += bp->bio_length; if (bp->bio_error != 0) { if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; GJ_DEBUG(0, "Error while reading data from %s (error=%d).", bp->bio_to->name, bp->bio_error); } g_destroy_bio(bp); if (pbp->bio_children == pbp->bio_inbed && pbp->bio_completed == pbp->bio_length) { /* We're done. */ g_io_deliver(pbp, 0); } } /* * Deactive current journal and active next one. */ static void g_journal_switch(struct g_journal_softc *sc) { struct g_provider *pp; if (JEMPTY(sc)) { GJ_DEBUG(3, "No need for %s switch.", sc->sc_name); pp = LIST_FIRST(&sc->sc_geom->provider); if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) { sc->sc_flags |= GJF_DEVICE_CLEAN; GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); g_journal_metadata_update(sc); } } else { GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name); pp = sc->sc_jprovider; sc->sc_journal_previous_id = sc->sc_journal_id; sc->sc_journal_id = sc->sc_journal_next_id; sc->sc_journal_next_id = arc4random(); GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc); g_journal_write_header(sc); sc->sc_inactive.jj_offset = sc->sc_active.jj_offset; sc->sc_inactive.jj_queue = sc->sc_active.jj_queue; sc->sc_active.jj_offset = sc->sc_journal_offset - pp->sectorsize; sc->sc_active.jj_queue = NULL; /* * Switch is done, start copying data from the (now) inactive * journal to the data provider. 
		 */
		g_journal_copy_start(sc);
	}
	mtx_lock(&sc->sc_mtx);
	sc->sc_flags &= ~GJF_DEVICE_SWITCH;
	mtx_unlock(&sc->sc_mtx);
}

static void
g_journal_initialize(struct g_journal_softc *sc)
{
	sc->sc_journal_id = arc4random();
	sc->sc_journal_next_id = arc4random();
	sc->sc_journal_previous_id = sc->sc_journal_id;
	sc->sc_journal_offset = sc->sc_jstart;
	sc->sc_inactive.jj_offset = sc->sc_jstart;
	g_journal_write_header(sc);
	sc->sc_active.jj_offset = sc->sc_jstart;
}

static void
g_journal_mark_as_dirty(struct g_journal_softc *sc)
{
	const struct g_journal_desc *desc;
	int i;

	GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
	for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++)
		desc->jd_dirty(sc->sc_dconsumer);
}

/*
 * This function reads a record header from the given journal.
 * It is very similar to g_read_data(9), but it doesn't allocate memory for
 * a bio and data on every call.
 */
static int
g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
    void *data)
{
	int error;

	g_reset_bio(bp);
	bp->bio_cmd = BIO_READ;
	bp->bio_done = NULL;
	bp->bio_offset = offset;
	bp->bio_length = cp->provider->sectorsize;
	bp->bio_data = data;
	g_io_request(bp, cp);
	error = biowait(bp, "gjs_read");
	return (error);
}

#if 0
/*
 * This function is called when we start the journal device and we detect
 * that one of the journals was not fully copied.
 * The purpose of this function is to read all record headers from the
 * journal and place them in the inactive queue, so we can start the journal
 * synchronization process and the journal provider itself.
 * The design decision was to not synchronize the whole journal here, as it
 * can take too much time. Reading headers only and delaying the
 * synchronization process until after the journal provider is started
 * should be the best choice.
 */
#endif

static void
g_journal_sync(struct g_journal_softc *sc)
{
	struct g_journal_record_header rhdr;
	struct g_journal_entry *ent;
	struct g_journal_header jhdr;
	struct g_consumer *cp;
	struct bio *bp, *fbp, *tbp;
	off_t joffset, offset;
	u_char *buf, sum[16];
	uint64_t id;
	MD5_CTX ctx;
	int error, found, i;

	found = 0;
	fbp = NULL;
	cp = sc->sc_jconsumer;
	bp = g_alloc_bio();
	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
	offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;

	GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);

	/*
	 * Read and decode first journal header.
	 */
	error = g_journal_sync_read(cp, bp, offset, buf);
	if (error != 0) {
		GJ_DEBUG(0, "Error while reading journal header from %s.",
		    cp->provider->name);
		goto end;
	}
	error = g_journal_header_decode(buf, &jhdr);
	if (error != 0) {
		GJ_DEBUG(0, "Cannot decode journal header from %s.",
		    cp->provider->name);
		goto end;
	}
	id = sc->sc_journal_id;
	if (jhdr.jh_journal_id != sc->sc_journal_id) {
		GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
		    (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
		goto end;
	}
	offset += cp->provider->sectorsize;
	id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;

	for (;;) {
		/*
		 * If the biggest record won't fit, look for a record header
		 * or journal header from the beginning.
		 */
		GJ_VALIDATE_OFFSET(offset, sc);
		error = g_journal_sync_read(cp, bp, offset, buf);
		if (error != 0) {
			/*
			 * Not good. An error while reading a header means
			 * that we cannot read the next headers and in
			 * consequence we cannot find the termination.
*/ GJ_DEBUG(0, "Error while reading record header from %s.", cp->provider->name); break; } error = g_journal_record_header_decode(buf, &rhdr); if (error != 0) { GJ_DEBUG(2, "Not a record header at %jd (error=%d).", (intmax_t)offset, error); /* * This is not a record header. * If we are lucky, this is next journal header. */ error = g_journal_header_decode(buf, &jhdr); if (error != 0) { GJ_DEBUG(1, "Not a journal header at %jd (error=%d).", (intmax_t)offset, error); /* * Nope, this is not journal header, which * bascially means that journal is not * terminated properly. */ error = ENOENT; break; } /* * Ok. This is header of _some_ journal. Now we need to * verify if this is header of the _next_ journal. */ if (jhdr.jh_journal_id != id) { GJ_DEBUG(1, "Journal ID mismatch at %jd " "(0x%08x != 0x%08x).", (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id); error = ENOENT; break; } /* Found termination. */ found++; GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).", (intmax_t)offset, (u_int)id); sc->sc_active.jj_offset = offset; sc->sc_journal_offset = offset + cp->provider->sectorsize; sc->sc_journal_id = id; id = sc->sc_journal_next_id = jhdr.jh_journal_next_id; while ((tbp = fbp) != NULL) { fbp = tbp->bio_next; GJ_LOGREQ(3, tbp, "Adding request."); g_journal_insert_bio(&sc->sc_inactive.jj_queue, tbp, M_WAITOK); } /* Skip journal's header. */ offset += cp->provider->sectorsize; continue; } /* Skip record's header. */ offset += cp->provider->sectorsize; /* * Add information about every record entry to the inactive * queue. */ if (sc->sc_flags & GJF_DEVICE_CHECKSUM) MD5Init(&ctx); for (i = 0; i < rhdr.jrh_nentries; i++) { ent = &rhdr.jrh_entries[i]; GJ_DEBUG(3, "Insert entry: %jd %jd.", (intmax_t)ent->je_offset, (intmax_t)ent->je_length); g_journal_insert(&fbp, ent->je_offset, ent->je_offset + ent->je_length, ent->je_joffset, NULL, M_WAITOK); if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { u_char *buf2; /* * TODO: Should use faster function (like * g_journal_sync_read()). */ buf2 = g_read_data(cp, offset, ent->je_length, NULL); if (buf2 == NULL) GJ_DEBUG(0, "Cannot read data at %jd.", (intmax_t)offset); else { MD5Update(&ctx, buf2, ent->je_length); g_free(buf2); } } /* Skip entry's data. */ offset += ent->je_length; } if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { MD5Final(sum, &ctx); if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) { GJ_DEBUG(0, "MD5 hash mismatch at %jd!", (intmax_t)offset); } } } end: gj_free(bp->bio_data, cp->provider->sectorsize); g_destroy_bio(bp); /* Remove bios from unterminated journal. */ while ((tbp = fbp) != NULL) { fbp = tbp->bio_next; g_destroy_bio(tbp); } if (found < 1 && joffset > 0) { GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.", sc->sc_name); while ((tbp = sc->sc_inactive.jj_queue) != NULL) { sc->sc_inactive.jj_queue = tbp->bio_next; g_destroy_bio(tbp); } g_journal_initialize(sc); g_journal_mark_as_dirty(sc); } else { GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name); g_journal_copy_start(sc); } } /* * Wait for requests. * If we have requests in the current queue, flush them after 3 seconds from the * last flush. In this way we don't wait forever (or for journal switch) with * storing not full records on journal. */ static void g_journal_wait(struct g_journal_softc *sc, time_t last_write) { int error, timeout; GJ_DEBUG(3, "%s: enter", __func__); if (sc->sc_current_count == 0) { if (g_journal_debug < 2) msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0); else { /* * If we have debug turned on, show number of elements * in various queues. 
*/ for (;;) { error = msleep(sc, &sc->sc_mtx, PRIBIO, "gj:work", hz * 3); if (error == 0) { mtx_unlock(&sc->sc_mtx); break; } GJ_DEBUG(3, "Report: current count=%d", sc->sc_current_count); GJ_DEBUG(3, "Report: flush count=%d", sc->sc_flush_count); GJ_DEBUG(3, "Report: flush in progress=%d", sc->sc_flush_in_progress); GJ_DEBUG(3, "Report: copy in progress=%d", sc->sc_copy_in_progress); GJ_DEBUG(3, "Report: delayed=%d", sc->sc_delayed_count); } } GJ_DEBUG(3, "%s: exit 1", __func__); return; } /* * Flush records every 3 seconds, even if they are not full. */ timeout = (last_write + 3 - time_second) * hz; if (timeout <= 0) { mtx_unlock(&sc->sc_mtx); g_journal_flush(sc); g_journal_flush_send(sc); GJ_DEBUG(3, "%s: exit 2", __func__); return; } error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout); if (error == EWOULDBLOCK) g_journal_flush_send(sc); GJ_DEBUG(3, "%s: exit 3", __func__); } /* * Worker thread. */ static void g_journal_worker(void *arg) { struct g_journal_softc *sc; struct g_geom *gp; struct g_provider *pp; struct bio *bp; time_t last_write; int type; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sc = arg; type = 0; /* gcc */ if (sc->sc_flags & GJF_DEVICE_CLEAN) { GJ_DEBUG(0, "Journal %s clean.", sc->sc_name); g_journal_initialize(sc); } else { g_journal_sync(sc); } /* * Check if we can use BIO_FLUSH. */ sc->sc_bio_flush = 0; if (g_io_flush(sc->sc_jconsumer) == 0) { sc->sc_bio_flush |= GJ_FLUSH_JOURNAL; GJ_DEBUG(1, "BIO_FLUSH supported by %s.", sc->sc_jconsumer->provider->name); } else { GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", sc->sc_jconsumer->provider->name); } if (sc->sc_jconsumer != sc->sc_dconsumer) { if (g_io_flush(sc->sc_dconsumer) == 0) { sc->sc_bio_flush |= GJ_FLUSH_DATA; GJ_DEBUG(1, "BIO_FLUSH supported by %s.", sc->sc_dconsumer->provider->name); } else { GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", sc->sc_dconsumer->provider->name); } } gp = sc->sc_geom; g_topology_lock(); pp = g_new_providerf(gp, "%s.journal", sc->sc_name); pp->mediasize = sc->sc_mediasize; /* * There could be a problem when the data and journal providers * have different sector sizes, but such a scenario is prevented * at journal creation time. */ pp->sectorsize = sc->sc_sectorsize; g_error_provider(pp, 0); g_topology_unlock(); last_write = time_second; if (sc->sc_rootmount != NULL) { GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } for (;;) { /* Get first request from the queue. */ mtx_lock(&sc->sc_mtx); bp = bioq_first(&sc->sc_back_queue); if (bp != NULL) type = (bp->bio_cflags & GJ_BIO_MASK); if (bp == NULL) { bp = bioq_first(&sc->sc_regular_queue); if (bp != NULL) type = GJ_BIO_REGULAR; } if (bp == NULL) { try_switch: if ((sc->sc_flags & GJF_DEVICE_SWITCH) || (sc->sc_flags & GJF_DEVICE_DESTROY)) { if (sc->sc_current_count > 0) { mtx_unlock(&sc->sc_mtx); g_journal_flush(sc); g_journal_flush_send(sc); continue; } if (sc->sc_flush_in_progress > 0) goto sleep; if (sc->sc_copy_in_progress > 0) goto sleep; } if (sc->sc_flags & GJF_DEVICE_SWITCH) { mtx_unlock(&sc->sc_mtx); g_journal_switch(sc); wakeup(&sc->sc_journal_copying); continue; } if (sc->sc_flags & GJF_DEVICE_DESTROY) { GJ_DEBUG(1, "Shutting down worker " "thread for %s.", gp->name); sc->sc_worker = NULL; wakeup(&sc->sc_worker); mtx_unlock(&sc->sc_mtx); kproc_exit(0); } sleep: g_journal_wait(sc, last_write); continue; } /* * If we're in the middle of a journal switch, we need to delay * all new write requests until it is done.
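 *
 * [Editorial note, not part of the original change: the request is
 * deliberately not taken off its queue in that case; jumping back to
 * try_switch lets the worker finish the switch first, after which
 * the normal dispatch below picks the request up again.]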
*/ if ((sc->sc_flags & GJF_DEVICE_SWITCH) && type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) { GJ_LOGREQ(2, bp, "WRITE on SWITCH"); goto try_switch; } if (type == GJ_BIO_REGULAR) bioq_remove(&sc->sc_regular_queue, bp); else bioq_remove(&sc->sc_back_queue, bp); mtx_unlock(&sc->sc_mtx); switch (type) { case GJ_BIO_REGULAR: /* Regular request. */ switch (bp->bio_cmd) { case BIO_READ: g_journal_read(sc, bp, bp->bio_offset, bp->bio_offset + bp->bio_length); break; case BIO_WRITE: last_write = time_second; g_journal_add_request(sc, bp); g_journal_flush_send(sc); break; default: panic("Invalid bio_cmd (%d).", bp->bio_cmd); } break; case GJ_BIO_COPY: switch (bp->bio_cmd) { case BIO_READ: if (g_journal_copy_read_done(bp)) g_journal_copy_send(sc); break; case BIO_WRITE: g_journal_copy_write_done(bp); g_journal_copy_send(sc); break; default: panic("Invalid bio_cmd (%d).", bp->bio_cmd); } break; case GJ_BIO_JOURNAL: g_journal_flush_done(bp); g_journal_flush_send(sc); break; case GJ_BIO_READ: default: panic("Invalid bio (%d).", type); } } } static void g_journal_destroy_event(void *arg, int flags __unused) { struct g_journal_softc *sc; g_topology_assert(); sc = arg; g_journal_destroy(sc); } static void g_journal_timeout(void *arg) { struct g_journal_softc *sc; sc = arg; GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.", sc->sc_geom->name); g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL); } static struct g_geom * g_journal_create(struct g_class *mp, struct g_provider *pp, const struct g_journal_metadata *md) { struct g_journal_softc *sc; struct g_geom *gp; struct g_consumer *cp; int error; sc = NULL; /* gcc */ g_topology_assert(); /* * There are two possibilities: * 1. Data and both journals are on the same provider. * 2. Data and journals are all on separate providers. */ /* Look for journal device with the same ID. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_id == md->md_id) break; } if (gp == NULL) sc = NULL; else if (sc != NULL && (sc->sc_type & md->md_type) != 0) { GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id); return (NULL); } if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) { GJ_DEBUG(0, "Invalid type on %s.", pp->name); return (NULL); } if (md->md_type & GJ_TYPE_DATA) { GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id, pp->name); } if (md->md_type & GJ_TYPE_JOURNAL) { GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id, pp->name); } if (sc == NULL) { /* Action geom. */ sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO); sc->sc_id = md->md_id; sc->sc_type = 0; sc->sc_flags = 0; sc->sc_worker = NULL; gp = g_new_geomf(mp, "gjournal %u", sc->sc_id); gp->start = g_journal_start; gp->orphan = g_journal_orphan; gp->access = g_journal_access; gp->softc = sc; gp->flags |= G_GEOM_VOLATILE_BIO; sc->sc_geom = gp; mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF); bioq_init(&sc->sc_back_queue); bioq_init(&sc->sc_regular_queue); bioq_init(&sc->sc_delayed_queue); sc->sc_delayed_count = 0; sc->sc_current_queue = NULL; sc->sc_current_count = 0; sc->sc_flush_queue = NULL; sc->sc_flush_count = 0; sc->sc_flush_in_progress = 0; sc->sc_copy_queue = NULL; sc->sc_copy_in_progress = 0; sc->sc_inactive.jj_queue = NULL; sc->sc_active.jj_queue = NULL; sc->sc_rootmount = root_mount_hold("GJOURNAL"); GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); callout_init(&sc->sc_callout, 1); if (md->md_type != GJ_TYPE_COMPLETE) { /* * Journal and data are on separate providers.
* At this point we have only one of them. * We set up a timeout in case the other part does not * appear, so we won't wait forever. */ callout_reset(&sc->sc_callout, 5 * hz, g_journal_timeout, sc); } } /* Remember type of the data provider. */ if (md->md_type & GJ_TYPE_DATA) sc->sc_orig_type = md->md_type; sc->sc_type |= md->md_type; cp = NULL; if (md->md_type & GJ_TYPE_DATA) { if (md->md_flags & GJ_FLAG_CLEAN) sc->sc_flags |= GJF_DEVICE_CLEAN; if (md->md_flags & GJ_FLAG_CHECKSUM) sc->sc_flags |= GJF_DEVICE_CHECKSUM; cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", pp->name, error)); error = g_access(cp, 1, 1, 1); if (error != 0) { GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name, error); g_journal_destroy(sc); return (NULL); } sc->sc_dconsumer = cp; sc->sc_mediasize = pp->mediasize - pp->sectorsize; sc->sc_sectorsize = pp->sectorsize; sc->sc_jstart = md->md_jstart; sc->sc_jend = md->md_jend; if (md->md_provider[0] != '\0') sc->sc_flags |= GJF_DEVICE_HARDCODED; sc->sc_journal_offset = md->md_joffset; sc->sc_journal_id = md->md_jid; sc->sc_journal_previous_id = md->md_jid; } if (md->md_type & GJ_TYPE_JOURNAL) { if (cp == NULL) { cp = g_new_consumer(gp); error = g_attach(cp, pp); KASSERT(error == 0, ("Cannot attach to %s (error=%d).", pp->name, error)); error = g_access(cp, 1, 1, 1); if (error != 0) { GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name, error); g_journal_destroy(sc); return (NULL); } } else { /* * Journal is on the same provider as data, which means * that data provider ends where journal starts. */ sc->sc_mediasize = md->md_jstart; } sc->sc_jconsumer = cp; } /* Start switcher kproc if needed. */ if (g_journal_switcher_proc == NULL) g_journal_start_switcher(mp); if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) { /* Journal is not complete yet. */ return (gp); } else { /* Journal complete, cancel timeout. */ callout_drain(&sc->sc_callout); } error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0, "g_journal %s", sc->sc_name); if (error != 0) { GJ_DEBUG(0, "Cannot create worker thread for %s.journal.", sc->sc_name); g_journal_destroy(sc); return (NULL); } return (gp); } static void g_journal_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; g_detach(cp); g_destroy_consumer(cp); } static int g_journal_destroy(struct g_journal_softc *sc) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); if (sc == NULL) return (ENXIO); gp = sc->sc_geom; pp = LIST_FIRST(&gp->provider); if (pp != NULL) { if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } g_error_provider(pp, ENXIO); g_journal_flush(sc); g_journal_flush_send(sc); g_journal_switch(sc); } sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN); g_topology_unlock(); if (sc->sc_rootmount != NULL) { GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount); root_mount_rel(sc->sc_rootmount); sc->sc_rootmount = NULL; } callout_drain(&sc->sc_callout); mtx_lock(&sc->sc_mtx); wakeup(sc); while (sc->sc_worker != NULL) msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0); mtx_unlock(&sc->sc_mtx); if (pp != NULL) { GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); g_journal_metadata_update(sc); g_topology_lock(); g_wither_provider(pp, ENXIO); } else { g_topology_lock(); } mtx_destroy(&sc->sc_mtx); if (sc->sc_current_count != 0) { GJ_DEBUG(0, "Warning!
Number of current requests %d.", sc->sc_current_count); } gp->softc = NULL; LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->acr + cp->acw + cp->ace > 0) g_access(cp, -1, -1, -1); /* * We keep all consumers open for writing, so if I'll detach * and destroy consumer here, I'll get providers for taste, so * journal will be started again. * Sending an event here, prevents this from happening. */ g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL); } g_wither_geom(gp, ENXIO); free(sc, M_JOURNAL); return (0); } static void g_journal_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_journal_metadata md; struct g_consumer *cp; struct g_geom *gp; int error; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); GJ_DEBUG(2, "Tasting %s.", pp->name); if (pp->geom->class == mp) return (NULL); gp = g_new_geomf(mp, "journal:taste"); /* This orphan function should be never called. */ gp->orphan = g_journal_taste_orphan; cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error == 0) { error = g_journal_metadata_read(cp, &md); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) return (NULL); if (g_journal_debug >= 2) journal_metadata_dump(&md); gp = g_journal_create(mp, pp, &md); return (gp); } static struct g_journal_softc * g_journal_find_device(struct g_class *mp, const char *name) { struct g_journal_softc *sc; struct g_geom *gp; struct g_provider *pp; if (strncmp(name, _PATH_DEV, 5) == 0) name += 5; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_flags & GJF_DEVICE_DESTROY) continue; if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) continue; pp = LIST_FIRST(&gp->provider); if (strcmp(sc->sc_name, name) == 0) return (sc); if (pp != NULL && strcmp(pp->name, name) == 0) return (sc); } return (NULL); } static void g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_journal_softc *sc; const char *name; char param[16]; int *nargs; int error, i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } for (i = 0; i < *nargs; i++) { snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument.", i); return; } sc = g_journal_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_journal_destroy(sc); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", LIST_FIRST(&sc->sc_geom->provider)->name, error); return; } } } static void g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused) { g_topology_assert(); g_topology_unlock(); g_journal_sync_requested++; wakeup(&g_journal_switcher_state); while (g_journal_sync_requested > 0) tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2); g_topology_lock(); } static void g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", 
sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_JOURNAL_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_journal_ctl_destroy(req, mp); return; } else if (strcmp(verb, "sync") == 0) { g_journal_ctl_sync(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_journal_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. */ } else if (cp != NULL) { int first = 1; sbuf_printf(sb, "%s", indent); if (cp == sc->sc_dconsumer) { sbuf_cat(sb, "Data"); first = 0; } if (cp == sc->sc_jconsumer) { if (!first) sbuf_cat(sb, ","); sbuf_cat(sb, "Journal"); } sbuf_cat(sb, "\n"); if (cp == sc->sc_jconsumer) { sbuf_printf(sb, "%jd\n", (intmax_t)sc->sc_jstart); sbuf_printf(sb, "%jd\n", (intmax_t)sc->sc_jend); } } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); } } static eventhandler_tag g_journal_event_shutdown = NULL; static eventhandler_tag g_journal_event_lowmem = NULL; static void g_journal_shutdown(void *arg, int howto __unused) { struct g_class *mp; struct g_geom *gp, *gp2; if (KERNEL_PANICKED()) return; mp = arg; g_topology_lock(); LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if (gp->softc == NULL) continue; GJ_DEBUG(0, "Shutting down geom %s.", gp->name); g_journal_destroy(gp->softc); } g_topology_unlock(); } /* * Free cached requests from inactive queue in case of low memory. * We free GJ_FREE_AT_ONCE elements at once. */ #define GJ_FREE_AT_ONCE 4 static void g_journal_lowmem(void *arg, int howto __unused) { struct g_journal_softc *sc; struct g_class *mp; struct g_geom *gp; struct bio *bp; u_int nfree = GJ_FREE_AT_ONCE; g_journal_stats_low_mem++; mp = arg; g_topology_lock(); LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) continue; mtx_lock(&sc->sc_mtx); for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL; nfree--, bp = bp->bio_next) { /* * This is safe to free the bio_data, because: * 1. If bio_data is NULL it will be read from the * inactive journal. * 2. If bp is sent down, it is first removed from the * inactive queue, so it's impossible to free the * data from under in-flight bio. * On the other hand, freeing elements from the active * queue, is not safe. */ if (bp->bio_data != NULL) { GJ_DEBUG(2, "Freeing data from %s.", sc->sc_name); gj_free(bp->bio_data, bp->bio_length); bp->bio_data = NULL; } } mtx_unlock(&sc->sc_mtx); if (nfree == 0) break; } g_topology_unlock(); } static void g_journal_switcher(void *arg); static void g_journal_init(struct g_class *mp) { /* Pick a conservative value if provided value sucks. */ if (g_journal_cache_divisor <= 0 || (vm_kmem_size / g_journal_cache_divisor == 0)) { g_journal_cache_divisor = 5; } if (g_journal_cache_limit > 0) { g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor; g_journal_cache_low = (g_journal_cache_limit / 100) * g_journal_cache_switch; } g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync, g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST); if (g_journal_event_shutdown == NULL) GJ_DEBUG(0, "Warning! 
Cannot register shutdown event."); g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST); if (g_journal_event_lowmem == NULL) GJ_DEBUG(0, "Warning! Cannot register lowmem event."); } static void g_journal_fini(struct g_class *mp) { if (g_journal_event_shutdown != NULL) { EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_journal_event_shutdown); } if (g_journal_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem); if (g_journal_switcher_proc != NULL) g_journal_stop_switcher(); } DECLARE_GEOM_CLASS(g_journal_class, g_journal); static const struct g_journal_desc * g_journal_find_desc(const char *fstype) { const struct g_journal_desc *desc; int i; for (desc = g_journal_filesystems[i = 0]; desc != NULL; desc = g_journal_filesystems[++i]) { if (strcmp(desc->jd_fstype, fstype) == 0) break; } return (desc); } static void g_journal_switch_wait(struct g_journal_softc *sc) { struct bintime bt; mtx_assert(&sc->sc_mtx, MA_OWNED); if (g_journal_debug >= 2) { if (sc->sc_flush_in_progress > 0) { GJ_DEBUG(2, "%d requests flushing.", sc->sc_flush_in_progress); } if (sc->sc_copy_in_progress > 0) { GJ_DEBUG(2, "%d requests copying.", sc->sc_copy_in_progress); } if (sc->sc_flush_count > 0) { GJ_DEBUG(2, "%d requests to flush.", sc->sc_flush_count); } if (sc->sc_delayed_count > 0) { GJ_DEBUG(2, "%d requests delayed.", sc->sc_delayed_count); } } g_journal_stats_switches++; if (sc->sc_copy_in_progress > 0) g_journal_stats_wait_for_copy++; GJ_TIMER_START(1, &bt); sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH; sc->sc_flags |= GJF_DEVICE_SWITCH; wakeup(sc); while (sc->sc_flags & GJF_DEVICE_SWITCH) { msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO, "gj:switch", 0); } GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name); } static void g_journal_do_switch(struct g_class *classp) { struct g_journal_softc *sc; const struct g_journal_desc *desc; struct g_geom *gp; struct mount *mp; struct bintime bt; char *mountpoint; int error, save; g_topology_lock(); LIST_FOREACH(gp, &classp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_flags & GJF_DEVICE_DESTROY) continue; if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) continue; mtx_lock(&sc->sc_mtx); sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH; mtx_unlock(&sc->sc_mtx); } g_topology_unlock(); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_gjprovider == NULL) continue; if (mp->mnt_flag & MNT_RDONLY) continue; desc = g_journal_find_desc(mp->mnt_stat.f_fstypename); if (desc == NULL) continue; if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; /* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */ g_topology_lock(); sc = g_journal_find_device(classp, mp->mnt_gjprovider); g_topology_unlock(); if (sc == NULL) { GJ_DEBUG(0, "Cannot find journal geom for %s.", mp->mnt_gjprovider); goto next; } else if (JEMPTY(sc)) { mtx_lock(&sc->sc_mtx); sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH; mtx_unlock(&sc->sc_mtx); GJ_DEBUG(3, "No need for %s switch.", sc->sc_name); goto next; } mountpoint = mp->mnt_stat.f_mntonname; error = vn_start_write(NULL, &mp, V_WAIT); if (error != 0) { GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).", mountpoint, error); goto next; } save = curthread_pflags_set(TDP_SYNCIO); GJ_TIMER_START(1, &bt); vfs_periodic(mp, MNT_NOWAIT); GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint); GJ_TIMER_START(1, &bt); error = VFS_SYNC(mp, MNT_NOWAIT); if (error == 0) GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint); else { GJ_DEBUG(0, 
"Cannot sync file system %s (error=%d).", mountpoint, error); } curthread_pflags_restore(save); vn_finished_write(mp); if (error != 0) goto next; /* * Send BIO_FLUSH before freezing the file system, so it can be * faster after the freeze. */ GJ_TIMER_START(1, &bt); g_journal_flush_cache(sc); GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name); GJ_TIMER_START(1, &bt); error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT); GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint); if (error != 0) { GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).", mountpoint, error); goto next; } error = desc->jd_clean(mp); if (error != 0) goto next; mtx_lock(&sc->sc_mtx); g_journal_switch_wait(sc); mtx_unlock(&sc->sc_mtx); vfs_write_resume(mp, 0); next: mtx_lock(&mountlist_mtx); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); sc = NULL; for (;;) { g_topology_lock(); LIST_FOREACH(gp, &g_journal_class.geom, geom) { sc = gp->softc; if (sc == NULL) continue; mtx_lock(&sc->sc_mtx); if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE && !(sc->sc_flags & GJF_DEVICE_DESTROY) && (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) { break; } mtx_unlock(&sc->sc_mtx); sc = NULL; } g_topology_unlock(); if (sc == NULL) break; mtx_assert(&sc->sc_mtx, MA_OWNED); g_journal_switch_wait(sc); mtx_unlock(&sc->sc_mtx); } } static void g_journal_start_switcher(struct g_class *mp) { - int error; + int error __diagused; g_topology_assert(); MPASS(g_journal_switcher_proc == NULL); g_journal_switcher_state = GJ_SWITCHER_WORKING; error = kproc_create(g_journal_switcher, mp, &g_journal_switcher_proc, 0, 0, "g_journal switcher"); KASSERT(error == 0, ("Cannot create switcher thread.")); } static void g_journal_stop_switcher(void) { g_topology_assert(); MPASS(g_journal_switcher_proc != NULL); g_journal_switcher_state = GJ_SWITCHER_DIE; wakeup(&g_journal_switcher_state); while (g_journal_switcher_state != GJ_SWITCHER_DIED) tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5); GJ_DEBUG(1, "Switcher died."); g_journal_switcher_proc = NULL; } /* * TODO: Kill switcher thread on last geom destruction? */ static void g_journal_switcher(void *arg) { struct g_class *mp; struct bintime bt; int error; mp = arg; curthread->td_pflags |= TDP_NORUNNINGBUF; for (;;) { g_journal_switcher_wokenup = 0; error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait", g_journal_switch_time * hz); if (g_journal_switcher_state == GJ_SWITCHER_DIE) { g_journal_switcher_state = GJ_SWITCHER_DIED; GJ_DEBUG(1, "Switcher exiting."); wakeup(&g_journal_switcher_state); kproc_exit(0); } if (error == 0 && g_journal_sync_requested == 0) { GJ_DEBUG(1, "Out of cache, force switch (used=%jd " "limit=%jd).", (intmax_t)g_journal_cache_used, (intmax_t)g_journal_cache_limit); } GJ_TIMER_START(1, &bt); g_journal_do_switch(mp); GJ_TIMER_STOP(1, &bt, "Entire switch time"); if (g_journal_sync_requested > 0) { g_journal_sync_requested = 0; wakeup(&g_journal_sync_requested); } } } diff --git a/sys/geom/shsec/g_shsec.c b/sys/geom/shsec/g_shsec.c index a3b2f59d0555..65bfbc6681dd 100644 --- a/sys/geom/shsec/g_shsec.c +++ b/sys/geom/shsec/g_shsec.c @@ -1,840 +1,838 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_shsec, "GEOM shared secret device support"); static MALLOC_DEFINE(M_SHSEC, "shsec_data", "GEOM_SHSEC Data"); static uma_zone_t g_shsec_zone; static int g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force); static int g_shsec_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_shsec_taste; static g_ctl_req_t g_shsec_config; static g_dumpconf_t g_shsec_dumpconf; static g_init_t g_shsec_init; static g_fini_t g_shsec_fini; struct g_class g_shsec_class = { .name = G_SHSEC_CLASS_NAME, .version = G_VERSION, .ctlreq = g_shsec_config, .taste = g_shsec_taste, .destroy_geom = g_shsec_destroy_geom, .init = g_shsec_init, .fini = g_shsec_fini }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, shsec, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_SHSEC stuff"); static u_int g_shsec_debug; SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, debug, CTLFLAG_RWTUN, &g_shsec_debug, 0, "Debug level"); static u_long g_shsec_maxmem; SYSCTL_ULONG(_kern_geom_shsec, OID_AUTO, maxmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &g_shsec_maxmem, 0, "Maximum memory that can be allocated for I/O (in bytes)"); static u_int g_shsec_alloc_failed = 0; SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, alloc_failed, CTLFLAG_RD, &g_shsec_alloc_failed, 0, "How many times I/O allocation failed"); /* * Greatest Common Divisor. */ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } static void g_shsec_init(struct g_class *mp __unused) { g_shsec_maxmem = maxphys * 100; TUNABLE_ULONG_FETCH("kern.geom.shsec.maxmem", &g_shsec_maxmem); g_shsec_zone = uma_zcreate("g_shsec_zone", maxphys, NULL, NULL, NULL, NULL, 0, 0); g_shsec_maxmem -= g_shsec_maxmem % maxphys; uma_zone_set_max(g_shsec_zone, g_shsec_maxmem / maxphys); } static void g_shsec_fini(struct g_class *mp __unused) { uma_zdestroy(g_shsec_zone); } /* * Return the number of valid disks.
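 *
 * [Editorial note, not part of the original change: a worked example
 * for the gcd()/lcm() helpers above, which g_shsec_check_and_run()
 * uses to combine the component sector sizes:
 *
 *	lcm(512, 512)  = 512
 *	lcm(512, 4096) = (512 * 4096) / gcd(512, 4096) = 4096
 *
 * i.e. the provider advertises the smallest sector size that is a
 * multiple of every component's sector size.]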
*/ static u_int g_shsec_nvalid(struct g_shsec_softc *sc) { u_int i, no; no = 0; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i] != NULL) no++; } return (no); } static void g_shsec_remove_disk(struct g_consumer *cp) { struct g_shsec_softc *sc; u_int no; KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__)); sc = (struct g_shsec_softc *)cp->private; KASSERT(sc != NULL, ("NULL sc in %s.", __func__)); no = cp->index; G_SHSEC_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, sc->sc_name); sc->sc_disks[no] = NULL; if (sc->sc_provider != NULL) { g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; G_SHSEC_DEBUG(0, "Device %s removed.", sc->sc_name); } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) return; g_detach(cp); g_destroy_consumer(cp); } static void g_shsec_orphan(struct g_consumer *cp) { struct g_shsec_softc *sc; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; g_shsec_remove_disk(cp); /* If there are no valid disks anymore, remove device. */ if (LIST_EMPTY(&gp->consumer)) g_shsec_destroy(sc, 1); } static int g_shsec_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *cp1, *cp2, *tmp; struct g_shsec_softc *sc; struct g_geom *gp; int error; gp = pp->geom; sc = gp->softc; /* On first open, grab an extra "exclusive" bit */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... and let go of it on last close */ if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0) de--; error = ENXIO; LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) { error = g_access(cp1, dr, dw, de); if (error != 0) goto fail; if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 && cp1->flags & G_CF_ORPHAN) { g_detach(cp1); g_destroy_consumer(cp1); } } /* If there are no valid disks anymore, remove device. */ if (LIST_EMPTY(&gp->consumer)) g_shsec_destroy(sc, 1); return (error); fail: /* If we fail here, backout all previous changes. 
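 *
 * [Editorial note, not part of the original change: cp1 is the
 * consumer whose g_access() call failed, so the loop below walks the
 * consumer list from the head, reverting the already-applied access
 * deltas, and stops as soon as it reaches cp1 itself.]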
*/ LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) break; g_access(cp2, -dr, -dw, -de); } return (error); } static void g_shsec_xor1(uint32_t *src, uint32_t *dst, ssize_t len) { for (; len > 0; len -= sizeof(uint32_t), dst++) *dst = *dst ^ *src++; KASSERT(len == 0, ("len != 0 (len=%zd)", len)); } static void g_shsec_done(struct bio *bp) { - struct g_shsec_softc *sc; struct bio *pbp; pbp = bp->bio_parent; - sc = pbp->bio_to->geom->softc; if (bp->bio_error == 0) G_SHSEC_LOGREQ(2, bp, "Request done."); else { G_SHSEC_LOGREQ(0, bp, "Request failed (error=%d).", bp->bio_error); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; } if (pbp->bio_cmd == BIO_READ) { if ((pbp->bio_pflags & G_SHSEC_BFLAG_FIRST) != 0) { bcopy(bp->bio_data, pbp->bio_data, pbp->bio_length); pbp->bio_pflags = 0; } else { g_shsec_xor1((uint32_t *)bp->bio_data, (uint32_t *)pbp->bio_data, (ssize_t)pbp->bio_length); } } explicit_bzero(bp->bio_data, bp->bio_length); uma_zfree(g_shsec_zone, bp->bio_data); g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { pbp->bio_completed = pbp->bio_length; g_io_deliver(pbp, pbp->bio_error); } } static void g_shsec_xor2(uint32_t *rand, uint32_t *dst, ssize_t len) { for (; len > 0; len -= sizeof(uint32_t), dst++) { *rand = arc4random(); *dst = *dst ^ *rand++; } KASSERT(len == 0, ("len != 0 (len=%zd)", len)); } static void g_shsec_start(struct bio *bp) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); struct g_shsec_softc *sc; struct bio *cbp; uint32_t *dst; ssize_t len; u_int no; int error; sc = bp->bio_to->geom->softc; /* * If sc == NULL, provider's error should be set and g_shsec_start() * should not be called at all. */ KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_SHSEC_LOGREQ(2, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_FLUSH: case BIO_SPEEDUP: /* * Only those requests are supported. */ break; case BIO_DELETE: case BIO_GETATTR: /* To which provider it should be delivered? */ default: g_io_deliver(bp, EOPNOTSUPP); return; } /* * Allocate all bios first and calculate XOR. */ dst = NULL; len = bp->bio_length; if (bp->bio_cmd == BIO_READ) bp->bio_pflags = G_SHSEC_BFLAG_FIRST; for (no = 0; no < sc->sc_ndisks; no++) { cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); /* * Fill in the component buf structure. */ cbp->bio_done = g_shsec_done; cbp->bio_data = uma_zalloc(g_shsec_zone, M_NOWAIT); if (cbp->bio_data == NULL) { g_shsec_alloc_failed++; error = ENOMEM; goto failure; } cbp->bio_caller2 = sc->sc_disks[no]; if (bp->bio_cmd == BIO_WRITE) { if (no == 0) { dst = (uint32_t *)cbp->bio_data; bcopy(bp->bio_data, dst, len); } else { g_shsec_xor2((uint32_t *)cbp->bio_data, dst, len); } } } /* * Fire off all allocated requests! 
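 *
 * [Editorial sketch, not part of the original change; a plain-C
 * illustration of the XOR sharing built by the loop above, assuming
 * three components:
 *
 *	share1 = random();
 *	share2 = random();
 *	share0 = plain ^ share1 ^ share2;	(write side)
 *	plain  = share0 ^ share1 ^ share2;	(read side)
 *
 * component 0 carries the data XORed with the random material of all
 * the other components, so any proper subset of the shares is
 * uniformly random; on BIO_READ, g_shsec_done() copies the first
 * completed buffer and XORs the remaining ones into it.]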
*/ while ((cbp = TAILQ_FIRST(&queue)) != NULL) { struct g_consumer *cp; TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; cbp->bio_to = cp->provider; G_SHSEC_LOGREQ(2, cbp, "Sending request."); g_io_request(cbp, cp); } return; failure: while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); bp->bio_children--; if (cbp->bio_data != NULL) { explicit_bzero(cbp->bio_data, cbp->bio_length); uma_zfree(g_shsec_zone, cbp->bio_data); } g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = error; g_io_deliver(bp, bp->bio_error); } static void g_shsec_check_and_run(struct g_shsec_softc *sc) { off_t mediasize, ms; u_int no, sectorsize = 0; if (g_shsec_nvalid(sc) != sc->sc_ndisks) return; sc->sc_provider = g_new_providerf(sc->sc_geom, "shsec/%s", sc->sc_name); /* * Find the smallest disk. */ mediasize = sc->sc_disks[0]->provider->mediasize; mediasize -= sc->sc_disks[0]->provider->sectorsize; sectorsize = sc->sc_disks[0]->provider->sectorsize; for (no = 1; no < sc->sc_ndisks; no++) { ms = sc->sc_disks[no]->provider->mediasize; ms -= sc->sc_disks[no]->provider->sectorsize; if (ms < mediasize) mediasize = ms; sectorsize = lcm(sectorsize, sc->sc_disks[no]->provider->sectorsize); } sc->sc_provider->sectorsize = sectorsize; sc->sc_provider->mediasize = mediasize; g_error_provider(sc->sc_provider, 0); G_SHSEC_DEBUG(0, "Device %s activated.", sc->sc_name); } static int g_shsec_read_metadata(struct g_consumer *cp, struct g_shsec_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ shsec_metadata_decode(buf, md); g_free(buf); return (0); } /* * Add disk to given device. */ static int g_shsec_add_disk(struct g_shsec_softc *sc, struct g_provider *pp, u_int no) { struct g_consumer *cp, *fcp; struct g_geom *gp; struct g_shsec_metadata md; int error; /* Metadata corrupted? */ if (no >= sc->sc_ndisks) return (EINVAL); /* Check if disk is not already attached. */ if (sc->sc_disks[no] != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } /* Reread metadata. */ error = g_shsec_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_SHSEC_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } cp->private = sc; cp->index = no; sc->sc_disks[no] = cp; G_SHSEC_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_shsec_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_shsec_create(struct g_class *mp, const struct g_shsec_metadata *md) { struct g_shsec_softc *sc; struct g_geom *gp; u_int no; G_SHSEC_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* Two disks is minimum. 
*/ if (md->md_all < 2) { G_SHSEC_DEBUG(0, "Too few disks defined for %s.", md->md_name); return (NULL); } /* Check for duplicate unit */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) { G_SHSEC_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_SHSEC, M_WAITOK | M_ZERO); gp->start = g_shsec_start; gp->spoiled = g_shsec_orphan; gp->orphan = g_shsec_orphan; gp->access = g_shsec_access; gp->dumpconf = g_shsec_dumpconf; sc->sc_id = md->md_id; sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks, M_SHSEC, M_WAITOK | M_ZERO); for (no = 0; no < sc->sc_ndisks; no++) sc->sc_disks[no] = NULL; gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; G_SHSEC_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); return (gp); } static int g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force) { struct g_provider *pp; struct g_geom *gp; u_int no; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_SHSEC_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_SHSEC_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } for (no = 0; no < sc->sc_ndisks; no++) { if (sc->sc_disks[no] != NULL) g_shsec_remove_disk(sc->sc_disks[no]); } gp = sc->sc_geom; gp->softc = NULL; KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)", gp->name)); free(sc->sc_disks, M_SHSEC); free(sc, M_SHSEC); pp = LIST_FIRST(&gp->provider); if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)) G_SHSEC_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); return (0); } static int g_shsec_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_shsec_softc *sc; sc = gp->softc; return (g_shsec_destroy(sc, 0)); } static struct g_geom * g_shsec_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_shsec_metadata md; struct g_shsec_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); G_SHSEC_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "shsec:taste"); gp->start = g_shsec_start; gp->access = g_shsec_access; gp->orphan = g_shsec_orphan; cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error == 0) { error = g_shsec_read_metadata(cp, &md); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0) return (NULL); if (md.md_version > G_SHSEC_VERSION) { G_SHSEC_DEBUG(0, "Kernel module is too old to handle %s.\n", pp->name); return (NULL); } /* * Backward compatibility: */ /* There was no md_provsize field in earlier versions of metadata. */ if (md.md_version < 1) md.md_provsize = pp->mediasize; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != pp->mediasize) return (NULL); /* * Let's check if device already exists. 
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) continue; break; } if (gp != NULL) { G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_shsec_add_disk(sc, pp, md.md_no); if (error != 0) { G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); return (NULL); } } else { gp = g_shsec_create(mp, &md); if (gp == NULL) { G_SHSEC_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_shsec_add_disk(sc, pp, md.md_no); if (error != 0) { G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); g_shsec_destroy(sc, 1); return (NULL); } } return (gp); } static struct g_shsec_softc * g_shsec_find_device(struct g_class *mp, const char *name) { struct g_shsec_softc *sc; struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(sc->sc_name, name) == 0) return (sc); } return (NULL); } static void g_shsec_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_shsec_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_shsec_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_shsec_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } static void g_shsec_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_SHSEC_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "stop") == 0) { g_shsec_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_shsec_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_shsec_softc *sc; sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { sbuf_printf(sb, "%s%u\n", indent, (u_int)cp->index); } else { sbuf_printf(sb, "%s%u\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%sTotal=%u, Online=%u\n", indent, sc->sc_ndisks, g_shsec_nvalid(sc)); sbuf_printf(sb, "%s", indent); if (sc->sc_provider != NULL && sc->sc_provider->error == 0) sbuf_printf(sb, "UP"); else sbuf_printf(sb, "DOWN"); sbuf_printf(sb, "\n"); } } DECLARE_GEOM_CLASS(g_shsec_class, g_shsec); MODULE_VERSION(geom_shsec, 0); diff --git a/sys/geom/stripe/g_stripe.c b/sys/geom/stripe/g_stripe.c index 9b4df1b8dba6..ec3bfb28b6a5 100644 --- a/sys/geom/stripe/g_stripe.c +++ b/sys/geom/stripe/g_stripe.c @@ -1,1261 +1,1261 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2005 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(geom_stripe, "GEOM striping support"); static MALLOC_DEFINE(M_STRIPE, "stripe_data", "GEOM_STRIPE Data"); static uma_zone_t g_stripe_zone; static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force); static int g_stripe_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_stripe_taste; static g_ctl_req_t g_stripe_config; static g_dumpconf_t g_stripe_dumpconf; static g_init_t g_stripe_init; static g_fini_t g_stripe_fini; struct g_class g_stripe_class = { .name = G_STRIPE_CLASS_NAME, .version = G_VERSION, .ctlreq = g_stripe_config, .taste = g_stripe_taste, .destroy_geom = g_stripe_destroy_geom, .init = g_stripe_init, .fini = g_stripe_fini }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, stripe, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_STRIPE stuff"); static u_int g_stripe_debug = 0; SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, debug, CTLFLAG_RWTUN, &g_stripe_debug, 0, "Debug level"); static int g_stripe_fast = 0; SYSCTL_INT(_kern_geom_stripe, OID_AUTO, fast, CTLFLAG_RWTUN, &g_stripe_fast, 0, "Fast, but memory-consuming, mode"); static u_long g_stripe_maxmem; SYSCTL_ULONG(_kern_geom_stripe, OID_AUTO, maxmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &g_stripe_maxmem, 0, "Maximum memory that can be allocated in \"fast\" mode (in bytes)"); static u_int g_stripe_fast_failed = 0; SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, fast_failed, CTLFLAG_RD, &g_stripe_fast_failed, 0, "How many times \"fast\" mode failed"); /* * Greatest Common Divisor. */ static u_int gcd(u_int a, u_int b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static u_int lcm(u_int a, u_int b) { return ((a * b) / gcd(a, b)); } static void g_stripe_init(struct g_class *mp __unused) { g_stripe_maxmem = maxphys * 100; TUNABLE_ULONG_FETCH("kern.geom.stripe.maxmem", &g_stripe_maxmem); g_stripe_zone = uma_zcreate("g_stripe_zone", maxphys, NULL, NULL, NULL, NULL, 0, 0); g_stripe_maxmem -= g_stripe_maxmem % maxphys; uma_zone_set_max(g_stripe_zone, g_stripe_maxmem / maxphys); } static void g_stripe_fini(struct g_class *mp __unused) { uma_zdestroy(g_stripe_zone); } /* * Return the number of valid disks. */ static u_int g_stripe_nvalid(struct g_stripe_softc *sc) { u_int i, no; no = 0; for (i = 0; i < sc->sc_ndisks; i++) { if (sc->sc_disks[i] != NULL) no++; } return (no); } static void g_stripe_remove_disk(struct g_consumer *cp) { struct g_stripe_softc *sc; g_topology_assert(); KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__)); sc = (struct g_stripe_softc *)cp->geom->softc; KASSERT(sc != NULL, ("NULL sc in %s.", __func__)); if (cp->private == NULL) { G_STRIPE_DEBUG(0, "Disk %s removed from %s.", cp->provider->name, sc->sc_name); cp->private = (void *)(uintptr_t)-1; } if (sc->sc_provider != NULL) { G_STRIPE_DEBUG(0, "Device %s deactivated.", sc->sc_provider->name); g_wither_provider(sc->sc_provider, ENXIO); sc->sc_provider = NULL; } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) return; sc->sc_disks[cp->index] = NULL; cp->index = 0; g_detach(cp); g_destroy_consumer(cp); /* If there are no valid disks anymore, remove device.
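 *
 * [Editorial note, not part of the original change, on the memory
 * budget set up in g_stripe_init() above: the limit is rounded down
 * to a whole number of maxphys-sized items before it is handed to
 * UMA,
 *
 *	g_stripe_maxmem -= g_stripe_maxmem % maxphys;
 *	uma_zone_set_max(g_stripe_zone, g_stripe_maxmem / maxphys);
 *
 * so if maxphys were 128 kB, the default budget of 100 * maxphys
 * would allow 100 outstanding "fast" buffers (12.5 MB).]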
*/ if (LIST_EMPTY(&sc->sc_geom->consumer)) g_stripe_destroy(sc, 1); } static void g_stripe_orphan(struct g_consumer *cp) { struct g_stripe_softc *sc; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; g_stripe_remove_disk(cp); } static int g_stripe_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *cp1, *cp2, *tmp; - struct g_stripe_softc *sc; + struct g_stripe_softc *sc __diagused; struct g_geom *gp; int error; g_topology_assert(); gp = pp->geom; sc = gp->softc; KASSERT(sc != NULL, ("NULL sc in %s.", __func__)); /* On first open, grab an extra "exclusive" bit */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... and let go of it on last close */ if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0) de--; LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) { error = g_access(cp1, dr, dw, de); if (error != 0) goto fail; if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 && cp1->private != NULL) { g_stripe_remove_disk(cp1); /* May destroy geom. */ } } return (0); fail: LIST_FOREACH(cp2, &gp->consumer, consumer) { if (cp1 == cp2) break; g_access(cp2, -dr, -dw, -de); } return (error); } static void g_stripe_copy(struct g_stripe_softc *sc, char *src, char *dst, off_t offset, off_t length, int mode) { off_t stripesize; size_t len; stripesize = sc->sc_stripesize; len = (size_t)(stripesize - (offset & (stripesize - 1))); do { bcopy(src, dst, len); if (mode) { dst += len + stripesize * (sc->sc_ndisks - 1); src += len; } else { dst += len; src += len + stripesize * (sc->sc_ndisks - 1); } length -= len; KASSERT(length >= 0, ("Length < 0 (stripesize=%ju, offset=%ju, length=%jd).", (uintmax_t)stripesize, (uintmax_t)offset, (intmax_t)length)); if (length > stripesize) len = stripesize; else len = length; } while (length > 0); } static void g_stripe_done(struct bio *bp) { struct g_stripe_softc *sc; struct bio *pbp; pbp = bp->bio_parent; sc = pbp->bio_to->geom->softc; if (bp->bio_cmd == BIO_READ && bp->bio_caller1 != NULL) { g_stripe_copy(sc, bp->bio_data, bp->bio_caller1, bp->bio_offset, bp->bio_length, 1); bp->bio_data = bp->bio_caller1; bp->bio_caller1 = NULL; } mtx_lock(&sc->sc_lock); if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += bp->bio_completed; pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { mtx_unlock(&sc->sc_lock); if (pbp->bio_driver1 != NULL) uma_zfree(g_stripe_zone, pbp->bio_driver1); if (bp->bio_cmd == BIO_SPEEDUP) pbp->bio_completed = pbp->bio_length; g_io_deliver(pbp, pbp->bio_error); } else mtx_unlock(&sc->sc_lock); g_destroy_bio(bp); } static int g_stripe_start_fast(struct bio *bp, u_int no, off_t offset, off_t length) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); struct g_stripe_softc *sc; char *addr, *data = NULL; struct bio *cbp; off_t stripesize; u_int nparts = 0; int error; sc = bp->bio_to->geom->softc; addr = bp->bio_data; stripesize = sc->sc_stripesize; cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); nparts++; /* * Fill in the component buf structure. 
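 *
 * [Editorial note, not part of the original change: g_stripe_copy()
 * above is the (de)interleaving helper behind this "fast" path.
 * With mode == 0 it gathers the chunks belonging to one disk from
 * the caller's linear buffer into one contiguous buffer (the write
 * side); with mode != 0 it scatters a contiguous per-disk buffer
 * back into the linear buffer (the read side, see g_stripe_done()).
 * Between two consecutive chunks of the same disk the linear buffer
 * skips stripesize * (sc_ndisks - 1) bytes.]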
*/ cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; cbp->bio_data = addr; cbp->bio_caller1 = NULL; cbp->bio_length = length; cbp->bio_caller2 = sc->sc_disks[no]; /* offset -= offset % stripesize; */ offset -= offset & (stripesize - 1); addr += length; length = bp->bio_length - length; for (no++; length > 0; no++, length -= stripesize, addr += stripesize) { if (no > sc->sc_ndisks - 1) { no = 0; offset += stripesize; } if (nparts >= sc->sc_ndisks) { cbp = TAILQ_NEXT(cbp, bio_queue); if (cbp == NULL) cbp = TAILQ_FIRST(&queue); nparts++; /* * Update bio structure. */ /* * MIN() is in case when * (bp->bio_length % sc->sc_stripesize) != 0. */ cbp->bio_length += MIN(stripesize, length); if (cbp->bio_caller1 == NULL) { cbp->bio_caller1 = cbp->bio_data; cbp->bio_data = NULL; if (data == NULL) { data = uma_zalloc(g_stripe_zone, M_NOWAIT); if (data == NULL) { error = ENOMEM; goto failure; } } } } else { cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); nparts++; /* * Fill in the component buf structure. */ cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; cbp->bio_data = addr; cbp->bio_caller1 = NULL; /* * MIN() is in case when * (bp->bio_length % sc->sc_stripesize) != 0. */ cbp->bio_length = MIN(stripesize, length); cbp->bio_caller2 = sc->sc_disks[no]; } } if (data != NULL) bp->bio_driver1 = data; /* * Fire off all allocated requests! */ while ((cbp = TAILQ_FIRST(&queue)) != NULL) { struct g_consumer *cp; TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; cbp->bio_to = cp->provider; if (cbp->bio_caller1 != NULL) { cbp->bio_data = data; if (bp->bio_cmd == BIO_WRITE) { g_stripe_copy(sc, cbp->bio_caller1, data, cbp->bio_offset, cbp->bio_length, 0); } data += cbp->bio_length; } G_STRIPE_LOGREQ(cbp, "Sending request."); g_io_request(cbp, cp); } return (0); failure: if (data != NULL) uma_zfree(g_stripe_zone, data); while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); if (cbp->bio_caller1 != NULL) { cbp->bio_data = cbp->bio_caller1; cbp->bio_caller1 = NULL; } bp->bio_children--; g_destroy_bio(cbp); } return (error); } static int g_stripe_start_economic(struct bio *bp, u_int no, off_t offset, off_t length) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); struct g_stripe_softc *sc; off_t stripesize; struct bio *cbp; char *addr; int error; sc = bp->bio_to->geom->softc; stripesize = sc->sc_stripesize; cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); /* * Fill in the component buf structure. */ if (bp->bio_length == length) cbp->bio_done = g_std_done; /* Optimized lockless case. */ else cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; cbp->bio_length = length; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE; addr = NULL; } else addr = bp->bio_data; cbp->bio_caller2 = sc->sc_disks[no]; /* offset -= offset % stripesize; */ offset -= offset & (stripesize - 1); if (bp->bio_cmd != BIO_DELETE) addr += length; length = bp->bio_length - length; for (no++; length > 0; no++, length -= stripesize) { if (no > sc->sc_ndisks - 1) { no = 0; offset += stripesize; } cbp = g_clone_bio(bp); if (cbp == NULL) { error = ENOMEM; goto failure; } TAILQ_INSERT_TAIL(&queue, cbp, bio_queue); /* * Fill in the component buf structure. 
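 *
 * [Editorial note, not part of the original change: for unmapped
 * bios `addr' accumulates a byte distance from the start of the
 * request rather than a pointer, and the split below folds it into
 * the page list:
 *
 *	cbp->bio_ma_offset += (uintptr_t)addr;
 *	cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 *	cbp->bio_ma_offset %= PAGE_SIZE;
 *
 * whole pages are skipped through bio_ma while the sub-page
 * remainder stays in bio_ma_offset.]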
*/ cbp->bio_done = g_stripe_done; cbp->bio_offset = offset; /* * MIN() is in case when * (bp->bio_length % sc->sc_stripesize) != 0. */ cbp->bio_length = MIN(stripesize, length); if ((bp->bio_flags & BIO_UNMAPPED) != 0) { cbp->bio_ma_offset += (uintptr_t)addr; cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; cbp->bio_ma_offset %= PAGE_SIZE; cbp->bio_ma_n = round_page(cbp->bio_ma_offset + cbp->bio_length) / PAGE_SIZE; } else cbp->bio_data = addr; cbp->bio_caller2 = sc->sc_disks[no]; if (bp->bio_cmd != BIO_DELETE) addr += stripesize; } /* * Fire off all allocated requests! */ while ((cbp = TAILQ_FIRST(&queue)) != NULL) { struct g_consumer *cp; TAILQ_REMOVE(&queue, cbp, bio_queue); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; cbp->bio_to = cp->provider; G_STRIPE_LOGREQ(cbp, "Sending request."); g_io_request(cbp, cp); } return (0); failure: while ((cbp = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, cbp, bio_queue); bp->bio_children--; g_destroy_bio(cbp); } return (error); } static void g_stripe_pushdown(struct g_stripe_softc *sc, struct bio *bp) { struct bio_queue_head queue; struct g_consumer *cp; struct bio *cbp; u_int no; bioq_init(&queue); for (no = 0; no < sc->sc_ndisks; no++) { cbp = g_clone_bio(bp); if (cbp == NULL) { for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); g_destroy_bio(cbp); } if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_io_deliver(bp, bp->bio_error); return; } bioq_insert_tail(&queue, cbp); cbp->bio_done = g_stripe_done; cbp->bio_caller2 = sc->sc_disks[no]; cbp->bio_to = sc->sc_disks[no]->provider; } for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) { bioq_remove(&queue, cbp); G_STRIPE_LOGREQ(cbp, "Sending request."); cp = cbp->bio_caller2; cbp->bio_caller2 = NULL; g_io_request(cbp, cp); } } static void g_stripe_start(struct bio *bp) { off_t offset, start, length, nstripe, stripesize; struct g_stripe_softc *sc; u_int no; int error, fast = 0; sc = bp->bio_to->geom->softc; /* * If sc == NULL, provider's error should be set and g_stripe_start() * should not be called at all. */ KASSERT(sc != NULL, ("Provider's error should be set (error=%d)(device=%s).", bp->bio_to->error, bp->bio_to->name)); G_STRIPE_LOGREQ(bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; case BIO_SPEEDUP: case BIO_FLUSH: g_stripe_pushdown(sc, bp); return; case BIO_GETATTR: /* To which provider it should be delivered? */ default: g_io_deliver(bp, EOPNOTSUPP); return; } stripesize = sc->sc_stripesize; /* * Calculations are quite messy, but fast I hope. */ /* Stripe number. */ /* nstripe = bp->bio_offset / stripesize; */ nstripe = bp->bio_offset >> (off_t)sc->sc_stripebits; /* Disk number. */ no = nstripe % sc->sc_ndisks; /* Start position in stripe. */ /* start = bp->bio_offset % stripesize; */ start = bp->bio_offset & (stripesize - 1); /* Start position in disk. */ /* offset = (nstripe / sc->sc_ndisks) * stripesize + start; */ offset = ((nstripe / sc->sc_ndisks) << sc->sc_stripebits) + start; /* Length of data to operate. */ length = MIN(bp->bio_length, stripesize - start); /* * Do use "fast" mode when: * 1. "Fast" mode is ON. * and * 2. Request size is less than or equal to maxphys, * which should always be true. * and * 3. Request size is at least stripesize * ndisks. If it isn't, * there will be no need to send more than one I/O request to * a provider, so there is nothing to optimize. * and * 4. Request is not unmapped. * and * 5. It is not a BIO_DELETE.
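 *
 * [Editorial note, not part of the original change: a worked example
 * of the offset arithmetic above, with stripesize = 64 kB
 * (sc_stripebits = 16), sc_ndisks = 4 and
 * bio_offset = 9 * 64 kB + 4 kB:
 *
 *	nstripe = bio_offset >> 16		= 9
 *	no      = 9 % 4				= 1	(disk 1)
 *	start   = bio_offset & (stripesize - 1)	= 4 kB
 *	offset  = ((9 / 4) << 16) + start	= 2 * 64 kB + 4 kB
 *
 * so for a large enough request the first chunk covers the 60 kB
 * left in that stripe, read from or written to disk 1 at byte
 * offset 135168.]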
*/ if (g_stripe_fast && bp->bio_length <= maxphys && bp->bio_length >= stripesize * sc->sc_ndisks && (bp->bio_flags & BIO_UNMAPPED) == 0 && bp->bio_cmd != BIO_DELETE) { fast = 1; } error = 0; if (fast) { error = g_stripe_start_fast(bp, no, offset, length); if (error != 0) g_stripe_fast_failed++; } /* * Do use "economic" when: * 1. "Economic" mode is ON. * or * 2. "Fast" mode failed. It can only fail if there is no memory. */ if (!fast || error != 0) error = g_stripe_start_economic(bp, no, offset, length); if (error != 0) { if (bp->bio_error == 0) bp->bio_error = error; g_io_deliver(bp, bp->bio_error); } } static void g_stripe_check_and_run(struct g_stripe_softc *sc) { struct g_provider *dp; off_t mediasize, ms; u_int no, sectorsize = 0; g_topology_assert(); if (g_stripe_nvalid(sc) != sc->sc_ndisks) return; sc->sc_provider = g_new_providerf(sc->sc_geom, "stripe/%s", sc->sc_name); sc->sc_provider->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; if (g_stripe_fast == 0) sc->sc_provider->flags |= G_PF_ACCEPT_UNMAPPED; /* * Find the smallest disk. */ mediasize = sc->sc_disks[0]->provider->mediasize; if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) mediasize -= sc->sc_disks[0]->provider->sectorsize; mediasize -= mediasize % sc->sc_stripesize; sectorsize = sc->sc_disks[0]->provider->sectorsize; for (no = 1; no < sc->sc_ndisks; no++) { dp = sc->sc_disks[no]->provider; ms = dp->mediasize; if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) ms -= dp->sectorsize; ms -= ms % sc->sc_stripesize; if (ms < mediasize) mediasize = ms; sectorsize = lcm(sectorsize, dp->sectorsize); /* A provider underneath us doesn't support unmapped */ if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) { G_STRIPE_DEBUG(1, "Cancelling unmapped " "because of %s.", dp->name); sc->sc_provider->flags &= ~G_PF_ACCEPT_UNMAPPED; } } sc->sc_provider->sectorsize = sectorsize; sc->sc_provider->mediasize = mediasize * sc->sc_ndisks; sc->sc_provider->stripesize = sc->sc_stripesize; sc->sc_provider->stripeoffset = 0; g_error_provider(sc->sc_provider, 0); G_STRIPE_DEBUG(0, "Device %s activated.", sc->sc_provider->name); } static int g_stripe_read_metadata(struct g_consumer *cp, struct g_stripe_metadata *md) { struct g_provider *pp; u_char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); /* Decode metadata. */ stripe_metadata_decode(buf, md); g_free(buf); return (0); } /* * Add disk to given device. */ static int g_stripe_add_disk(struct g_stripe_softc *sc, struct g_provider *pp, u_int no) { struct g_consumer *cp, *fcp; struct g_geom *gp; int error; g_topology_assert(); /* Metadata corrupted? */ if (no >= sc->sc_ndisks) return (EINVAL); /* Check if disk is not already attached. */ if (sc->sc_disks[no] != NULL) return (EEXIST); gp = sc->sc_geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; cp->private = NULL; cp->index = no; error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) { struct g_stripe_metadata md; /* Reread metadata. 
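* The on-disk label may have changed since this provider was last * tasted, so verify that the magic, the name and the id still match * before attaching the disk.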
*/ error = g_stripe_read_metadata(cp, &md); if (error != 0) goto fail; if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0 || strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) { G_STRIPE_DEBUG(0, "Metadata on %s changed.", pp->name); goto fail; } } sc->sc_disks[no] = cp; G_STRIPE_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name); g_stripe_check_and_run(sc); return (0); fail: if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace); g_detach(cp); g_destroy_consumer(cp); return (error); } static struct g_geom * g_stripe_create(struct g_class *mp, const struct g_stripe_metadata *md, u_int type) { struct g_stripe_softc *sc; struct g_geom *gp; u_int no; g_topology_assert(); G_STRIPE_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); /* Two disks are the minimum. */ if (md->md_all < 2) { G_STRIPE_DEBUG(0, "Too few disks defined for %s.", md->md_name); return (NULL); } #if 0 /* Stripe size has to be greater than or equal to the sector size. */ if (md->md_stripesize < sectorsize) { G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name); return (NULL); } #endif /* Stripe size has to be a power of 2. */ if (!powerof2(md->md_stripesize)) { G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name); return (NULL); } /* Check for a duplicate unit. */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) { G_STRIPE_DEBUG(0, "Device %s already configured.", sc->sc_name); return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); sc = malloc(sizeof(*sc), M_STRIPE, M_WAITOK | M_ZERO); gp->start = g_stripe_start; gp->spoiled = g_stripe_orphan; gp->orphan = g_stripe_orphan; gp->access = g_stripe_access; gp->dumpconf = g_stripe_dumpconf; sc->sc_id = md->md_id; sc->sc_stripesize = md->md_stripesize; sc->sc_stripebits = bitcount32(sc->sc_stripesize - 1); sc->sc_ndisks = md->md_all; sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks, M_STRIPE, M_WAITOK | M_ZERO); for (no = 0; no < sc->sc_ndisks; no++) sc->sc_disks[no] = NULL; sc->sc_type = type; mtx_init(&sc->sc_lock, "gstripe lock", NULL, MTX_DEF); gp->softc = sc; sc->sc_geom = gp; sc->sc_provider = NULL; G_STRIPE_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); return (gp); } static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force) { struct g_provider *pp; struct g_consumer *cp, *cp1; struct g_geom *gp; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->sc_provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { if (force) { G_STRIPE_DEBUG(0, "Device %s is still open, so it " "can't be definitely removed.", pp->name); } else { G_STRIPE_DEBUG(1, "Device %s is still open (r%dw%de%d).", pp->name, pp->acr, pp->acw, pp->ace); return (EBUSY); } } gp = sc->sc_geom; LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) { g_stripe_remove_disk(cp); if (cp1 == NULL) return (0); /* Recursion happened. */ } if (!LIST_EMPTY(&gp->consumer)) return (EINPROGRESS); gp->softc = NULL; KASSERT(sc->sc_provider == NULL, ("Provider still exists?
(device=%s)", gp->name)); free(sc->sc_disks, M_STRIPE); mtx_destroy(&sc->sc_lock); free(sc, M_STRIPE); G_STRIPE_DEBUG(0, "Device %s destroyed.", gp->name); g_wither_geom(gp, ENXIO); return (0); } static int g_stripe_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_stripe_softc *sc; sc = gp->softc; return (g_stripe_destroy(sc, 0)); } static struct g_geom * g_stripe_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_stripe_metadata md; struct g_stripe_softc *sc; struct g_consumer *cp; struct g_geom *gp; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); /* Skip providers that are already open for writing. */ if (pp->acw > 0) return (NULL); G_STRIPE_DEBUG(3, "Tasting %s.", pp->name); gp = g_new_geomf(mp, "stripe:taste"); gp->start = g_stripe_start; gp->access = g_stripe_access; gp->orphan = g_stripe_orphan; cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error == 0) { error = g_stripe_read_metadata(cp, &md); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); gp = NULL; if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0) return (NULL); if (md.md_version > G_STRIPE_VERSION) { printf("geom_stripe.ko module is too old to handle %s.\n", pp->name); return (NULL); } /* * Backward compatibility: */ /* There was no md_provider field in earlier versions of metadata. */ if (md.md_version < 2) bzero(md.md_provider, sizeof(md.md_provider)); /* There was no md_provsize field in earlier versions of metadata. */ if (md.md_version < 3) md.md_provsize = pp->mediasize; if (md.md_provider[0] != '\0' && !g_compare_names(md.md_provider, pp->name)) return (NULL); if (md.md_provsize != pp->mediasize) return (NULL); /* * Let's check if device already exists. 
*/ sc = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (sc->sc_type != G_STRIPE_TYPE_AUTOMATIC) continue; if (strcmp(md.md_name, sc->sc_name) != 0) continue; if (md.md_id != sc->sc_id) continue; break; } if (gp != NULL) { G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_stripe_add_disk(sc, pp, md.md_no); if (error != 0) { G_STRIPE_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); return (NULL); } } else { gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_AUTOMATIC); if (gp == NULL) { G_STRIPE_DEBUG(0, "Cannot create device %s.", md.md_name); return (NULL); } sc = gp->softc; G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); error = g_stripe_add_disk(sc, pp, md.md_no); if (error != 0) { G_STRIPE_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name, gp->name, error); g_stripe_destroy(sc, 1); return (NULL); } } return (gp); } static void g_stripe_ctl_create(struct gctl_req *req, struct g_class *mp) { u_int attached, no; struct g_stripe_metadata md; struct g_provider *pp; struct g_stripe_softc *sc; struct g_geom *gp; struct sbuf *sb; off_t *stripesize; const char *name; char param[16]; int *nargs; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 2) { gctl_error(req, "Too few arguments."); return; } strlcpy(md.md_magic, G_STRIPE_MAGIC, sizeof(md.md_magic)); md.md_version = G_STRIPE_VERSION; name = gctl_get_asciiparam(req, "arg0"); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", 0); return; } strlcpy(md.md_name, name, sizeof(md.md_name)); md.md_id = arc4random(); md.md_no = 0; md.md_all = *nargs - 1; stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize)); if (stripesize == NULL) { gctl_error(req, "No '%s' argument.", "stripesize"); return; } md.md_stripesize = (uint32_t)*stripesize; bzero(md.md_provider, sizeof(md.md_provider)); /* This field is not important here. 
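* (md_provsize is only compared against the provider's mediasize when * tasting on-disk metadata, which is not the case for a manually * created device.)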
*/ md.md_provsize = 0; /* Check all providers are valid */ for (no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); pp = gctl_get_provider(req, param); if (pp == NULL) return; } gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_MANUAL); if (gp == NULL) { gctl_error(req, "Can't configure %s.", md.md_name); return; } sc = gp->softc; sb = sbuf_new_auto(); sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name); for (attached = 0, no = 1; no < *nargs; no++) { snprintf(param, sizeof(param), "arg%u", no); pp = gctl_get_provider(req, param); if (pp == NULL) { name = gctl_get_asciiparam(req, param); MPASS(name != NULL); sbuf_printf(sb, " %s", name); continue; } if (g_stripe_add_disk(sc, pp, no - 1) != 0) { G_STRIPE_DEBUG(1, "Disk %u (%s) not attached to %s.", no, pp->name, gp->name); sbuf_printf(sb, " %s", pp->name); continue; } attached++; } sbuf_finish(sb); if (md.md_all != attached) { g_stripe_destroy(gp->softc, 1); gctl_error(req, "%s", sbuf_data(sb)); } sbuf_delete(sb); } static struct g_stripe_softc * g_stripe_find_device(struct g_class *mp, const char *name) { struct g_stripe_softc *sc; struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(sc->sc_name, name) == 0) return (sc); } return (NULL); } static void g_stripe_ctl_destroy(struct gctl_req *req, struct g_class *mp) { struct g_stripe_softc *sc; int *force, *nargs, error; const char *name; char param[16]; u_int i; g_topology_assert(); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No '%s' argument.", "nargs"); return; } if (*nargs <= 0) { gctl_error(req, "Missing device(s)."); return; } force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No '%s' argument.", "force"); return; } for (i = 0; i < (u_int)*nargs; i++) { snprintf(param, sizeof(param), "arg%u", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%u' argument.", i); return; } sc = g_stripe_find_device(mp, name); if (sc == NULL) { gctl_error(req, "No such device: %s.", name); return; } error = g_stripe_destroy(sc, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", sc->sc_name, error); return; } } } static void g_stripe_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_STRIPE_VERSION) { gctl_error(req, "Userland and kernel parts are out of sync."); return; } if (strcmp(verb, "create") == 0) { g_stripe_ctl_create(req, mp); return; } else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) { g_stripe_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown verb."); } static void g_stripe_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_stripe_softc *sc; sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { /* Nothing here. 
*/ } else if (cp != NULL) { sbuf_printf(sb, "%s<Number>%u</Number>\n", indent, (u_int)cp->index); } else { sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id); sbuf_printf(sb, "%s<Stripesize>%ju</Stripesize>\n", indent, (uintmax_t)sc->sc_stripesize); sbuf_printf(sb, "%s<Type>", indent); switch (sc->sc_type) { case G_STRIPE_TYPE_AUTOMATIC: sbuf_cat(sb, "AUTOMATIC"); break; case G_STRIPE_TYPE_MANUAL: sbuf_cat(sb, "MANUAL"); break; default: sbuf_cat(sb, "UNKNOWN"); break; } sbuf_cat(sb, "</Type>\n"); sbuf_printf(sb, "%s<Status>Total=%u, Online=%u</Status>\n", indent, sc->sc_ndisks, g_stripe_nvalid(sc)); sbuf_printf(sb, "%s<State>", indent); if (sc->sc_provider != NULL && sc->sc_provider->error == 0) sbuf_cat(sb, "UP"); else sbuf_cat(sb, "DOWN"); sbuf_cat(sb, "</State>\n"); } } DECLARE_GEOM_CLASS(g_stripe_class, g_stripe); MODULE_VERSION(geom_stripe, 0); diff --git a/sys/geom/vinum/geom_vinum_create.c b/sys/geom/vinum/geom_vinum_create.c index 036ce82c45e8..c828fdd5ac33 100644 --- a/sys/geom/vinum/geom_vinum_create.c +++ b/sys/geom/vinum/geom_vinum_create.c @@ -1,613 +1,612 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #define DEFAULT_STRIPESIZE 262144 /* * Create a new drive object, either by user request, during taste of the drive * itself, or because it was referenced by a subdisk during taste. */ int gv_create_drive(struct gv_softc *sc, struct gv_drive *d) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp, *cp2; struct gv_drive *d2; struct gv_hdr *hdr; struct gv_freelist *fl; KASSERT(d != NULL, ("gv_create_drive: NULL d")); gp = sc->geom; pp = NULL; cp = cp2 = NULL; /* The drive already has a consumer if it was tasted before.
*/ if (d->consumer != NULL) { cp = d->consumer; cp->private = d; pp = cp->provider; } else if (!(d->flags & GV_DRIVE_REFERENCED)) { if (gv_find_drive(sc, d->name) != NULL) { G_VINUM_DEBUG(0, "drive '%s' already exists", d->name); g_free(d); return (GV_ERR_CREATE); } if (gv_find_drive_device(sc, d->device) != NULL) { G_VINUM_DEBUG(0, "provider '%s' already in use by " "gvinum", d->device); return (GV_ERR_CREATE); } pp = g_provider_by_name(d->device); if (pp == NULL) { G_VINUM_DEBUG(0, "create '%s': device '%s' disappeared", d->name, d->device); g_free(d); return (GV_ERR_CREATE); } g_topology_lock(); cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); g_topology_unlock(); G_VINUM_DEBUG(0, "create drive '%s': unable to attach", d->name); g_free(d); return (GV_ERR_CREATE); } g_topology_unlock(); d->consumer = cp; cp->private = d; } /* * If this was just a "referenced" drive, we're almost finished, but * insert this drive not on the head of the drives list, as * gv_drive_is_newer() expects a "real" drive from LIST_FIRST(). */ if (d->flags & GV_DRIVE_REFERENCED) { snprintf(d->device, sizeof(d->device), "???"); d2 = LIST_FIRST(&sc->drives); if (d2 == NULL) LIST_INSERT_HEAD(&sc->drives, d, drive); else LIST_INSERT_AFTER(d2, d, drive); return (0); } /* * Update access counts of the new drive to those of an already * existing drive. */ LIST_FOREACH(d2, &sc->drives, drive) { if ((d == d2) || (d2->consumer == NULL)) continue; cp2 = d2->consumer; g_topology_lock(); if ((cp2->acr || cp2->acw || cp2->ace) && (g_access(cp, cp2->acr, cp2->acw, cp2->ace) != 0)) { g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); G_VINUM_DEBUG(0, "create drive '%s': unable to update " "access counts", d->name); if (d->hdr != NULL) g_free(d->hdr); g_free(d); return (GV_ERR_CREATE); } g_topology_unlock(); break; } d->size = pp->mediasize - GV_DATA_START; d->avail = d->size; d->vinumconf = sc; LIST_INIT(&d->subdisks); LIST_INIT(&d->freelist); /* The header might have been set during taste. */ if (d->hdr == NULL) { hdr = g_malloc(sizeof(*hdr), M_WAITOK | M_ZERO); hdr->magic = GV_MAGIC; hdr->config_length = GV_CFG_LEN; getcredhostname(NULL, hdr->label.sysname, GV_HOSTNAME_LEN); strlcpy(hdr->label.name, d->name, sizeof(hdr->label.name)); microtime(&hdr->label.date_of_birth); d->hdr = hdr; } /* We also need a freelist entry. */ fl = g_malloc(sizeof(struct gv_freelist), M_WAITOK | M_ZERO); fl->offset = GV_DATA_START; fl->size = d->avail; LIST_INSERT_HEAD(&d->freelist, fl, freelist); d->freelist_entries = 1; if (gv_find_drive(sc, d->name) == NULL) LIST_INSERT_HEAD(&sc->drives, d, drive); gv_set_drive_state(d, GV_DRIVE_UP, 0); return (0); } int gv_create_volume(struct gv_softc *sc, struct gv_volume *v) { KASSERT(v != NULL, ("gv_create_volume: NULL v")); v->vinumconf = sc; v->flags |= GV_VOL_NEWBORN; LIST_INIT(&v->plexes); LIST_INSERT_HEAD(&sc->volumes, v, volume); v->wqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(v->wqueue); return (0); } int gv_create_plex(struct gv_softc *sc, struct gv_plex *p) { struct gv_volume *v; KASSERT(p != NULL, ("gv_create_plex: NULL p")); /* Find the volume this plex should be attached to. 
*/ v = gv_find_vol(sc, p->volume); if (v == NULL) { G_VINUM_DEBUG(0, "create plex '%s': volume '%s' not found", p->name, p->volume); g_free(p); return (GV_ERR_CREATE); } if (!(v->flags & GV_VOL_NEWBORN)) p->flags |= GV_PLEX_ADDED; p->vol_sc = v; v->plexcount++; p->vinumconf = sc; p->synced = 0; p->flags |= GV_PLEX_NEWBORN; LIST_INSERT_HEAD(&v->plexes, p, in_volume); LIST_INIT(&p->subdisks); TAILQ_INIT(&p->packets); LIST_INSERT_HEAD(&sc->plexes, p, plex); p->bqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(p->bqueue); p->wqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(p->wqueue); p->rqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO); bioq_init(p->rqueue); return (0); } int gv_create_sd(struct gv_softc *sc, struct gv_sd *s) { struct gv_plex *p; struct gv_drive *d; KASSERT(s != NULL, ("gv_create_sd: NULL s")); /* Find the drive where this subdisk should be put on. */ d = gv_find_drive(sc, s->drive); if (d == NULL) { /* * It's possible that the subdisk references a drive that * doesn't exist yet (during the taste process), so create a * practically empty "referenced" drive. */ if (s->flags & GV_SD_TASTED) { d = g_malloc(sizeof(struct gv_drive), M_WAITOK | M_ZERO); d->flags |= GV_DRIVE_REFERENCED; strlcpy(d->name, s->drive, sizeof(d->name)); gv_create_drive(sc, d); } else { G_VINUM_DEBUG(0, "create sd '%s': drive '%s' not found", s->name, s->drive); g_free(s); return (GV_ERR_CREATE); } } /* Find the plex where this subdisk belongs to. */ p = gv_find_plex(sc, s->plex); if (p == NULL) { G_VINUM_DEBUG(0, "create sd '%s': plex '%s' not found", s->name, s->plex); g_free(s); return (GV_ERR_CREATE); } /* * First we give the subdisk to the drive, to handle autosized * values ... */ if (gv_sd_to_drive(s, d) != 0) { g_free(s); return (GV_ERR_CREATE); } /* * Then, we give the subdisk to the plex; we check if the * given values are correct and maybe adjust them. */ if (gv_sd_to_plex(s, p) != 0) { G_VINUM_DEBUG(0, "unable to give sd '%s' to plex '%s'", s->name, p->name); if (s->drive_sc && !(s->drive_sc->flags & GV_DRIVE_REFERENCED)) LIST_REMOVE(s, from_drive); gv_free_sd(s); g_free(s); /* * If this subdisk can't be created, we won't create * the attached plex either, if it is also a new one. */ if (!(p->flags & GV_PLEX_NEWBORN)) return (GV_ERR_CREATE); gv_rm_plex(sc, p); return (GV_ERR_CREATE); } s->flags |= GV_SD_NEWBORN; s->vinumconf = sc; LIST_INSERT_HEAD(&sc->subdisks, s, sd); return (0); } /* * Create a concatenated volume from specified drives or drivegroups. */ void gv_concat(struct g_geom *gp, struct gctl_req *req) { struct gv_drive *d; struct gv_sd *s; struct gv_volume *v; struct gv_plex *p; struct gv_softc *sc; char *drive, buf[30], *vol; int *drives, dcount; sc = gp->softc; dcount = 0; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* First we create the volume. */ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plex. 
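* The objects are named hierarchically: a volume "data" gets a plex * "data.p0", which in turn gets one subdisk "data.p0.s0", * "data.p0.s1", ... per drive given.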
*/ p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); strlcpy(p->volume, v->name, sizeof(p->volume)); p->org = GV_PLEX_CONCAT; p->stripesize = 0; gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* Drives are the first priority (right now). */ for (dcount = 0; dcount < *drives; dcount++) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s'", drive); continue; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, dcount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } /* * Create a mirrored volume from specified drives or drivegroups. */ void gv_mirror(struct g_geom *gp, struct gctl_req *req) { struct gv_drive *d; struct gv_sd *s; struct gv_volume *v; struct gv_plex *p; struct gv_softc *sc; char *drive, buf[30], *vol; int *drives, *flags, dcount, pcount, scount; sc = gp->softc; dcount = 0; scount = 0; pcount = 0; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* We must have an even number of drives. */ if (*drives % 2 != 0) { gctl_error(req, "mirror organization must have an even number " "of drives"); return; } if (*flags & GV_FLAG_S && *drives < 4) { gctl_error(req, "must have at least 4 drives for striped plex"); return; } /* First we create the volume. */ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plexes. */ for (pcount = 0; pcount < 2; pcount++) { p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, pcount); strlcpy(p->volume, v->name, sizeof(p->volume)); if (*flags & GV_FLAG_S) { p->org = GV_PLEX_STRIPED; p->stripesize = DEFAULT_STRIPESIZE; } else { p->org = GV_PLEX_CONCAT; p->stripesize = -1; } gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* * We just give each even-numbered drive to plex one, and each * odd-numbered one to plex two.
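* With four drives d0-d3 this puts subdisks on d0 and d2 into the * first plex and subdisks on d1 and d3 into the second one, since * dcount starts at pcount and advances in steps of two.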
*/ scount = 0; for (dcount = pcount; dcount < *drives; dcount += 2) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s', aborting", drive); scount++; break; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, scount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); scount++; } } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } void gv_raid5(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_drive *d; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; int *drives, *flags, dcount; char *vol, *drive, buf[30]; off_t *stripesize; sc = gp->softc; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize)); if (stripesize == NULL) { gctl_error(req, "no stripesize given"); return; } if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* We must have at least three drives. */ if (*drives < 3) { gctl_error(req, "must have at least three drives for this " "plex organisation"); return; } /* First we create the volume. */ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plex. */ p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); strlcpy(p->volume, v->name, sizeof(p->volume)); p->org = GV_PLEX_RAID5; p->stripesize = *stripesize; gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* Create subdisks on drives. */ for (dcount = 0; dcount < *drives; dcount++) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s'", drive); continue; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, dcount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } /* * Create a striped volume from specified drives or drivegroups. */ void gv_stripe(struct g_geom *gp, struct gctl_req *req) { struct gv_drive *d; struct gv_sd *s; struct gv_volume *v; struct gv_plex *p; struct gv_softc *sc; char *drive, buf[30], *vol; - int *drives, *flags, dcount, pcount; + int *drives, *flags, dcount; sc = gp->softc; dcount = 0; - pcount = 0; vol = gctl_get_param(req, "name", NULL); if (vol == NULL) { gctl_error(req, "volume name not given"); return; } flags = gctl_get_paraml(req, "flags", sizeof(*flags)); drives = gctl_get_paraml(req, "drives", sizeof(*drives)); if (drives == NULL) { gctl_error(req, "drive names not given"); return; } /* We must have at least two drives. 
*/ if (*drives < 2) { gctl_error(req, "must have at least 2 drives"); return; } /* First we create the volume. */ v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); strlcpy(v->name, vol, sizeof(v->name)); v->state = GV_VOL_UP; gv_post_event(sc, GV_EVENT_CREATE_VOLUME, v, NULL, 0, 0); /* Then we create the plex. */ p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); strlcpy(p->volume, v->name, sizeof(p->volume)); p->org = GV_PLEX_STRIPED; p->stripesize = 262144; gv_post_event(sc, GV_EVENT_CREATE_PLEX, p, NULL, 0, 0); /* Create subdisks on drives. */ for (dcount = 0; dcount < *drives; dcount++) { snprintf(buf, sizeof(buf), "drive%d", dcount); drive = gctl_get_param(req, buf, NULL); d = gv_find_drive(sc, drive); if (d == NULL) { gctl_error(req, "No such drive '%s'", drive); continue; } s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); snprintf(s->name, sizeof(s->name), "%s.s%d", p->name, dcount); strlcpy(s->plex, p->name, sizeof(s->plex)); strlcpy(s->drive, drive, sizeof(s->drive)); s->plex_offset = -1; s->drive_offset = -1; s->size = -1; gv_post_event(sc, GV_EVENT_CREATE_SD, s, NULL, 0, 0); } gv_post_event(sc, GV_EVENT_SETUP_OBJECTS, sc, NULL, 0, 0); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } diff --git a/sys/geom/vinum/geom_vinum_init.c b/sys/geom/vinum/geom_vinum_init.c index 115a3c66d445..55899b480e63 100644 --- a/sys/geom/vinum/geom_vinum_init.c +++ b/sys/geom/vinum/geom_vinum_init.c @@ -1,390 +1,390 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include static int gv_sync(struct gv_volume *); static int gv_rebuild_plex(struct gv_plex *); static int gv_init_plex(struct gv_plex *); static int gv_grow_plex(struct gv_plex *); static int gv_sync_plex(struct gv_plex *, struct gv_plex *); static struct gv_plex *gv_find_good_plex(struct gv_volume *); void gv_start_obj(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_volume *v; struct gv_plex *p; int *argc, *initsize; char *argv, buf[20]; int i, type; argc = gctl_get_paraml(req, "argc", sizeof(*argc)); initsize = gctl_get_paraml(req, "initsize", sizeof(*initsize)); if (argc == NULL || *argc == 0) { gctl_error(req, "no arguments given"); return; } sc = gp->softc; for (i = 0; i < *argc; i++) { snprintf(buf, sizeof(buf), "argv%d", i); argv = gctl_get_param(req, buf, NULL); if (argv == NULL) continue; type = gv_object_type(sc, argv); switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, argv); if (v != NULL) gv_post_event(sc, GV_EVENT_START_VOLUME, v, NULL, *initsize, 0); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, argv); if (p != NULL) gv_post_event(sc, GV_EVENT_START_PLEX, p, NULL, *initsize, 0); break; case GV_TYPE_SD: case GV_TYPE_DRIVE: /* XXX Not implemented, but what is the use? */ gctl_error(req, "unable to start '%s' - not yet supported", argv); return; default: gctl_error(req, "unknown object '%s'", argv); return; } } } int gv_start_plex(struct gv_plex *p) { struct gv_volume *v; struct gv_plex *up; struct gv_sd *s; int error; KASSERT(p != NULL, ("gv_start_plex: NULL p")); error = 0; v = p->vol_sc; /* RAID5 plexes can either be init, rebuilt or grown. */ if (p->org == GV_PLEX_RAID5) { if (p->state > GV_PLEX_DEGRADED) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { error = gv_grow_plex(p); return (error); } } } else if (p->state == GV_PLEX_DEGRADED) { error = gv_rebuild_plex(p); } else error = gv_init_plex(p); } else { /* We want to sync from the other plex if we're down. */ if (p->state == GV_PLEX_DOWN && v->plexcount > 1) { up = gv_find_good_plex(v); if (up == NULL) { G_VINUM_DEBUG(1, "unable to find a good plex"); return (ENXIO); } g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); if (error) { g_topology_unlock(); G_VINUM_DEBUG(0, "sync from '%s' failed to " "access volume: %d", up->name, error); return (error); } g_topology_unlock(); error = gv_sync_plex(p, up); if (error) return (error); /* * In case we have a stripe that is up, check whether it can be * grown. */ } else if (p->org == GV_PLEX_STRIPED && p->state != GV_PLEX_DOWN) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { error = gv_grow_plex(p); break; } } } } return (error); } int gv_start_vol(struct gv_volume *v) { struct gv_plex *p; int error; KASSERT(v != NULL, ("gv_start_vol: NULL v")); error = 0; if (v->plexcount == 0) return (ENXIO); else if (v->plexcount == 1) { p = LIST_FIRST(&v->plexes); KASSERT(p != NULL, ("gv_start_vol: NULL p on %s", v->name)); error = gv_start_plex(p); } else error = gv_sync(v); return (error); } /* Sync a plex p from the plex up. 
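* Syncing works as a read/write ping-pong: gv_sync_request() reads up * to GV_DFLT_SYNCSIZE bytes from the good plex, gv_sync_complete() * turns every finished read into a write to the stale plex and every * finished write into the read of the next chunk, until the end of the * source plex is reached.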
*/ static int gv_sync_plex(struct gv_plex *p, struct gv_plex *up) { int error; KASSERT(p != NULL, ("%s: NULL p", __func__)); KASSERT(up != NULL, ("%s: NULL up", __func__)); if ((p == up) || (p->state == GV_PLEX_UP)) return (0); if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) { return (EINPROGRESS); } p->synced = 0; p->flags |= GV_PLEX_SYNCING; G_VINUM_DEBUG(1, "starting sync of plex %s", p->name); error = gv_sync_request(up, p, p->synced, MIN(GV_DFLT_SYNCSIZE, up->size - p->synced), BIO_READ, NULL); if (error) { G_VINUM_DEBUG(0, "error syncing plex %s", p->name); return (error); } return (0); } /* Return a good plex from volume v. */ static struct gv_plex * gv_find_good_plex(struct gv_volume *v) { struct gv_plex *up; /* Find the plex that's up. */ up = NULL; LIST_FOREACH(up, &v->plexes, in_volume) { if (up->state == GV_PLEX_UP) break; } /* Didn't find a good plex. */ return (up); } static int gv_sync(struct gv_volume *v) { - struct gv_softc *sc; + struct gv_softc *sc __diagused; struct gv_plex *p, *up; int error; KASSERT(v != NULL, ("gv_sync: NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("gv_sync: NULL sc on %s", v->name)); up = gv_find_good_plex(v); if (up == NULL) return (ENXIO); g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); if (error) { g_topology_unlock(); G_VINUM_DEBUG(0, "sync from '%s' failed to access volume: %d", up->name, error); return (error); } g_topology_unlock(); /* Go through the good plex, and issue BIO's to all other plexes. */ LIST_FOREACH(p, &v->plexes, in_volume) { error = gv_sync_plex(p, up); if (error) break; } return (0); } static int gv_rebuild_plex(struct gv_plex *p) { struct gv_drive *d; struct gv_sd *s; int error; if (p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING || p->flags & GV_PLEX_GROWING) return (EINPROGRESS); /* * Make sure that all subdisks have consumers. We won't allow a rebuild * unless every subdisk has one. */ LIST_FOREACH(s, &p->subdisks, in_plex) { d = s->drive_sc; if (d == NULL || (d->flags & GV_DRIVE_REFERENCED)) { G_VINUM_DEBUG(0, "unable to rebuild %s, subdisk(s) have" " no drives", p->name); return (ENXIO); } } p->flags |= GV_PLEX_REBUILDING; p->synced = 0; g_topology_assert_not(); g_topology_lock(); error = gv_access(p->vol_sc->provider, 1, 1, 0); if (error) { G_VINUM_DEBUG(0, "unable to access provider"); return (0); } g_topology_unlock(); gv_parity_request(p, GV_BIO_REBUILD, 0); return (0); } static int gv_grow_plex(struct gv_plex *p) { struct gv_volume *v; struct gv_sd *s; off_t origsize, origlength; int error, sdcount; KASSERT(p != NULL, ("gv_grow_plex: NULL p")); v = p->vol_sc; KASSERT(v != NULL, ("gv_grow_plex: NULL v")); if (p->flags & GV_PLEX_GROWING || p->flags & GV_PLEX_SYNCING || p->flags & GV_PLEX_REBUILDING) return (EINPROGRESS); g_topology_lock(); error = gv_access(v->provider, 1, 1, 0); g_topology_unlock(); if (error) { G_VINUM_DEBUG(0, "unable to access provider"); return (error); } /* XXX: This origsize computation is used in two other places as * well, so we should turn it into a function.
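* With the growing subdisks ignored, a RAID5 plex of sdcount equally * sized subdisks stores (sdcount - 1) * size bytes of payload, one * subdisk's worth of space going to parity; origsize below is that * total and origlength is the payload of a single stripe.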
*/ sdcount = p->sdcount; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } s = LIST_FIRST(&p->subdisks); if (s == NULL) { G_VINUM_DEBUG(0, "error growing plex without subdisks"); return (GV_ERR_NOTFOUND); } p->flags |= GV_PLEX_GROWING; origsize = (sdcount - 1) * s->size; origlength = (sdcount - 1) * p->stripesize; p->synced = 0; G_VINUM_DEBUG(1, "starting growing of plex %s", p->name); gv_grow_request(p, 0, MIN(origlength, origsize), BIO_READ, NULL); return (0); } static int gv_init_plex(struct gv_plex *p) { struct gv_drive *d; struct gv_sd *s; int error; off_t start; caddr_t data; KASSERT(p != NULL, ("gv_init_plex: NULL p")); LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state == GV_SD_INITIALIZING) return (EINPROGRESS); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); s->init_size = GV_DFLT_SYNCSIZE; start = s->drive_offset + s->initialized; d = s->drive_sc; if (d == NULL) { G_VINUM_DEBUG(0, "subdisk %s has no drive yet", s->name); break; } /* * Take the lock here since we need to avoid a race in * gv_init_request if the BIO is completed before the lock is * released. */ g_topology_lock(); error = g_access(d->consumer, 0, 1, 0); g_topology_unlock(); if (error) { G_VINUM_DEBUG(0, "error accessing consumer when " "initializing %s", s->name); break; } data = g_malloc(s->init_size, M_WAITOK | M_ZERO); gv_init_request(s, start, data, s->init_size); } return (0); } diff --git a/sys/geom/vinum/geom_vinum_plex.c b/sys/geom/vinum/geom_vinum_plex.c index a7b1e1e5a8bd..84203fcad6c9 100644 --- a/sys/geom/vinum/geom_vinum_plex.c +++ b/sys/geom/vinum/geom_vinum_plex.c @@ -1,1051 +1,1049 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include static int gv_check_parity(struct gv_plex *, struct bio *, struct gv_raid5_packet *); static int gv_normal_parity(struct gv_plex *, struct bio *, struct gv_raid5_packet *); static void gv_plex_flush(struct gv_plex *); static int gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, int *, int); static int gv_plex_normal_request(struct gv_plex *, struct bio *, off_t, off_t, caddr_t); static void gv_post_bio(struct gv_softc *, struct bio *); void gv_plex_start(struct gv_plex *p, struct bio *bp) { struct bio *cbp; struct gv_sd *s; struct gv_raid5_packet *wp; caddr_t addr; off_t bcount, boff, len; bcount = bp->bio_length; addr = bp->bio_data; boff = bp->bio_offset; /* Walk over the whole length of the request, we might split it up. */ while (bcount > 0) { wp = NULL; /* * RAID5 plexes need special treatment, as a single request * might involve several read/write sub-requests. */ if (p->org == GV_PLEX_RAID5) { wp = gv_raid5_start(p, bp, addr, boff, bcount); if (wp == NULL) return; len = wp->length; if (TAILQ_EMPTY(&wp->bits)) g_free(wp); else if (wp->lockbase != -1) TAILQ_INSERT_TAIL(&p->packets, wp, list); /* * Requests to concatenated and striped plexes go straight * through. */ } else { len = gv_plex_normal_request(p, bp, boff, bcount, addr); } if (len < 0) return; bcount -= len; addr += len; boff += len; } /* * Fire off all sub-requests. We get the correct consumer (== drive) * to send each request to via the subdisk that was stored in * cbp->bio_caller1. */ cbp = bioq_takefirst(p->bqueue); while (cbp != NULL) { /* * RAID5 sub-requests need to come in correct order, otherwise * we trip over the parity, as it might be overwritten by * another sub-request. We abuse cbp->bio_caller2 to mark * potential overlap situations. */ if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) { /* Park the bio on the waiting queue. */ cbp->bio_pflags |= GV_BIO_ONHOLD; bioq_disksort(p->wqueue, cbp); } else { s = cbp->bio_caller1; g_io_request(cbp, s->drive_sc->consumer); } cbp = bioq_takefirst(p->bqueue); } } static int gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, off_t *real_len, int *sdno, int growing) { struct gv_sd *s; int i, sdcount; off_t len_left, stripeend, stripeno, stripestart; switch (p->org) { case GV_PLEX_CONCAT: /* * Find the subdisk where this request starts. The subdisks in * this list must be ordered by plex_offset. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->plex_offset <= boff && s->plex_offset + s->size > boff) { *sdno = i; break; } i++; } if (s == NULL || s->drive_sc == NULL) return (GV_ERR_NOTFOUND); /* Calculate corresponding offsets on disk. */ *real_off = boff - s->plex_offset; len_left = s->size - (*real_off); KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); *real_len = (bcount > len_left) ? len_left : bcount; break; case GV_PLEX_STRIPED: /* The number of the stripe where the request starts. */ stripeno = boff / p->stripesize; KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0")); /* Take growing subdisks into account when calculating. 
*/ sdcount = gv_sdcount(p, (boff >= p->synced)); if (!(boff + bcount <= p->synced) && (p->flags & GV_PLEX_GROWING) && !growing) return (GV_ERR_ISBUSY); *sdno = stripeno % sdcount; KASSERT(sdno >= 0, ("gv_plex_offset: sdno < 0")); stripestart = (stripeno / sdcount) * p->stripesize; KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0")); stripeend = stripestart + p->stripesize; *real_off = boff - (stripeno * p->stripesize) + stripestart; len_left = stripeend - *real_off; KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0")); *real_len = (bcount <= len_left) ? bcount : len_left; break; default: return (GV_ERR_PLEXORG); } return (0); } /* * Prepare a normal plex request. */ static int gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff, off_t bcount, caddr_t addr) { struct gv_sd *s; struct bio *cbp; off_t real_len, real_off; int i, err, sdno; s = NULL; sdno = -1; real_len = real_off = 0; err = ENXIO; if (p == NULL || LIST_EMPTY(&p->subdisks)) goto bad; err = gv_plex_offset(p, boff, bcount, &real_off, &real_len, &sdno, (bp->bio_pflags & GV_BIO_GROW)); /* If the request was blocked, put it into wait. */ if (err == GV_ERR_ISBUSY) { bioq_disksort(p->rqueue, bp); return (-1); /* "Fail", and delay request. */ } if (err) { err = ENXIO; goto bad; } err = ENXIO; /* Find the right subdisk. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == sdno) break; i++; } /* Subdisk not found. */ if (s == NULL || s->drive_sc == NULL) goto bad; /* Now check if we can handle the request on this subdisk. */ switch (s->state) { case GV_SD_UP: /* If the subdisk is up, just continue. */ break; case GV_SD_DOWN: if (bp->bio_pflags & GV_BIO_INTERNAL) G_VINUM_DEBUG(0, "subdisk must be in the stale state in" " order to perform administrative requests"); goto bad; case GV_SD_STALE: if (!(bp->bio_pflags & GV_BIO_SYNCREQ)) { G_VINUM_DEBUG(0, "subdisk stale, unable to perform " "regular requests"); goto bad; } G_VINUM_DEBUG(1, "sd %s is initializing", s->name); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); break; case GV_SD_INITIALIZING: if (bp->bio_cmd == BIO_READ) goto bad; break; default: /* All other subdisk states mean it's not accessible. */ goto bad; } /* Clone the bio and adjust the offsets and sizes. */ cbp = g_clone_bio(bp); if (cbp == NULL) { err = ENOMEM; goto bad; } cbp->bio_offset = real_off + s->drive_offset; cbp->bio_length = real_len; cbp->bio_data = addr; cbp->bio_done = gv_done; cbp->bio_caller1 = s; s->drive_sc->active++; /* Store the sub-requests now and let others issue them. */ bioq_insert_tail(p->bqueue, cbp); return (real_len); bad: G_VINUM_LOGREQ(0, bp, "plex request failed."); /* Building the sub-request failed. If internal BIO, do not deliver. */ if (bp->bio_pflags & GV_BIO_INTERNAL) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | GV_PLEX_GROWING); return (-1); } g_io_deliver(bp, err); return (-1); } /* * Handle a completed request to a striped or concatenated plex. */ void gv_plex_normal_done(struct gv_plex *p, struct bio *bp) { struct bio *pbp; pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; g_destroy_bio(bp); pbp->bio_inbed++; if (pbp->bio_children == pbp->bio_inbed) { /* Just set it to length since multiple plexes will * screw things up. 
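* (Every plex of the volume completes the same byte range, so summing * up bio_completed from the individual sub-requests would count the * data more than once.)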
*/ pbp->bio_completed = pbp->bio_length; if (pbp->bio_pflags & GV_BIO_SYNCREQ) gv_sync_complete(p, pbp); else if (pbp->bio_pflags & GV_BIO_GROW) gv_grow_complete(p, pbp); else g_io_deliver(pbp, pbp->bio_error); } } /* * Handle a completed request to a RAID-5 plex. */ void gv_plex_raid5_done(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct bio *cbp, *pbp; struct gv_bioq *bq, *bq2; struct gv_raid5_packet *wp; off_t completed; int i; completed = 0; sc = p->vinumconf; wp = bp->bio_caller2; switch (bp->bio_parent->bio_cmd) { case BIO_READ: if (wp == NULL) { completed = bp->bio_completed; break; } TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { if (bq->bp != bp) continue; TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); for (i = 0; i < wp->length; i++) wp->data[i] ^= bp->bio_data[i]; break; } if (TAILQ_EMPTY(&wp->bits)) { completed = wp->length; if (wp->lockbase != -1) { TAILQ_REMOVE(&p->packets, wp, list); /* Bring the waiting bios back into the game. */ pbp = bioq_takefirst(p->wqueue); while (pbp != NULL) { gv_post_bio(sc, pbp); pbp = bioq_takefirst(p->wqueue); } } g_free(wp); } break; case BIO_WRITE: /* XXX can this ever happen? */ if (wp == NULL) { completed = bp->bio_completed; break; } /* Check if we need to handle parity data. */ TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { if (bq->bp != bp) continue; TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); cbp = wp->parity; if (cbp != NULL) { for (i = 0; i < wp->length; i++) cbp->bio_data[i] ^= bp->bio_data[i]; } break; } /* Handle parity data. */ if (TAILQ_EMPTY(&wp->bits)) { if (bp->bio_parent->bio_pflags & GV_BIO_CHECK) i = gv_check_parity(p, bp, wp); else i = gv_normal_parity(p, bp, wp); /* All of our sub-requests have finished. */ if (i) { completed = wp->length; TAILQ_REMOVE(&p->packets, wp, list); /* Bring the waiting bios back into the game. */ pbp = bioq_takefirst(p->wqueue); while (pbp != NULL) { gv_post_bio(sc, pbp); pbp = bioq_takefirst(p->wqueue); } g_free(wp); } } break; } pbp = bp->bio_parent; if (pbp->bio_error == 0) pbp->bio_error = bp->bio_error; pbp->bio_completed += completed; /* When the original request is finished, we deliver it. */ pbp->bio_inbed++; if (pbp->bio_inbed == pbp->bio_children) { /* Hand it over for checking or delivery. */ if (pbp->bio_cmd == BIO_WRITE && (pbp->bio_pflags & GV_BIO_CHECK)) { gv_parity_complete(p, pbp); } else if (pbp->bio_cmd == BIO_WRITE && (pbp->bio_pflags & GV_BIO_REBUILD)) { gv_rebuild_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_INIT) { gv_init_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_SYNCREQ) { gv_sync_complete(p, pbp); } else if (pbp->bio_pflags & GV_BIO_GROW) { gv_grow_complete(p, pbp); } else { g_io_deliver(pbp, pbp->bio_error); } } /* Clean up what we allocated. */ if (bp->bio_cflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); } static int gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp) { struct bio *pbp; struct gv_sd *s; int err, finished, i; err = 0; finished = 1; if (wp->waiting != NULL) { pbp = wp->waiting; wp->waiting = NULL; s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } else if (wp->parity != NULL) { pbp = wp->parity; wp->parity = NULL; /* Check if the parity is correct. */ for (i = 0; i < wp->length; i++) { if (bp->bio_data[i] != pbp->bio_data[i]) { err = 1; break; } } /* The parity is not correct... */ if (err) { bp->bio_parent->bio_error = EAGAIN; /* ... but we rebuild it. 
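* (The EAGAIN on the parent is picked up by gv_parity_complete(), * which treats it as a parity mismatch rather than a fatal error as * long as GV_BIO_PARITY is set.)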
*/ if (bp->bio_parent->bio_pflags & GV_BIO_PARITY) { s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } } /* * Clean up the BIO we would have used for rebuilding the * parity. */ if (finished) { bp->bio_parent->bio_inbed++; g_destroy_bio(pbp); } } return (finished); } static int gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp) { struct bio *cbp, *pbp; struct gv_sd *s; int finished, i; finished = 1; if (wp->waiting != NULL) { pbp = wp->waiting; wp->waiting = NULL; cbp = wp->parity; for (i = 0; i < wp->length; i++) cbp->bio_data[i] ^= pbp->bio_data[i]; s = pbp->bio_caller1; g_io_request(pbp, s->drive_sc->consumer); finished = 0; } else if (wp->parity != NULL) { cbp = wp->parity; wp->parity = NULL; s = cbp->bio_caller1; g_io_request(cbp, s->drive_sc->consumer); finished = 0; } return (finished); } /* Flush the queue with delayed requests. */ static void gv_plex_flush(struct gv_plex *p) { - struct gv_softc *sc; struct bio *bp; - sc = p->vinumconf; bp = bioq_takefirst(p->rqueue); while (bp != NULL) { gv_plex_start(p, bp); bp = bioq_takefirst(p->rqueue); } } static void gv_post_bio(struct gv_softc *sc, struct bio *bp) { KASSERT(sc != NULL, ("NULL sc")); KASSERT(bp != NULL, ("NULL bp")); mtx_lock(&sc->bqueue_mtx); bioq_disksort(sc->bqueue_down, bp); wakeup(sc); mtx_unlock(&sc->bqueue_mtx); } int gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset, off_t length, int type, caddr_t data) { struct gv_softc *sc; struct bio *bp; KASSERT(from != NULL, ("NULL from")); KASSERT(to != NULL, ("NULL to")); sc = from->vinumconf; KASSERT(sc != NULL, ("NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "sync from '%s' failed at offset " " %jd; out of memory", from->name, offset); return (ENOMEM); } bp->bio_length = length; bp->bio_done = NULL; bp->bio_pflags |= GV_BIO_SYNCREQ; bp->bio_offset = offset; bp->bio_caller1 = from; bp->bio_caller2 = to; bp->bio_cmd = type; if (data == NULL) data = g_malloc(length, M_WAITOK); bp->bio_pflags |= GV_BIO_MALLOC; /* Free on the next run. */ bp->bio_data = data; /* Send down next. */ gv_post_bio(sc, bp); //gv_plex_start(from, bp); return (0); } /* * Handle a finished plex sync bio. */ int gv_sync_complete(struct gv_plex *to, struct bio *bp) { struct gv_plex *from, *p; struct gv_sd *s; struct gv_volume *v; struct gv_softc *sc; off_t offset; int err; g_topology_assert_not(); err = 0; KASSERT(to != NULL, ("NULL to")); KASSERT(bp != NULL, ("NULL bp")); from = bp->bio_caller2; KASSERT(from != NULL, ("NULL from")); v = to->vol_sc; KASSERT(v != NULL, ("NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("NULL sc")); /* If it was a read, write it. */ if (bp->bio_cmd == BIO_READ) { err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length, BIO_WRITE, bp->bio_data); /* If it was a write, read the next one. */ } else if (bp->bio_cmd == BIO_WRITE) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); to->synced += bp->bio_length; /* If we're finished, clean up. */ if (bp->bio_offset + bp->bio_length >= from->size) { G_VINUM_DEBUG(1, "syncing of %s from %s completed", to->name, from->name); /* Update our state. 
*/ LIST_FOREACH(s, &to->subdisks, in_plex) gv_set_sd_state(s, GV_SD_UP, 0); gv_update_plex_state(to); to->flags &= ~GV_PLEX_SYNCING; to->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); } else { offset = bp->bio_offset + bp->bio_length; err = gv_sync_request(from, to, offset, MIN(bp->bio_length, from->size - offset), BIO_READ, NULL); } } g_destroy_bio(bp); /* Clean up if there was an error. */ if (err) { to->flags &= ~GV_PLEX_SYNCING; G_VINUM_DEBUG(0, "error syncing plexes: error code %d", err); } /* Check if all plexes are synced, and lower refcounts. */ g_topology_lock(); LIST_FOREACH(p, &v->plexes, in_volume) { if (p->flags & GV_PLEX_SYNCING) { g_topology_unlock(); return (-1); } } /* If we came here, all plexes are synced, and we're free. */ gv_access(v->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(1, "plex sync completed"); gv_volume_flush(v); return (0); } /* * Create a new bio struct for the next grow request. */ int gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type, caddr_t data) { struct gv_softc *sc; struct bio *bp; KASSERT(p != NULL, ("gv_grow_request: NULL p")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_grow_request: NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "grow of %s failed creating bio: " "out of memory", p->name); return (ENOMEM); } bp->bio_cmd = type; bp->bio_done = NULL; bp->bio_error = 0; bp->bio_caller1 = p; bp->bio_offset = offset; bp->bio_length = length; bp->bio_pflags |= GV_BIO_GROW; if (data == NULL) data = g_malloc(length, M_WAITOK); bp->bio_pflags |= GV_BIO_MALLOC; bp->bio_data = data; gv_post_bio(sc, bp); //gv_plex_start(p, bp); return (0); } /* * Finish handling of a bio to a growing plex. */ void gv_grow_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_sd *s; struct gv_volume *v; off_t origsize, offset; int sdcount, err; v = p->vol_sc; KASSERT(v != NULL, ("gv_grow_complete: NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("gv_grow_complete: NULL sc")); err = 0; /* If it was a read, write it. */ if (bp->bio_cmd == BIO_READ) { p->synced += bp->bio_length; err = gv_grow_request(p, bp->bio_offset, bp->bio_length, BIO_WRITE, bp->bio_data); /* If it was a write, read next. */ } else if (bp->bio_cmd == BIO_WRITE) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); /* Find the real size of the plex. */ sdcount = gv_sdcount(p, 1); s = LIST_FIRST(&p->subdisks); KASSERT(s != NULL, ("NULL s")); origsize = (s->size * (sdcount - 1)); if (bp->bio_offset + bp->bio_length >= origsize) { G_VINUM_DEBUG(1, "growing of %s completed", p->name); p->flags &= ~GV_PLEX_GROWING; LIST_FOREACH(s, &p->subdisks, in_plex) { s->flags &= ~GV_SD_GROW; gv_set_sd_state(s, GV_SD_UP, 0); } p->size = gv_plex_size(p); gv_update_vol_size(v, gv_vol_size(v)); gv_set_plex_state(p, GV_PLEX_UP, 0); g_topology_lock(); gv_access(v->provider, -1, -1, 0); g_topology_unlock(); p->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); /* Issue delayed requests. */ gv_plex_flush(p); } else { offset = bp->bio_offset + bp->bio_length; err = gv_grow_request(p, offset, MIN(bp->bio_length, origsize - offset), BIO_READ, NULL); } } g_destroy_bio(bp); if (err) { p->flags &= ~GV_PLEX_GROWING; G_VINUM_DEBUG(0, "error growing plex: error code %d", err); } } /* * Create an initialization BIO and send it off to the consumer. Assume that * we're given initialization data as parameter. 
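* The loop lives in gv_init_complete(): every finished write advances * s->initialized and the start offset by one chunk and issues the next * write, until the end of the subdisk is reached.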
*/ void gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length) { struct gv_drive *d; struct g_consumer *cp; struct bio *bp, *cbp; KASSERT(s != NULL, ("gv_init_request: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_init_request: NULL d")); cp = d->consumer; KASSERT(cp != NULL, ("gv_init_request: NULL cp")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd" " (drive offset %jd); out of memory", s->name, (intmax_t)s->initialized, (intmax_t)start); return; /* XXX: Error codes. */ } bp->bio_cmd = BIO_WRITE; bp->bio_data = data; bp->bio_done = NULL; bp->bio_error = 0; bp->bio_length = length; bp->bio_pflags |= GV_BIO_INIT; bp->bio_offset = start; bp->bio_caller1 = s; /* Then, of course, we have to clone it. */ cbp = g_clone_bio(bp); if (cbp == NULL) { G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd" " (drive offset %jd); out of memory", s->name, (intmax_t)s->initialized, (intmax_t)start); return; /* XXX: Error codes. */ } cbp->bio_done = gv_done; cbp->bio_caller1 = s; d->active++; /* Send it off to the consumer. */ g_io_request(cbp, cp); } /* * Handle a finished initialization BIO. */ void gv_init_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_drive *d; struct g_consumer *cp; struct gv_sd *s; off_t start, length; caddr_t data; int error; s = bp->bio_caller1; start = bp->bio_offset; length = bp->bio_length; error = bp->bio_error; data = bp->bio_data; KASSERT(s != NULL, ("gv_init_complete: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_init_complete: NULL d")); cp = d->consumer; KASSERT(cp != NULL, ("gv_init_complete: NULL cp")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_init_complete: NULL sc")); g_destroy_bio(bp); /* * First we need to find out if it was okay, and abort if it's not. * Then we need to free previous buffers, find out the correct subdisk, * as well as get the correct starting point and length of the BIO. */ if (start >= s->drive_offset + s->size) { /* Free the data we initialized. */ if (data != NULL) g_free(data); g_topology_assert_not(); g_topology_lock(); g_access(cp, 0, -1, 0); g_topology_unlock(); if (error) { gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); } else { gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG); s->initialized = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); G_VINUM_DEBUG(1, "subdisk '%s' init: finished " "successfully", s->name); } return; } s->initialized += length; start += length; gv_init_request(s, start, data, length); } /* * Create a new bio struct for the next parity rebuild. Used both by internal * rebuild of degraded plexes as well as user-initiated rebuilds/checks. */ void gv_parity_request(struct gv_plex *p, int flags, off_t offset) { struct gv_softc *sc; struct bio *bp; KASSERT(p != NULL, ("gv_parity_request: NULL p")); sc = p->vinumconf; KASSERT(sc != NULL, ("gv_parity_request: NULL sc")); bp = g_new_bio(); if (bp == NULL) { G_VINUM_DEBUG(0, "rebuild of %s failed creating bio: " "out of memory", p->name); return; } bp->bio_cmd = BIO_WRITE; bp->bio_done = NULL; bp->bio_error = 0; bp->bio_length = p->stripesize; bp->bio_caller1 = p; /* * Check if it's a rebuild of a degraded plex or a user request of * parity rebuild.
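*/

/*
 * The initialization path above is the same request/complete ping-pong in
 * its simplest form: each finished write advances the cursor by the request
 * length and re-issues, until the cursor reaches drive_offset + size.  A
 * sketch of the resulting window count (illustrative names only):
 */
#include <stdint.h>

static int64_t
init_window_count(int64_t drive_offset, int64_t sd_size, int64_t chunk)
{
	int64_t start, nwrites = 0;

	for (start = drive_offset; start < drive_offset + sd_size;
	    start += chunk)
		nwrites++;		/* one BIO_WRITE per window */
	return (nwrites);
}

/*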
*/ if (flags & GV_BIO_REBUILD) bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK); else if (flags & GV_BIO_CHECK) bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO); else { G_VINUM_DEBUG(0, "invalid flags given in rebuild"); return; } bp->bio_pflags = flags; bp->bio_pflags |= GV_BIO_MALLOC; /* We still have more parity to build. */ bp->bio_offset = offset; gv_post_bio(sc, bp); //gv_plex_start(p, bp); /* Send it down to the plex. */ } /* * Handle a finished parity write. */ void gv_parity_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; int error, flags; error = bp->bio_error; flags = bp->bio_pflags; flags &= ~GV_BIO_MALLOC; sc = p->vinumconf; KASSERT(sc != NULL, ("gv_parity_complete: NULL sc")); /* Clean up what we allocated. */ if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); if (error == EAGAIN) { G_VINUM_DEBUG(0, "parity incorrect at offset 0x%jx", (intmax_t)p->synced); } /* Any error is fatal, except EAGAIN when we're rebuilding. */ if (error && !(error == EAGAIN && (flags & GV_BIO_PARITY))) { /* Make sure we don't have the lock. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(0, "parity check on %s failed at 0x%jx " "errno %d", p->name, (intmax_t)p->synced, error); return; } else { p->synced += p->stripesize; } if (p->synced >= p->size) { /* Make sure we don't have the lock. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); /* We're finished. */ G_VINUM_DEBUG(1, "parity operation on %s finished", p->name); p->synced = 0; gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); return; } /* Send down next. It will itself determine whether we need to continue. */ gv_parity_request(p, flags, p->synced); } /* * Handle a finished plex rebuild bio. */ void gv_rebuild_complete(struct gv_plex *p, struct bio *bp) { struct gv_softc *sc; struct gv_sd *s; int error, flags; off_t offset; error = bp->bio_error; flags = bp->bio_pflags; offset = bp->bio_offset; flags &= ~GV_BIO_MALLOC; sc = p->vinumconf; KASSERT(sc != NULL, ("gv_rebuild_complete: NULL sc")); /* Clean up what we allocated. */ if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); if (error) { g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(0, "rebuild of %s failed at offset %jd errno: %d", p->name, (intmax_t)offset, error); p->flags &= ~GV_PLEX_REBUILDING; p->synced = 0; gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ return; } offset += (p->stripesize * (gv_sdcount(p, 1) - 1)); if (offset >= p->size) { /* We're finished. */ g_topology_assert_not(); g_topology_lock(); gv_access(p->vol_sc->provider, -1, -1, 0); g_topology_unlock(); G_VINUM_DEBUG(1, "rebuild of %s finished", p->name); gv_save_config(p->vinumconf); p->flags &= ~GV_PLEX_REBUILDING; p->synced = 0; /* Try to up all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) gv_update_sd_state(s); gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0); gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */ return; } /* Send down next. It will itself determine whether we need to continue.
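*/

/*
 * Stride arithmetic from the two completion handlers above, side by side: a
 * parity check/rebuild request covers one stripe of parity, so
 * gv_parity_complete() advances by stripesize, while gv_rebuild_complete()
 * advances by a whole stripe of payload, stripesize * (sdcount - 1).  A
 * compilable model with assumed figures:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t stripesize = 256 * 1024;	/* assumed */
	int64_t plexsize = 4LL * 1024 * 1024;	/* assumed */
	int sdcount = 4;
	int64_t off;
	int nreq = 0;

	/* One request per payload stripe, as in gv_rebuild_complete(). */
	for (off = 0; off < plexsize; off += stripesize * (sdcount - 1))
		nreq++;
	printf("%d rebuild requests\n", nreq);
	return (0);
}

/*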
*/ gv_parity_request(p, flags, offset); } diff --git a/sys/geom/vinum/geom_vinum_raid5.c b/sys/geom/vinum/geom_vinum_raid5.c index 2778646467e2..43a6a8bce330 100644 --- a/sys/geom/vinum/geom_vinum_raid5.c +++ b/sys/geom/vinum/geom_vinum_raid5.c @@ -1,668 +1,665 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include static int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, int *, int *, int); static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, struct gv_raid5_packet *, caddr_t, int); static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t, int *); static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t); static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, struct bio *, caddr_t, off_t, off_t); struct gv_raid5_packet * gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct bio *cbp; struct gv_raid5_packet *wp, *wp2; struct gv_bioq *bq, *bq2; int err, delay; delay = 0; wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); wp->bio = bp; wp->waiting = NULL; wp->parity = NULL; TAILQ_INIT(&wp->bits); if (bp->bio_pflags & GV_BIO_REBUILD) err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); else if (bp->bio_pflags & GV_BIO_CHECK) err = gv_raid5_check(p, wp, bp, addr, boff, bcount); else err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); /* Means we have a delayed request. */ if (delay) { g_free(wp); return (NULL); } /* * Building the sub-request failed, we probably need to clean up a lot. 
*/ if (err) { G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); } if (wp->waiting != NULL) { if (wp->waiting->bio_cflags & GV_BIO_MALLOC) g_free(wp->waiting->bio_data); gv_drive_done(wp->waiting->bio_caller1); g_destroy_bio(wp->waiting); } if (wp->parity != NULL) { if (wp->parity->bio_cflags & GV_BIO_MALLOC) g_free(wp->parity->bio_data); gv_drive_done(wp->parity->bio_caller1); g_destroy_bio(wp->parity); } g_free(wp); TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { if (wp->bio != bp) continue; TAILQ_REMOVE(&p->packets, wp, list); TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { TAILQ_REMOVE(&wp->bits, bq, queue); g_free(bq); } g_free(wp); } cbp = bioq_takefirst(p->bqueue); while (cbp != NULL) { if (cbp->bio_cflags & GV_BIO_MALLOC) g_free(cbp->bio_data); gv_drive_done(cbp->bio_caller1); g_destroy_bio(cbp); cbp = bioq_takefirst(p->bqueue); } /* If internal, stop and reset state. */ if (bp->bio_pflags & GV_BIO_INTERNAL) { if (bp->bio_pflags & GV_BIO_MALLOC) g_free(bp->bio_data); g_destroy_bio(bp); /* Reset flags. */ p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | GV_PLEX_GROWING); return (NULL); } g_io_deliver(bp, err); return (NULL); } return (wp); } /* * Check if the stripe that the work packet wants is already being used by * some other work packet. */ int gv_stripe_active(struct gv_plex *p, struct bio *bp) { struct gv_raid5_packet *wp, *owp; int overlap; wp = bp->bio_caller2; if (wp->lockbase == -1) return (0); overlap = 0; TAILQ_FOREACH(owp, &p->packets, list) { if (owp == wp) break; if ((wp->lockbase >= owp->lockbase) && (wp->lockbase <= owp->lockbase + owp->length)) { overlap++; break; } if ((wp->lockbase <= owp->lockbase) && (wp->lockbase + wp->length >= owp->lockbase)) { overlap++; break; } } return (overlap); } static int gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *parity, *s; struct gv_bioq *bq; struct bio *cbp; int i, psdno; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); /* Find the right subdisk. */ parity = NULL; i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == psdno) { parity = s; break; } i++; } /* Parity stripe not found. */ if (parity == NULL) return (ENXIO); if (parity->state != GV_SD_UP) return (ENXIO); wp->length = real_len; wp->data = addr; wp->lockbase = real_off; /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the parity subdisk. */ if (s == parity) continue; /* Skip growing subdisks. */ if (s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Read the parity data. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; wp->waiting = cbp; /* * In case we want to rebuild the parity, create an extra BIO to write * it out. It also acts as buffer for the XOR operations. */ cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); if (cbp == NULL) return (ENOMEM); wp->parity = cbp; return (0); } /* Rebuild a degraded RAID5 plex. 
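*/

/*
 * gv_stripe_active() above is a closed-interval overlap test on
 * [lockbase, lockbase + length]: a new packet collides with an older one if
 * its start lies inside the older range, or if it starts earlier and still
 * reaches the older packet's start.  The predicate on its own:
 */
#include <stdint.h>

static int
stripe_ranges_collide(int64_t nbase, int64_t nlen, int64_t obase, int64_t olen)
{
	if (nbase >= obase && nbase <= obase + olen)
		return (1);	/* new start inside the old range */
	if (nbase <= obase && nbase + nlen >= obase)
		return (1);	/* new range reaches the old start */
	return (0);
}

/*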
*/ static int gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct gv_sd *broken, *s; struct gv_bioq *bq; struct bio *cbp; off_t real_len, real_off; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); /* Find the right subdisk. */ broken = NULL; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state != GV_SD_UP) broken = s; } /* Broken stripe not found. */ if (broken == NULL) return (ENXIO); switch (broken->state) { case GV_SD_UP: return (EINVAL); case GV_SD_STALE: if (!(bp->bio_pflags & GV_BIO_REBUILD)) return (ENXIO); G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); /* Set this bit now, but should be set at end. */ broken->flags |= GV_SD_CANGOUP; break; case GV_SD_REVIVING: break; default: /* All other subdisk states mean it's not accessible. */ return (ENXIO); } wp->length = real_len; wp->data = addr; wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; /* Skip growing subdisks. */ if (s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Write the parity data. */ cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); wp->parity = cbp; p->synced = boff; /* Post notification that we're finished. */ return (0); } /* Build a request group to perform (part of) a RAID5 request. */ static int gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) { - struct g_geom *gp; struct gv_sd *broken, *original, *parity, *s; struct gv_bioq *bq; struct bio *cbp; int i, psdno, sdno, type, grow; off_t real_len, real_off; - gp = bp->bio_to->geom; - if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); /* We are optimistic and assume that this request will be OK. */ #define REQ_TYPE_NORMAL 0 #define REQ_TYPE_DEGRADED 1 #define REQ_TYPE_NOPARITY 2 type = REQ_TYPE_NORMAL; original = parity = broken = NULL; /* XXX: The resize won't crash with rebuild or sync, but we should still * be aware of it. Also this should perhaps be done on rebuild/check as * well? */ /* If we're over, we must use the old. */ if (boff >= p->synced) { grow = 1; /* Or if over the resized offset, we use all drives. */ } else if (boff + bcount <= p->synced) { grow = 0; /* Else, we're in the middle, and must wait a bit. */ } else { bioq_disksort(p->rqueue, bp); *delay = 1; return (0); } gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno, grow); /* Find the right subdisks. */ i = 0; LIST_FOREACH(s, &p->subdisks, in_plex) { if (i == sdno) original = s; if (i == psdno) parity = s; if (s->state != GV_SD_UP) broken = s; i++; } if ((original == NULL) || (parity == NULL)) return (ENXIO); /* Our data stripe is missing. */ if (original->state != GV_SD_UP) type = REQ_TYPE_DEGRADED; /* If synchronizing request, just write it if disks are stale. */ if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { type = REQ_TYPE_NORMAL; /* Our parity stripe is missing. 
*/ } else if (parity->state != GV_SD_UP) { /* We cannot take another failure if we're already degraded. */ if (type != REQ_TYPE_NORMAL) return (ENXIO); else type = REQ_TYPE_NOPARITY; } wp->length = real_len; wp->data = addr; wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) type = REQ_TYPE_NORMAL; if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { bioq_disksort(p->rqueue, bp); *delay = 1; return (0); } switch (bp->bio_cmd) { case BIO_READ: /* * For a degraded read we need to read in all stripes except * the broken one plus the parity stripe and then recalculate * the desired data. */ if (type == REQ_TYPE_DEGRADED) { bzero(wp->data, wp->length); LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; /* Skip growing if within offset. */ if (grow && s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* A normal read can be fulfilled with the original subdisk. */ } else { cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); } wp->lockbase = -1; break; case BIO_WRITE: /* * A degraded write means we cannot write to the original data * subdisk. Thus we need to read in all valid stripes, * recalculate the parity from the original data, and then * write the parity stripe back out. */ if (type == REQ_TYPE_DEGRADED) { /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken and the parity subdisk. */ if ((s == broken) || (s == parity)) continue; /* Skip growing if within offset. */ if (grow && s->flags & GV_SD_GROW) continue; cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* Write the parity data. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); bcopy(addr, cbp->bio_data, wp->length); wp->parity = cbp; /* * When the parity stripe is missing we just write out the data. */ } else if (type == REQ_TYPE_NOPARITY) { cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* * A normal write request goes to the original subdisk, then we * read in all other stripes, recalculate the parity and write * out the parity again. */ } else { /* Read old parity. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Read old data. */ cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); cbp->bio_cmd = BIO_READ; bioq_insert_tail(p->bqueue, cbp); bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); bq->bp = cbp; TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* Write new data. 
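*/

/*
 * Both the degraded read above and the parity paths rest on the RAID5
 * identity: XOR-ing all surviving data stripes with the parity stripe
 * reproduces the missing stripe.  The destination is zeroed first (the
 * bzero() above), then every completed read is folded in, just like the
 * byte-wise loop in gv_normal_parity() earlier.  Standalone form:
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
raid5_reconstruct(uint8_t *dst, const uint8_t *const *stripes, int nstripes,
    size_t len)
{
	size_t i;
	int n;

	memset(dst, 0, len);		/* like the bzero() above */
	for (n = 0; n < nstripes; n++)
		for (i = 0; i < len; i++)
			dst[i] ^= stripes[n][i];
}

/*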
*/ cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); if (cbp == NULL) return (ENOMEM); /* * We must not write the new data until the old data * was read, so hold this BIO back until we're ready * for it. */ wp->waiting = cbp; /* The final bio for the parity. */ cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); if (cbp == NULL) return (ENOMEM); /* Remember that this is the BIO for the parity data. */ wp->parity = cbp; } break; default: return (EINVAL); } return (0); } /* * Calculate the offsets in the various subdisks for a RAID5 request. Also take * care of new subdisks in an expanded RAID5 array. * XXX: This assumes that the new subdisks are inserted after the others (which * is okay as long as plex_offset is larger). If subdisks are inserted into the * plexlist before, we get problems. */ static int gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, off_t *real_len, int *sdno, int *psdno, int growing) { struct gv_sd *s; int sd, psd, sdcount; off_t len_left, stripeend, stripeoff, stripestart; sdcount = p->sdcount; if (growing) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } } /* The number of the subdisk containing the parity stripe. */ psd = sdcount - 1 - (boff / (p->stripesize * (sdcount - 1))) % sdcount; KASSERT(psd >= 0, ("gv_raid5_offset: psd < 0")); /* Offset of the start address from the start of the stripe. */ stripeoff = boff % (p->stripesize * (sdcount - 1)); KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); /* The number of the subdisk where the stripe resides. */ sd = stripeoff / p->stripesize; KASSERT(sd >= 0, ("gv_raid5_offset: sd < 0")); /* At or past parity subdisk. */ if (sd >= psd) sd++; /* The offset of the stripe on this subdisk. */ stripestart = (boff - stripeoff) / (sdcount - 1); KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); stripeoff %= p->stripesize; /* The offset of the request on this subdisk. */ *real_off = stripestart + stripeoff; stripeend = stripestart + p->stripesize; len_left = stripeend - *real_off; KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); *real_len = (bcount <= len_left) ? bcount : len_left; if (sdno != NULL) *sdno = sd; if (psdno != NULL) *psdno = psd; return (0); } static struct bio * gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, caddr_t addr, int use_wp) { struct bio *cbp; cbp = g_clone_bio(bp); if (cbp == NULL) return (NULL); if (addr == NULL) { cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); cbp->bio_cflags |= GV_BIO_MALLOC; } else cbp->bio_data = addr; cbp->bio_offset = wp->lockbase + s->drive_offset; cbp->bio_length = wp->length; cbp->bio_done = gv_done; cbp->bio_caller1 = s; s->drive_sc->active++; if (use_wp) cbp->bio_caller2 = wp; return (cbp); } diff --git a/sys/geom/vinum/geom_vinum_rename.c b/sys/geom/vinum/geom_vinum_rename.c index 688673268ef9..394f5afd0703 100644 --- a/sys/geom/vinum/geom_vinum_rename.c +++ b/sys/geom/vinum/geom_vinum_rename.c @@ -1,264 +1,264 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Chris Jones * All rights reserved. * * This software was developed for the FreeBSD Project by Chris Jones * thanks to the support of Google's Summer of Code program and * mentoring by Lukas Ertl. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include void gv_rename(struct g_geom *gp, struct gctl_req *req) { struct gv_softc *sc; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; char *newname, *object, *name; int *flags, type; sc = gp->softc; flags = gctl_get_paraml(req, "flags", sizeof(*flags)); if (flags == NULL) { gctl_error(req, "no flags given"); return; } newname = gctl_get_param(req, "newname", NULL); if (newname == NULL) { gctl_error(req, "no new name given"); return; } object = gctl_get_param(req, "object", NULL); if (object == NULL) { gctl_error(req, "no object given"); return; } type = gv_object_type(sc, object); switch (type) { case GV_TYPE_VOL: v = gv_find_vol(sc, object); if (v == NULL) { gctl_error(req, "unknown volume '%s'", object); return; } name = g_malloc(GV_MAXVOLNAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXVOLNAME); gv_post_event(sc, GV_EVENT_RENAME_VOL, v, name, *flags, 0); break; case GV_TYPE_PLEX: p = gv_find_plex(sc, object); if (p == NULL) { gctl_error(req, "unknown plex '%s'", object); return; } name = g_malloc(GV_MAXPLEXNAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXPLEXNAME); gv_post_event(sc, GV_EVENT_RENAME_PLEX, p, name, *flags, 0); break; case GV_TYPE_SD: s = gv_find_sd(sc, object); if (s == NULL) { gctl_error(req, "unknown subdisk '%s'", object); return; } name = g_malloc(GV_MAXSDNAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXSDNAME); gv_post_event(sc, GV_EVENT_RENAME_SD, s, name, *flags, 0); break; case GV_TYPE_DRIVE: d = gv_find_drive(sc, object); if (d == NULL) { gctl_error(req, "unknown drive '%s'", object); return; } name = g_malloc(GV_MAXDRIVENAME, M_WAITOK | M_ZERO); strlcpy(name, newname, GV_MAXDRIVENAME); gv_post_event(sc, GV_EVENT_RENAME_DRIVE, d, name, *flags, 0); break; default: gctl_error(req, "unknown object '%s'", object); return; } } int gv_rename_drive(struct gv_softc *sc, struct gv_drive *d, char *newname, int flags) { struct gv_sd *s; KASSERT(d != NULL, ("gv_rename_drive: NULL d")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "drive name '%s' already in use", newname); return (GV_ERR_NAMETAKEN); } strlcpy(d->name, newname, sizeof(d->name)); if (d->hdr != NULL) strlcpy(d->hdr->label.name, newname, sizeof(d->hdr->label.name)); LIST_FOREACH(s, &d->subdisks, from_drive) strlcpy(s->drive, d->name, sizeof(s->drive)); return (0); } int 
gv_rename_plex(struct gv_softc *sc, struct gv_plex *p, char *newname, int flags) { char newsd[GV_MAXSDNAME]; struct gv_sd *s; char *ptr; int err; KASSERT(p != NULL, ("gv_rename_plex: NULL p")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "plex name '%s' already in use", newname); return (GV_ERR_NAMETAKEN); } /* * Locate the plex number part of the plex names. * XXX: might be a good idea to sanitize input a bit more */ ptr = strrchr(newname, '.'); if (ptr == NULL) { G_VINUM_DEBUG(0, "proposed plex name '%s' is not a valid plex " "name", newname); return (GV_ERR_INVNAME); } strlcpy(p->name, newname, sizeof(p->name)); /* Fix up references and potentially rename subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { strlcpy(s->plex, p->name, sizeof(s->plex)); if (flags & GV_FLAG_R) { /* * Look for the two last dots in the string, and assume * that the old value was ok. */ ptr = strrchr(s->name, '.'); if (ptr == NULL) return (GV_ERR_INVNAME); ptr++; snprintf(newsd, sizeof(newsd), "%s.%s", p->name, ptr); err = gv_rename_sd(sc, s, newsd, flags); if (err) return (err); } } return (0); } /* * gv_rename_sd: renames a subdisk. Note that the 'flags' argument is ignored, * since there are no structures below a subdisk. Similarly, we don't have to * clean up any references elsewhere to the subdisk's name. */ int gv_rename_sd(struct gv_softc *sc, struct gv_sd *s, char *newname, int flags) { char *dot1, *dot2; KASSERT(s != NULL, ("gv_rename_sd: NULL s")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "subdisk name %s already in use", newname); return (GV_ERR_NAMETAKEN); } /* Locate the sd number part of the sd names. */ dot1 = strchr(newname, '.'); if (dot1 == NULL || (dot2 = strchr(dot1 + 1, '.')) == NULL) { G_VINUM_DEBUG(0, "proposed sd name '%s' is not a valid sd name", newname); return (GV_ERR_INVNAME); } strlcpy(s->name, newname, sizeof(s->name)); return (0); } int gv_rename_vol(struct gv_softc *sc, struct gv_volume *v, char *newname, int flags) { - struct g_provider *pp; + struct g_provider *pp __diagused; struct gv_plex *p; char newplex[GV_MAXPLEXNAME], *ptr; int err; KASSERT(v != NULL, ("gv_rename_vol: NULL v")); pp = v->provider; KASSERT(pp != NULL, ("gv_rename_vol: NULL pp")); if (gv_object_type(sc, newname) != GV_ERR_NOTFOUND) { G_VINUM_DEBUG(1, "volume name %s already in use", newname); return (GV_ERR_NAMETAKEN); } /* Rename the volume. */ strlcpy(v->name, newname, sizeof(v->name)); /* Fix up references and potentially rename plexes. */ LIST_FOREACH(p, &v->plexes, in_volume) { strlcpy(p->volume, v->name, sizeof(p->volume)); if (flags & GV_FLAG_R) { /* * Look for the last dot in the string, and assume that * the old value was ok. */ ptr = strrchr(p->name, '.'); ptr++; snprintf(newplex, sizeof(newplex), "%s.%s", v->name, ptr); err = gv_rename_plex(sc, p, newplex, flags); if (err) return (err); } } return (0); } diff --git a/sys/geom/vinum/geom_vinum_subr.c b/sys/geom/vinum/geom_vinum_subr.c index 13edf928b6ad..54dd6db95e5e 100644 --- a/sys/geom/vinum/geom_vinum_subr.c +++ b/sys/geom/vinum/geom_vinum_subr.c @@ -1,1281 +1,1281 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2004, 2007 Lukas Ertl * Copyright (c) 2007, 2009 Ulf Lilleengen * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. 
* * Parts written by Greg Lehey * * This software is distributed under the so-called ``Berkeley * License'': * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Nan Yang Computer * Services Limited. * 4. Neither the name of the Company nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided ``as is'', and any express or implied * warranties, including, but not limited to, the implied warranties of * merchantability and fitness for a particular purpose are disclaimed. * In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential * damages (including, but not limited to, procurement of substitute * goods or services; loss of use, data, or profits; or business * interruption) however caused and on any theory of liability, whether * in contract, strict liability, or tort (including negligence or * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include int gv_drive_is_newer(struct gv_softc *, struct gv_drive *); static off_t gv_plex_smallest_sd(struct gv_plex *); void gv_parse_config(struct gv_softc *sc, char *buf, struct gv_drive *d) { char *aptr, *bptr, *cptr; struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; int error, is_newer, tokens; char *token[GV_MAXARGS]; is_newer = gv_drive_is_newer(sc, d); /* Until the end of the string *buf. */ for (aptr = buf; *aptr != '\0'; aptr = bptr) { bptr = aptr; cptr = aptr; /* Separate input lines. */ while (*bptr != '\n') bptr++; *bptr = '\0'; bptr++; tokens = gv_tokenize(cptr, token, GV_MAXARGS); if (tokens <= 0) continue; if (!strcmp(token[0], "volume")) { v = gv_new_volume(tokens, token); if (v == NULL) { G_VINUM_DEBUG(0, "config parse failed volume"); break; } v2 = gv_find_vol(sc, v->name); if (v2 != NULL) { if (is_newer) { v2->state = v->state; G_VINUM_DEBUG(2, "newer volume found!"); } g_free(v); continue; } gv_create_volume(sc, v); } else if (!strcmp(token[0], "plex")) { p = gv_new_plex(tokens, token); if (p == NULL) { G_VINUM_DEBUG(0, "config parse failed plex"); break; } p2 = gv_find_plex(sc, p->name); if (p2 != NULL) { /* XXX */ if (is_newer) { p2->state = p->state; G_VINUM_DEBUG(2, "newer plex found!"); } g_free(p); continue; } error = gv_create_plex(sc, p); if (error) continue; /* * These flags were set in gv_create_plex() and are not * needed here (on-disk config parsing). 
*/ p->flags &= ~GV_PLEX_ADDED; } else if (!strcmp(token[0], "sd")) { s = gv_new_sd(tokens, token); if (s == NULL) { G_VINUM_DEBUG(0, "config parse failed subdisk"); break; } s2 = gv_find_sd(sc, s->name); if (s2 != NULL) { /* XXX */ if (is_newer) { s2->state = s->state; G_VINUM_DEBUG(2, "newer subdisk found!"); } g_free(s); continue; } /* * Signal that this subdisk was tasted, and could * possibly reference a drive that isn't in our config * yet. */ s->flags |= GV_SD_TASTED; if (s->state == GV_SD_UP) s->flags |= GV_SD_CANGOUP; error = gv_create_sd(sc, s); if (error) continue; /* * This flag was set in gv_create_sd() and is not * needed here (on-disk config parsing). */ s->flags &= ~GV_SD_NEWBORN; s->flags &= ~GV_SD_GROW; } } } /* * Format the vinum configuration properly. If ondisk is non-zero then the * configuration is intended to be written to disk later. */ void gv_format_config(struct gv_softc *sc, struct sbuf *sb, int ondisk, char *prefix) { struct gv_drive *d; struct gv_sd *s; struct gv_plex *p; struct gv_volume *v; /* * We don't need the drive configuration if we're not writing the * config to disk. */ if (!ondisk) { LIST_FOREACH(d, &sc->drives, drive) { sbuf_printf(sb, "%sdrive %s device /dev/%s\n", prefix, d->name, d->device); } } LIST_FOREACH(v, &sc->volumes, volume) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "volume %s", v->name); if (ondisk) sbuf_printf(sb, " state %s", gv_volstate(v->state)); sbuf_printf(sb, "\n"); } LIST_FOREACH(p, &sc->plexes, plex) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "plex name %s org %s ", p->name, gv_plexorg(p->org)); if (gv_is_striped(p)) sbuf_printf(sb, "%ds ", p->stripesize / 512); if (p->vol_sc != NULL) sbuf_printf(sb, "vol %s", p->volume); if (ondisk) sbuf_printf(sb, " state %s", gv_plexstate(p->state)); sbuf_printf(sb, "\n"); } LIST_FOREACH(s, &sc->subdisks, sd) { if (!ondisk) sbuf_printf(sb, "%s", prefix); sbuf_printf(sb, "sd name %s drive %s len %jds driveoffset " "%jds", s->name, s->drive, s->size / 512, s->drive_offset / 512); if (s->plex_sc != NULL) { sbuf_printf(sb, " plex %s plexoffset %jds", s->plex, s->plex_offset / 512); } if (ondisk) sbuf_printf(sb, " state %s", gv_sdstate(s->state)); sbuf_printf(sb, "\n"); } } static off_t gv_plex_smallest_sd(struct gv_plex *p) { struct gv_sd *s; off_t smallest; KASSERT(p != NULL, ("gv_plex_smallest_sd: NULL p")); s = LIST_FIRST(&p->subdisks); if (s == NULL) return (-1); smallest = s->size; LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->size < smallest) smallest = s->size; } return (smallest); } /* Walk over plexes in a volume and count how many are down. */ int gv_plexdown(struct gv_volume *v) { int plexdown; struct gv_plex *p; KASSERT(v != NULL, ("gv_plexdown: NULL v")); plexdown = 0; LIST_FOREACH(p, &v->plexes, plex) { if (p->state == GV_PLEX_DOWN) plexdown++; } return (plexdown); } int gv_sd_to_plex(struct gv_sd *s, struct gv_plex *p) { struct gv_sd *s2; off_t psizeorig, remainder, smallest; /* If this subdisk was already given to this plex, do nothing. */ if (s->plex_sc == p) return (0); /* Check correct size of this subdisk. */ s2 = LIST_FIRST(&p->subdisks); /* Adjust the subdisk-size if necessary. */ if (s2 != NULL && gv_is_striped(p)) { /* First adjust to the stripesize. 
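*/

/*
 * The adjustment that follows trims a subdisk so its size is an exact
 * multiple of the plex stripe size; the cut-off remainder is handed back to
 * the drive's free list via gv_adjust_freespace().  The arithmetic in
 * isolation:
 */
#include <stdint.h>

static int64_t
trim_to_stripesize(int64_t sd_size, int64_t stripesize)
{
	return (sd_size - sd_size % stripesize);	/* stripe-aligned size */
}

/*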
*/ remainder = s->size % p->stripesize; if (remainder) { G_VINUM_DEBUG(1, "size of sd %s is not a " "multiple of plex stripesize, taking off " "%jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } smallest = gv_plex_smallest_sd(p); /* Then take off extra if other subdisks are smaller. */ remainder = s->size - smallest; /* * Don't allow a remainder below zero for running plexes; it's too * painful, and if someone were to do this accidentally, the * resulting array might be smaller than the original... not good. */ if (remainder < 0) { if (!(p->flags & GV_PLEX_NEWBORN)) { G_VINUM_DEBUG(0, "sd %s too small for plex %s!", s->name, p->name); return (GV_ERR_BADSIZE); } /* Adjust other subdisks. */ LIST_FOREACH(s2, &p->subdisks, in_plex) { G_VINUM_DEBUG(1, "size of sd %s is too big, " "taking off %jd bytes", s2->name, (intmax_t)remainder); gv_adjust_freespace(s2, (remainder * -1)); } } else if (remainder > 0) { G_VINUM_DEBUG(1, "size of sd %s is too big, " "taking off %jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } } /* Find the correct plex offset for this subdisk, if needed. */ if (s->plex_offset == -1) { /* * First set it to 0 to catch the case where we had a detached * subdisk that didn't get any good offset. */ s->plex_offset = 0; if (p->sdcount) { LIST_FOREACH(s2, &p->subdisks, in_plex) { if (gv_is_striped(p)) s->plex_offset = p->sdcount * p->stripesize; else s->plex_offset = s2->plex_offset + s2->size; } } } /* There are no subdisks for this plex yet, just insert it. */ if (LIST_EMPTY(&p->subdisks)) { LIST_INSERT_HEAD(&p->subdisks, s, in_plex); /* Insert in correct order, depending on plex_offset. */ } else { LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s->plex_offset < s2->plex_offset) { LIST_INSERT_BEFORE(s2, s, in_plex); break; } else if (LIST_NEXT(s2, in_plex) == NULL) { LIST_INSERT_AFTER(s2, s, in_plex); break; } } } s->plex_sc = p; /* Adjust the size of our plex. We check whether the plex is missing a * subdisk, so we don't make the plex smaller than it actually should be. */ psizeorig = p->size; p->size = gv_plex_size(p); /* Make sure the size is not changed. */ if (p->sddetached > 0) { if (p->size < psizeorig) { p->size = psizeorig; /* We make sure we still expect another subdisk. */ if (p->sddetached == 1) p->sddetached++; } p->sddetached--; } else { if ((p->org == GV_PLEX_RAID5 || p->org == GV_PLEX_STRIPED) && !(p->flags & GV_PLEX_NEWBORN) && p->state == GV_PLEX_UP) { s->flags |= GV_SD_GROW; } p->sdcount++; } return (0); } void gv_update_vol_size(struct gv_volume *v, off_t size) { if (v == NULL) return; if (v->provider != NULL) { g_topology_lock(); v->provider->mediasize = size; g_topology_unlock(); } v->size = size; } /* Return how many subdisks constitute the original plex. */ int gv_sdcount(struct gv_plex *p, int growing) { struct gv_sd *s; int sdcount; sdcount = p->sdcount; if (growing) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) sdcount--; } } return (sdcount); } /* Calculates the plex size. */ off_t gv_plex_size(struct gv_plex *p) { struct gv_sd *s; off_t size; int sdcount; KASSERT(p != NULL, ("gv_plex_size: NULL p")); /* Adjust the size of our plex. */ size = 0; sdcount = gv_sdcount(p, 1); switch (p->org) { case GV_PLEX_CONCAT: LIST_FOREACH(s, &p->subdisks, in_plex) size += s->size; break; case GV_PLEX_STRIPED: s = LIST_FIRST(&p->subdisks); size = ((s != NULL) ? (sdcount * s->size) : 0); break; case GV_PLEX_RAID5: s = LIST_FIRST(&p->subdisks); size = ((s != NULL) ?
((sdcount - 1) * s->size) : 0); break; } return (size); } /* Returns the size of a volume. */ off_t gv_vol_size(struct gv_volume *v) { struct gv_plex *p; off_t minplexsize; KASSERT(v != NULL, ("gv_vol_size: NULL v")); p = LIST_FIRST(&v->plexes); if (p == NULL) return (0); minplexsize = p->size; LIST_FOREACH(p, &v->plexes, in_volume) { if (p->size < minplexsize) { minplexsize = p->size; } } return (minplexsize); } void gv_update_plex_config(struct gv_plex *p) { struct gv_sd *s, *s2; off_t remainder; int required_sds, state; KASSERT(p != NULL, ("gv_update_plex_config: NULL p")); /* The plex was added to an already running volume. */ if (p->flags & GV_PLEX_ADDED) gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); switch (p->org) { case GV_PLEX_STRIPED: required_sds = 2; break; case GV_PLEX_RAID5: required_sds = 3; break; case GV_PLEX_CONCAT: default: required_sds = 0; break; } if (required_sds) { if (p->sdcount < required_sds) { gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); } /* * The subdisks in striped plexes must all have the same size. */ s = LIST_FIRST(&p->subdisks); LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s->size != s2->size) { G_VINUM_DEBUG(0, "subdisk size mismatch %s" "(%jd) <> %s (%jd)", s->name, s->size, s2->name, s2->size); gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); } } LIST_FOREACH(s, &p->subdisks, in_plex) { /* Trim subdisk sizes to match the stripe size. */ remainder = s->size % p->stripesize; if (remainder) { G_VINUM_DEBUG(1, "size of sd %s is not a " "multiple of plex stripesize, taking off " "%jd bytes", s->name, (intmax_t)remainder); gv_adjust_freespace(s, remainder); } } } p->size = gv_plex_size(p); if (p->sdcount == 0) gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); else if (p->org == GV_PLEX_RAID5 && p->flags & GV_PLEX_NEWBORN) { LIST_FOREACH(s, &p->subdisks, in_plex) gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_FORCE); /* If added to a volume, we want the plex to be down. */ state = (p->flags & GV_PLEX_ADDED) ? GV_PLEX_DOWN : GV_PLEX_UP; gv_set_plex_state(p, state, GV_SETSTATE_FORCE); p->flags &= ~GV_PLEX_ADDED; } else if (p->flags & GV_PLEX_ADDED) { LIST_FOREACH(s, &p->subdisks, in_plex) gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE); p->flags &= ~GV_PLEX_ADDED; } else if (p->state == GV_PLEX_UP) { LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->flags & GV_SD_GROW) { gv_set_plex_state(p, GV_PLEX_GROWABLE, GV_SETSTATE_FORCE); break; } } } /* Our plex is grown up now. */ p->flags &= ~GV_PLEX_NEWBORN; } /* * Give a subdisk to a drive, check and adjust several parameters, adjust * freelist. */ int gv_sd_to_drive(struct gv_sd *s, struct gv_drive *d) { struct gv_sd *s2; struct gv_freelist *fl, *fl2; off_t tmp; int i; fl2 = NULL; /* Shortcut for "referenced" drives. */ if (d->flags & GV_DRIVE_REFERENCED) { s->drive_sc = d; return (0); } /* Check if this subdisk was already given to this drive. */ if (s->drive_sc != NULL) { if (s->drive_sc == d) { if (!(s->flags & GV_SD_TASTED)) { return (0); } } else { G_VINUM_DEBUG(0, "error giving subdisk '%s' to '%s' " "(already on '%s')", s->name, d->name, s->drive_sc->name); return (GV_ERR_ISATTACHED); } } /* Preliminary checks. */ if ((s->size > d->avail) || (d->freelist_entries == 0)) { G_VINUM_DEBUG(0, "not enough space on '%s' for '%s'", d->name, s->name); return (GV_ERR_NOSPACE); } /* If no size was given for this subdisk, try to auto-size it... */ if (s->size == -1) { /* Find the largest available slot. 
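*/

/*
 * The autosize scan that follows walks the drive's free list and keeps the
 * largest slot seen so far: starting from size == -1, every slot at least as
 * large as the current candidate replaces it.  The same scan over a plain
 * array (illustrative types, not the kernel's list macros):
 */
#include <stddef.h>
#include <stdint.h>

struct free_slot {
	int64_t offset;
	int64_t size;
};

static const struct free_slot *
largest_slot(const struct free_slot *fl, int n)
{
	const struct free_slot *best = NULL;
	int64_t size = -1;		/* mirrors s->size == -1 */
	int i;

	for (i = 0; i < n; i++) {
		if (fl[i].size < size)
			continue;
		size = fl[i].size;	/* at least as large: take it */
		best = &fl[i];
	}
	return (best);
}

/*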
*/ LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->size < s->size) continue; s->size = fl->size; s->drive_offset = fl->offset; fl2 = fl; } /* No good slot found? */ if (s->size == -1) { G_VINUM_DEBUG(0, "unable to autosize '%s' on '%s'", s->name, d->name); return (GV_ERR_BADSIZE); } /* * ... or check if we have a free slot that's large enough for the * given size. */ } else { i = 0; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->size < s->size) continue; /* Assign drive offset, if not given. */ if (s->drive_offset == -1) s->drive_offset = fl->offset; fl2 = fl; i++; break; } /* Couldn't find a good free slot. */ if (i == 0) { G_VINUM_DEBUG(0, "free slots too small for '%s' on '%s'", s->name, d->name); return (GV_ERR_NOSPACE); } } /* No drive offset given, try to calculate it. */ if (s->drive_offset == -1) { /* Add offsets and sizes from other subdisks on this drive. */ LIST_FOREACH(s2, &d->subdisks, from_drive) { s->drive_offset = s2->drive_offset + s2->size; } /* * If there are no other subdisks yet, then set the default * offset to GV_DATA_START. */ if (s->drive_offset == -1) s->drive_offset = GV_DATA_START; /* Check if we have a free slot at the given drive offset. */ } else { i = 0; LIST_FOREACH(fl, &d->freelist, freelist) { /* Yes, this subdisk fits. */ if ((fl->offset <= s->drive_offset) && (fl->offset + fl->size >= s->drive_offset + s->size)) { i++; fl2 = fl; break; } } /* Couldn't find a good free slot. */ if (i == 0) { G_VINUM_DEBUG(0, "given drive_offset for '%s' won't fit " "on '%s'", s->name, d->name); return (GV_ERR_NOSPACE); } } /* * Now that all parameters are checked and set up, we can give the * subdisk to the drive and adjust the freelist. */ /* First, adjust the freelist. */ LIST_FOREACH(fl, &d->freelist, freelist) { /* Look for the free slot that we have found before. */ if (fl != fl2) continue; /* The subdisk starts at the beginning of the free slot. */ if (fl->offset == s->drive_offset) { fl->offset += s->size; fl->size -= s->size; /* The subdisk uses the whole slot, so remove it. */ if (fl->size == 0) { d->freelist_entries--; LIST_REMOVE(fl, freelist); } /* * The subdisk does not start at the beginning of the free * slot. */ } else { tmp = fl->offset + fl->size; fl->size = s->drive_offset - fl->offset; /* * The subdisk didn't use the complete rest of the free * slot, so we need to split it. */ if (s->drive_offset + s->size != tmp) { fl2 = g_malloc(sizeof(*fl2), M_WAITOK | M_ZERO); fl2->offset = s->drive_offset + s->size; fl2->size = tmp - fl2->offset; LIST_INSERT_AFTER(fl, fl2, freelist); d->freelist_entries++; } } break; } /* * This is the first subdisk on this drive, just insert it into the * list. */ if (LIST_EMPTY(&d->subdisks)) { LIST_INSERT_HEAD(&d->subdisks, s, from_drive); /* There are other subdisks, so insert this one in correct order. */ } else { LIST_FOREACH(s2, &d->subdisks, from_drive) { if (s->drive_offset < s2->drive_offset) { LIST_INSERT_BEFORE(s2, s, from_drive); break; } else if (LIST_NEXT(s2, from_drive) == NULL) { LIST_INSERT_AFTER(s2, s, from_drive); break; } } } d->sdcount++; d->avail -= s->size; s->flags &= ~GV_SD_TASTED; /* Link back from the subdisk to this drive. */ s->drive_sc = d; return (0); } void gv_free_sd(struct gv_sd *s) { struct gv_drive *d; struct gv_freelist *fl, *fl2; KASSERT(s != NULL, ("gv_free_sd: NULL s")); d = s->drive_sc; if (d == NULL) return; /* * First, find the free slot that's immediately before or after this * subdisk.
*/ fl = NULL; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->offset == s->drive_offset + s->size) break; if (fl->offset + fl->size == s->drive_offset) break; } /* If there is no free slot adjacent to this subdisk, create one. */ if (fl == NULL) { fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); fl->size = s->size; fl->offset = s->drive_offset; if (d->freelist_entries == 0) { LIST_INSERT_HEAD(&d->freelist, fl, freelist); } else { LIST_FOREACH(fl2, &d->freelist, freelist) { if (fl->offset < fl2->offset) { LIST_INSERT_BEFORE(fl2, fl, freelist); break; } else if (LIST_NEXT(fl2, freelist) == NULL) { LIST_INSERT_AFTER(fl2, fl, freelist); break; } } } d->freelist_entries++; /* Expand the free slot we just found. */ } else { fl->size += s->size; if (fl->offset > s->drive_offset) fl->offset = s->drive_offset; } d->avail += s->size; d->sdcount--; } void gv_adjust_freespace(struct gv_sd *s, off_t remainder) { struct gv_drive *d; struct gv_freelist *fl, *fl2; KASSERT(s != NULL, ("gv_adjust_freespace: NULL s")); d = s->drive_sc; KASSERT(d != NULL, ("gv_adjust_freespace: NULL d")); /* First, find the free slot that's immediately after this subdisk. */ fl = NULL; LIST_FOREACH(fl, &d->freelist, freelist) { if (fl->offset == s->drive_offset + s->size) break; } /* If there is no free slot behind this subdisk, create one. */ if (fl == NULL) { fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); fl->size = remainder; fl->offset = s->drive_offset + s->size - remainder; if (d->freelist_entries == 0) { LIST_INSERT_HEAD(&d->freelist, fl, freelist); } else { LIST_FOREACH(fl2, &d->freelist, freelist) { if (fl->offset < fl2->offset) { LIST_INSERT_BEFORE(fl2, fl, freelist); break; } else if (LIST_NEXT(fl2, freelist) == NULL) { LIST_INSERT_AFTER(fl2, fl, freelist); break; } } } d->freelist_entries++; /* Expand the free slot we just found. */ } else { fl->offset -= remainder; fl->size += remainder; } s->size -= remainder; d->avail += remainder; } /* Check if the given plex is a striped one. */ int gv_is_striped(struct gv_plex *p) { KASSERT(p != NULL, ("gv_is_striped: NULL p")); switch (p->org) { case GV_PLEX_STRIPED: case GV_PLEX_RAID5: return (1); default: return (0); } } /* Find a volume by name. */ struct gv_volume * gv_find_vol(struct gv_softc *sc, char *name) { struct gv_volume *v; LIST_FOREACH(v, &sc->volumes, volume) { if (!strncmp(v->name, name, GV_MAXVOLNAME)) return (v); } return (NULL); } /* Find a plex by name. */ struct gv_plex * gv_find_plex(struct gv_softc *sc, char *name) { struct gv_plex *p; LIST_FOREACH(p, &sc->plexes, plex) { if (!strncmp(p->name, name, GV_MAXPLEXNAME)) return (p); } return (NULL); } /* Find a subdisk by name. */ struct gv_sd * gv_find_sd(struct gv_softc *sc, char *name) { struct gv_sd *s; LIST_FOREACH(s, &sc->subdisks, sd) { if (!strncmp(s->name, name, GV_MAXSDNAME)) return (s); } return (NULL); } /* Find a drive by name. */ struct gv_drive * gv_find_drive(struct gv_softc *sc, char *name) { struct gv_drive *d; LIST_FOREACH(d, &sc->drives, drive) { if (!strncmp(d->name, name, GV_MAXDRIVENAME)) return (d); } return (NULL); } /* Find a drive given a device. */ struct gv_drive * gv_find_drive_device(struct gv_softc *sc, char *device) { struct gv_drive *d; LIST_FOREACH(d, &sc->drives, drive) { if (!strcmp(d->device, device)) return (d); } return (NULL); } /* Check if any consumer of the given geom is open.
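*/

/*
 * gv_free_sd() above coalesces returned space with a free slot that touches
 * it on either side; only when no neighbour exists is a fresh slot inserted
 * in offset order.  The adjacency test it loops on, extracted:
 */
#include <stdint.h>

static int
slot_is_adjacent(int64_t fl_off, int64_t fl_size, int64_t sd_off,
    int64_t sd_size)
{
	return (fl_off == sd_off + sd_size ||	/* slot right after the sd */
	    fl_off + fl_size == sd_off);	/* slot right before the sd */
}

/*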
*/ int gv_consumer_is_open(struct g_consumer *cp) { if (cp == NULL) return (0); if (cp->acr || cp->acw || cp->ace) return (1); return (0); } int gv_provider_is_open(struct g_provider *pp) { if (pp == NULL) return (0); if (pp->acr || pp->acw || pp->ace) return (1); return (0); } /* * Compare the modification dates of the drives. * Return 1 if a > b, 0 otherwise. */ int gv_drive_is_newer(struct gv_softc *sc, struct gv_drive *d) { struct gv_drive *d2; struct timeval *a, *b; KASSERT(!LIST_EMPTY(&sc->drives), ("gv_is_drive_newer: empty drive list")); a = &d->hdr->label.last_update; LIST_FOREACH(d2, &sc->drives, drive) { if ((d == d2) || (d2->state != GV_DRIVE_UP) || (d2->hdr == NULL)) continue; b = &d2->hdr->label.last_update; if (timevalcmp(a, b, >)) return (1); } return (0); } /* Return the type of object identified by string 'name'. */ int gv_object_type(struct gv_softc *sc, char *name) { struct gv_drive *d; struct gv_plex *p; struct gv_sd *s; struct gv_volume *v; LIST_FOREACH(v, &sc->volumes, volume) { if (!strncmp(v->name, name, GV_MAXVOLNAME)) return (GV_TYPE_VOL); } LIST_FOREACH(p, &sc->plexes, plex) { if (!strncmp(p->name, name, GV_MAXPLEXNAME)) return (GV_TYPE_PLEX); } LIST_FOREACH(s, &sc->subdisks, sd) { if (!strncmp(s->name, name, GV_MAXSDNAME)) return (GV_TYPE_SD); } LIST_FOREACH(d, &sc->drives, drive) { if (!strncmp(d->name, name, GV_MAXDRIVENAME)) return (GV_TYPE_DRIVE); } return (GV_ERR_NOTFOUND); } void gv_setup_objects(struct gv_softc *sc) { struct g_provider *pp; struct gv_volume *v; struct gv_plex *p; struct gv_sd *s; struct gv_drive *d; LIST_FOREACH(s, &sc->subdisks, sd) { d = gv_find_drive(sc, s->drive); if (d != NULL) gv_sd_to_drive(s, d); p = gv_find_plex(sc, s->plex); if (p != NULL) gv_sd_to_plex(s, p); gv_update_sd_state(s); } LIST_FOREACH(p, &sc->plexes, plex) { gv_update_plex_config(p); v = gv_find_vol(sc, p->volume); if (v != NULL && p->vol_sc != v) { p->vol_sc = v; v->plexcount++; LIST_INSERT_HEAD(&v->plexes, p, in_volume); } gv_update_plex_config(p); } LIST_FOREACH(v, &sc->volumes, volume) { v->size = gv_vol_size(v); if (v->provider == NULL) { g_topology_lock(); pp = g_new_providerf(sc->geom, "gvinum/%s", v->name); pp->mediasize = v->size; pp->sectorsize = 512; /* XXX */ g_error_provider(pp, 0); v->provider = pp; pp->private = v; g_topology_unlock(); } else if (v->provider->mediasize != v->size) { g_topology_lock(); v->provider->mediasize = v->size; g_topology_unlock(); } v->flags &= ~GV_VOL_NEWBORN; gv_update_vol_state(v); } } void gv_cleanup(struct gv_softc *sc) { struct gv_volume *v, *v2; struct gv_plex *p, *p2; struct gv_sd *s, *s2; struct gv_drive *d, *d2; struct gv_freelist *fl, *fl2; mtx_lock(&sc->config_mtx); LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2) { LIST_REMOVE(v, volume); g_free(v->wqueue); g_free(v); } LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) { LIST_REMOVE(p, plex); g_free(p->bqueue); g_free(p->rqueue); g_free(p->wqueue); g_free(p); } LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2) { LIST_REMOVE(s, sd); g_free(s); } LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) { LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { LIST_REMOVE(fl, freelist); g_free(fl); } LIST_REMOVE(d, drive); g_free(d->hdr); g_free(d); } mtx_destroy(&sc->config_mtx); } /* General 'attach' routine. 
*/ int gv_attach_plex(struct gv_plex *p, struct gv_volume *v, int rename) { struct gv_sd *s; - struct gv_softc *sc; + struct gv_softc *sc __diagused; g_topology_assert(); sc = p->vinumconf; KASSERT(sc != NULL, ("NULL sc")); if (p->vol_sc != NULL) { G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s", p->name, p->volume); return (GV_ERR_ISATTACHED); } /* Stale all subdisks of this plex. */ LIST_FOREACH(s, &p->subdisks, in_plex) { if (s->state != GV_SD_STALE) gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); } /* Attach to volume. Make sure volume is not up and running. */ if (gv_provider_is_open(v->provider)) { G_VINUM_DEBUG(1, "unable to attach %s: volume %s is busy", p->name, v->name); return (GV_ERR_ISBUSY); } p->vol_sc = v; strlcpy(p->volume, v->name, sizeof(p->volume)); v->plexcount++; if (rename) { snprintf(p->name, sizeof(p->name), "%s.p%d", v->name, v->plexcount); } LIST_INSERT_HEAD(&v->plexes, p, in_volume); /* Get plex up again. */ gv_update_vol_size(v, gv_vol_size(v)); gv_set_plex_state(p, GV_PLEX_UP, 0); gv_save_config(p->vinumconf); return (0); } int gv_attach_sd(struct gv_sd *s, struct gv_plex *p, off_t offset, int rename) { struct gv_sd *s2; int error, sdcount; g_topology_assert(); /* If subdisk is attached, don't do it. */ if (s->plex_sc != NULL) { G_VINUM_DEBUG(1, "unable to attach %s: already attached to %s", s->name, s->plex); return (GV_ERR_ISATTACHED); } gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE); /* First check that this subdisk has a valid offset: it must be a * multiple of the stripe size, and no other subdisk may start at the * same offset. */ if (offset != -1 && offset % p->stripesize != 0) return (GV_ERR_BADOFFSET); LIST_FOREACH(s2, &p->subdisks, in_plex) { if (s2->plex_offset == offset) return (GV_ERR_BADOFFSET); } /* Attach the subdisk to the plex at given offset. */ s->plex_offset = offset; strlcpy(s->plex, p->name, sizeof(s->plex)); sdcount = p->sdcount; error = gv_sd_to_plex(s, p); if (error) return (error); gv_update_plex_config(p); if (rename) { snprintf(s->name, sizeof(s->name), "%s.s%d", s->plex, p->sdcount); } if (p->vol_sc != NULL) gv_update_vol_size(p->vol_sc, gv_vol_size(p->vol_sc)); gv_save_config(p->vinumconf); /* We don't update the subdisk state since the user might have to * initiate a rebuild/sync first. */ return (0); } /* Detach a plex from a volume. */ int gv_detach_plex(struct gv_plex *p, int flags) { struct gv_volume *v; g_topology_assert(); v = p->vol_sc; if (v == NULL) { G_VINUM_DEBUG(1, "unable to detach %s: already detached", p->name); return (0); /* Not an error. */ } /* * Only proceed if forced or volume inactive. */ if (!(flags & GV_FLAG_F) && (gv_provider_is_open(v->provider) || p->state == GV_PLEX_UP)) { G_VINUM_DEBUG(1, "unable to detach %s: volume %s is busy", p->name, p->volume); return (GV_ERR_ISBUSY); } v->plexcount--; /* Make sure no one reads from us once we're gone. */ v->last_read_plex = NULL; LIST_REMOVE(p, in_volume); p->vol_sc = NULL; memset(p->volume, 0, GV_MAXVOLNAME); gv_update_vol_size(v, gv_vol_size(v)); gv_save_config(p->vinumconf); return (0); } /* Detach a subdisk from a plex. */ int gv_detach_sd(struct gv_sd *s, int flags) { struct gv_plex *p; g_topology_assert(); p = s->plex_sc; if (p == NULL) { G_VINUM_DEBUG(1, "unable to detach %s: already detached", s->name); return (0); /* Not an error. */ } /* * Don't proceed if we're not forcing, and the plex is up, or degraded * with this subdisk up.
*/ if (!(flags & GV_FLAG_F) && ((p->state > GV_PLEX_DEGRADED) || ((p->state == GV_PLEX_DEGRADED) && (s->state == GV_SD_UP)))) { G_VINUM_DEBUG(1, "unable to detach %s: plex %s is busy", s->name, s->plex); return (GV_ERR_ISBUSY); } LIST_REMOVE(s, in_plex); s->plex_sc = NULL; memset(s->plex, 0, GV_MAXPLEXNAME); p->sddetached++; gv_save_config(s->vinumconf); return (0); } diff --git a/sys/geom/vinum/geom_vinum_volume.c b/sys/geom/vinum/geom_vinum_volume.c index 63b1077a26fa..fec61ee28611 100644 --- a/sys/geom/vinum/geom_vinum_volume.c +++ b/sys/geom/vinum/geom_vinum_volume.c @@ -1,168 +1,166 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Lukas Ertl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include void gv_volume_flush(struct gv_volume *v) { struct gv_softc *sc; struct bio *bp; KASSERT(v != NULL, ("NULL v")); sc = v->vinumconf; KASSERT(sc != NULL, ("NULL sc")); bp = bioq_takefirst(v->wqueue); while (bp != NULL) { gv_volume_start(sc, bp); bp = bioq_takefirst(v->wqueue); } } void gv_volume_start(struct gv_softc *sc, struct bio *bp) { - struct g_geom *gp; struct gv_volume *v; struct gv_plex *p, *lp; int numwrites; - gp = sc->geom; v = bp->bio_to->private; if (v == NULL || v->state != GV_VOL_UP) { g_io_deliver(bp, ENXIO); return; } switch (bp->bio_cmd) { case BIO_READ: /* * Try to find a good plex where we can send the request to, * round-robin-style. The plex either has to be up, or it's a * degraded RAID5 plex. Check if we have delayed requests. Put * this request on the delayed queue if so. This makes sure that * we don't read old values. 
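 * The plex selection below is a plain circular walk over the volume's
 * plex list: start at the successor of the plex that served the
 * previous read, wrap around with LIST_FIRST() at the end of the
 * list, and stop when a usable plex is found or the walk comes back
 * to its starting point. The idiom, compressed:
 *
 *	p = LIST_NEXT(lp, in_volume);
 *	if (p == NULL)
 *		p = LIST_FIRST(&v->plexes);	(wrap around)
 *	do { check p; advance, wrapping again; } while (p != lp);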
*/ if (bioq_first(v->wqueue) != NULL) { bioq_insert_tail(v->wqueue, bp); break; } lp = v->last_read_plex; if (lp == NULL) lp = LIST_FIRST(&v->plexes); p = LIST_NEXT(lp, in_volume); if (p == NULL) p = LIST_FIRST(&v->plexes); do { if (p == NULL) { p = lp; break; } if ((p->state > GV_PLEX_DEGRADED) || (p->state >= GV_PLEX_DEGRADED && p->org == GV_PLEX_RAID5)) break; p = LIST_NEXT(p, in_volume); if (p == NULL) p = LIST_FIRST(&v->plexes); } while (p != lp); if ((p == NULL) || (p->org == GV_PLEX_RAID5 && p->state < GV_PLEX_DEGRADED) || (p->org != GV_PLEX_RAID5 && p->state <= GV_PLEX_DEGRADED)) { g_io_deliver(bp, ENXIO); return; } v->last_read_plex = p; /* Hand it down to the plex logic. */ gv_plex_start(p, bp); break; case BIO_WRITE: case BIO_DELETE: /* Delay write-requests if any plex is synchronizing. */ LIST_FOREACH(p, &v->plexes, in_volume) { if (p->flags & GV_PLEX_SYNCING) { bioq_insert_tail(v->wqueue, bp); return; } } numwrites = 0; /* Give the BIO to each plex of this volume. */ LIST_FOREACH(p, &v->plexes, in_volume) { if (p->state < GV_PLEX_DEGRADED) continue; gv_plex_start(p, bp); numwrites++; } if (numwrites == 0) g_io_deliver(bp, ENXIO); break; } } void gv_bio_done(struct gv_softc *sc, struct bio *bp) { - struct gv_volume *v; + struct gv_volume *v __diagused; struct gv_plex *p; struct gv_sd *s; s = bp->bio_caller1; KASSERT(s != NULL, ("gv_bio_done: NULL s")); p = s->plex_sc; KASSERT(p != NULL, ("gv_bio_done: NULL p")); v = p->vol_sc; KASSERT(v != NULL, ("gv_bio_done: NULL v")); switch (p->org) { case GV_PLEX_CONCAT: case GV_PLEX_STRIPED: gv_plex_normal_done(p, bp); break; case GV_PLEX_RAID5: gv_plex_raid5_done(p, bp); break; } gv_drive_done(s->drive_sc); } diff --git a/sys/geom/virstor/g_virstor.c b/sys/geom/virstor/g_virstor.c index e27d92b509d4..6c201b24ab19 100644 --- a/sys/geom/virstor/g_virstor.c +++ b/sys/geom/virstor/g_virstor.c @@ -1,1878 +1,1876 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006-2007 Ivan Voras * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Implementation notes: * - "Components" are wrappers around providers that make up the * virtual storage (i.e. 
a virstor has "physical" components) */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(g_virstor, "GEOM virtual storage support"); /* Declare malloc(9) label */ static MALLOC_DEFINE(M_GVIRSTOR, "gvirstor", "GEOM_VIRSTOR Data"); /* GEOM class methods */ static g_init_t g_virstor_init; static g_fini_t g_virstor_fini; static g_taste_t g_virstor_taste; static g_ctl_req_t g_virstor_config; static g_ctl_destroy_geom_t g_virstor_destroy_geom; /* Declare & initialize class structure ("geom class") */ struct g_class g_virstor_class = { .name = G_VIRSTOR_CLASS_NAME, .version = G_VERSION, .init = g_virstor_init, .fini = g_virstor_fini, .taste = g_virstor_taste, .ctlreq = g_virstor_config, .destroy_geom = g_virstor_destroy_geom /* The .dumpconf and the rest are only usable for a geom instance, so * they will be set when such instance is created. */ }; /* Declare sysctl's and loader tunables */ SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, virstor, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_GVIRSTOR information"); static u_int g_virstor_debug = 2; /* XXX: lower to 2 when released to public */ SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, debug, CTLFLAG_RWTUN, &g_virstor_debug, 0, "Debug level (2=production, 5=normal, 15=excessive)"); static u_int g_virstor_chunk_watermark = 100; SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, chunk_watermark, CTLFLAG_RWTUN, &g_virstor_chunk_watermark, 0, "Minimum number of free chunks before issuing administrative warning"); static u_int g_virstor_component_watermark = 1; SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, component_watermark, CTLFLAG_RWTUN, &g_virstor_component_watermark, 0, "Minimum number of free components before issuing administrative warning"); static int read_metadata(struct g_consumer *, struct g_virstor_metadata *); static void write_metadata(struct g_consumer *, struct g_virstor_metadata *); static int clear_metadata(struct g_virstor_component *); static int add_provider_to_geom(struct g_virstor_softc *, struct g_provider *, struct g_virstor_metadata *); static struct g_geom *create_virstor_geom(struct g_class *, struct g_virstor_metadata *); static void virstor_check_and_run(struct g_virstor_softc *); static u_int virstor_valid_components(struct g_virstor_softc *); static int virstor_geom_destroy(struct g_virstor_softc *, boolean_t, boolean_t); static void remove_component(struct g_virstor_softc *, struct g_virstor_component *, boolean_t); static void bioq_dismantle(struct bio_queue_head *); static int allocate_chunk(struct g_virstor_softc *, struct g_virstor_component **, u_int *, u_int *); static void delay_destroy_consumer(void *, int); static void dump_component(struct g_virstor_component *comp); #if 0 static void dump_me(struct virstor_map_entry *me, unsigned int nr); #endif static void virstor_ctl_stop(struct gctl_req *, struct g_class *); static void virstor_ctl_add(struct gctl_req *, struct g_class *); static void virstor_ctl_remove(struct gctl_req *, struct g_class *); static struct g_virstor_softc * virstor_find_geom(const struct g_class *, const char *); static void update_metadata(struct g_virstor_softc *); static void fill_metadata(struct g_virstor_softc *, struct g_virstor_metadata *, u_int, u_int); static void g_virstor_orphan(struct g_consumer *); static int g_virstor_access(struct g_provider *, int, int, int); static void g_virstor_start(struct bio *); static 
void g_virstor_dumpconf(struct sbuf *, const char *, struct g_geom *, struct g_consumer *, struct g_provider *); static void g_virstor_done(struct bio *); static void invalid_call(void); /* * Initialise GEOM class (per-class callback) */ static void g_virstor_init(struct g_class *mp __unused) { /* Catch map struct size mismatch at compile time; Map entries must * fit into maxphys exactly, with no wasted space. */ MPASS(VIRSTOR_MAP_BLOCK_ENTRIES * VIRSTOR_MAP_ENTRY_SIZE == maxphys); /* Init UMA zones, TAILQ's, other global vars */ } /* * Finalise GEOM class (per-class callback) */ static void g_virstor_fini(struct g_class *mp __unused) { /* Deinit UMA zones & global vars */ } /* * Config (per-class callback) */ static void g_virstor_config(struct gctl_req *req, struct g_class *cp, char const *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "Failed to get 'version' argument"); return; } if (*version != G_VIRSTOR_VERSION) { gctl_error(req, "Userland and kernel versions out of sync"); return; } g_topology_unlock(); if (strcmp(verb, "add") == 0) virstor_ctl_add(req, cp); else if (strcmp(verb, "stop") == 0 || strcmp(verb, "destroy") == 0) virstor_ctl_stop(req, cp); else if (strcmp(verb, "remove") == 0) virstor_ctl_remove(req, cp); else gctl_error(req, "unknown verb: '%s'", verb); g_topology_lock(); } /* * "stop" verb from userland */ static void virstor_ctl_stop(struct gctl_req *req, struct g_class *cp) { int *force, *nargs; int i; nargs = gctl_get_paraml(req, "nargs", sizeof *nargs); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 1) { gctl_error(req, "Invalid number of arguments"); return; } force = gctl_get_paraml(req, "force", sizeof *force); if (force == NULL) { gctl_error(req, "Error fetching argument '%s'", "force"); return; } g_topology_lock(); for (i = 0; i < *nargs; i++) { char param[8]; const char *name; struct g_virstor_softc *sc; int error; snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) { gctl_error(req, "No 'arg%d' argument", i); g_topology_unlock(); return; } sc = virstor_find_geom(cp, name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", name); g_topology_unlock(); return; } LOG_MSG(LVL_INFO, "Stopping %s by the userland command", sc->geom->name); update_metadata(sc); if ((error = virstor_geom_destroy(sc, TRUE, TRUE)) != 0) { LOG_MSG(LVL_ERROR, "Cannot destroy %s: %d", sc->geom->name, error); } } g_topology_unlock(); } /* * "add" verb from userland - add new component(s) to the structure. * This will be done all at once in here, without going through the * .taste function for new components. */ static void virstor_ctl_add(struct gctl_req *req, struct g_class *cp) { /* Note: while this is going on, I/O is being done on * the g_up and g_down threads. The idea is to make changes * to softc members in a way that can atomically activate * them all at once. 
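 * The ordering used below is what gives us that atomicity: a new slot
 * in the component array is fully initialised first, and n_components
 * is incremented only afterwards. Since all readers bound their
 * iteration by n_components, a half-initialised slot is never
 * visible:
 *
 *	sc->components[nc].gcons = cp;	(fill in while still invisible)
 *	...
 *	sc->n_components++;		(publish with a single store)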
*/ struct g_virstor_softc *sc; int *hardcode, *nargs; const char *geom_name; /* geom to add a component to */ struct g_consumer *fcp; struct g_virstor_bio_q *bq; u_int added; int error; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode)); if (hardcode == NULL) { gctl_error(req, "Error fetching argument '%s'", "hardcode"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot add components to incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } fcp = sc->components[0].gcons; added = 0; g_topology_lock(); for (i = 1; i < *nargs; i++) { struct g_virstor_metadata md; char aname[8]; struct g_provider *pp; struct g_consumer *cp; u_int nc; u_int j; snprintf(aname, sizeof aname, "arg%d", i); pp = gctl_get_provider(req, aname); if (pp == NULL) { /* This is the most common error so be verbose about it */ if (added != 0) { gctl_error(req, "Invalid provider. (added" " %u components)", added); update_metadata(sc); } g_topology_unlock(); return; } cp = g_new_consumer(sc->geom); if (cp == NULL) { gctl_error(req, "Cannot create consumer"); g_topology_unlock(); return; } error = g_attach(cp, pp); if (error != 0) { gctl_error(req, "Cannot attach a consumer to %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } if (fcp->acr != 0 || fcp->acw != 0 || fcp->ace != 0) { error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { gctl_error(req, "Access request failed for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } if (fcp->provider->sectorsize != pp->sectorsize) { gctl_error(req, "Sector size doesn't fit for %s", pp->name); g_destroy_consumer(cp); g_topology_unlock(); return; } for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, pp->name) == 0) { gctl_error(req, "Component %s already in %s", pp->name, sc->geom->name); g_destroy_consumer(cp); g_topology_unlock(); return; } } sc->components = realloc(sc->components, sizeof(*sc->components) * (sc->n_components + 1), M_GVIRSTOR, M_WAITOK); nc = sc->n_components; sc->components[nc].gcons = cp; sc->components[nc].sc = sc; sc->components[nc].index = nc; sc->components[nc].chunk_count = cp->provider->mediasize / sc->chunk_size; sc->components[nc].chunk_next = 0; sc->components[nc].chunk_reserved = 0; if (sc->components[nc].chunk_count < 4) { gctl_error(req, "Provider too small: %s", cp->provider->name); g_destroy_consumer(cp); g_topology_unlock(); return; } fill_metadata(sc, &md, nc, *hardcode); write_metadata(cp, &md); /* The new component becomes visible when n_components is * incremented */ sc->n_components++; added++; } /* This call to update_metadata() is critical. 
In case there's a * power failure in the middle of it and some components are updated * while others are not, there will be trouble on next .taste() iff * a non-updated component is detected first */ update_metadata(sc); g_topology_unlock(); LOG_MSG(LVL_INFO, "Added %d component(s) to %s", added, sc->geom->name); /* Fire off BIOs previously queued because there wasn't any * physical space left. If the BIOs still can't be satisfied * they will again be added to the end of the queue (during * which the mutex will be recursed) */ bq = malloc(sizeof(*bq), M_GVIRSTOR, M_WAITOK); bq->bio = NULL; mtx_lock(&sc->delayed_bio_q_mtx); /* First, insert a sentinel to the queue end, so we don't * end up in an infinite loop if there's still no free * space available. */ STAILQ_INSERT_TAIL(&sc->delayed_bio_q, bq, linkage); while (!STAILQ_EMPTY(&sc->delayed_bio_q)) { bq = STAILQ_FIRST(&sc->delayed_bio_q); if (bq->bio != NULL) { g_virstor_start(bq->bio); STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); } else { STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); break; } } mtx_unlock(&sc->delayed_bio_q_mtx); } /* * Find a geom handled by the class */ static struct g_virstor_softc * virstor_find_geom(const struct g_class *cp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &cp->geom, geom) { if (strcmp(name, gp->name) == 0) return (gp->softc); } return (NULL); } /* * Update metadata on all components to reflect the current state * of these fields: * - chunk_next * - flags * - md_count * Expects things to be set up so write_metadata() can work, i.e. * the topology lock must be held. */ static void update_metadata(struct g_virstor_softc *sc) { struct g_virstor_metadata md; u_int n; if (virstor_valid_components(sc) != sc->n_components) return; /* Incomplete device */ LOG_MSG(LVL_DEBUG, "Updating metadata on components for %s", sc->geom->name); /* Update metadata on components */ g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, sc->geom->class->name, sc->geom->name); g_topology_assert(); for (n = 0; n < sc->n_components; n++) { read_metadata(sc->components[n].gcons, &md); md.chunk_next = sc->components[n].chunk_next; md.flags = sc->components[n].flags; md.md_count = sc->n_components; write_metadata(sc->components[n].gcons, &md); } } /* * Fills metadata (struct md) from information stored in softc and the nc'th * component of virstor */ static void fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md, u_int nc, u_int hardcode) { struct g_virstor_component *c; bzero(md, sizeof *md); c = &sc->components[nc]; strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof md->md_magic); md->md_version = G_VIRSTOR_VERSION; strncpy(md->md_name, sc->geom->name, sizeof md->md_name); md->md_id = sc->id; md->md_virsize = sc->virsize; md->md_chunk_size = sc->chunk_size; md->md_count = sc->n_components; if (hardcode) { strncpy(md->provider, c->gcons->provider->name, sizeof md->provider); } md->no = nc; md->provsize = c->gcons->provider->mediasize; md->chunk_count = c->chunk_count; md->chunk_next = c->chunk_next; md->chunk_reserved = c->chunk_reserved; md->flags = c->flags; } /* * Remove a component from virstor device. * Can only be done if the component is unallocated. */ static void virstor_ctl_remove(struct gctl_req *req, struct g_class *cp) { /* As this is executed in parallel to I/O, operations on virstor * structures must be as atomic as possible. 
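 * Removal therefore builds a complete replacement array and swaps it
 * in with a single pointer store instead of shuffling entries in
 * place. For removal of the entry at index `found' out of n, the two
 * copies must take the `found' leading entries and the n - found - 1
 * trailing ones:
 *
 *	bcopy(old, new, found * sizeof(*old));
 *	bcopy(old + found + 1, new + found,
 *	    (n - found - 1) * sizeof(*old));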
*/ struct g_virstor_softc *sc; int *nargs; const char *geom_name; u_int removed; int i; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "Error fetching argument '%s'", "nargs"); return; } if (*nargs < 2) { gctl_error(req, "Invalid number of arguments"); return; } /* Find "our" geom */ geom_name = gctl_get_asciiparam(req, "arg0"); if (geom_name == NULL) { gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)"); return; } sc = virstor_find_geom(cp, geom_name); if (sc == NULL) { gctl_error(req, "Don't know anything about '%s'", geom_name); return; } if (virstor_valid_components(sc) != sc->n_components) { LOG_MSG(LVL_ERROR, "Cannot remove components from incomplete " "virstor %s", sc->geom->name); gctl_error(req, "Virstor %s is incomplete", sc->geom->name); return; } removed = 0; for (i = 1; i < *nargs; i++) { char param[8]; const char *prov_name; int j, found; struct g_virstor_component *newcomp, *compbak; snprintf(param, sizeof(param), "arg%d", i); prov_name = gctl_get_asciiparam(req, param); if (prov_name == NULL) { gctl_error(req, "Error fetching argument '%s'", param); return; } if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) prov_name += sizeof(_PATH_DEV) - 1; found = -1; for (j = 0; j < sc->n_components; j++) { if (strcmp(sc->components[j].gcons->provider->name, prov_name) == 0) { found = j; break; } } if (found == -1) { LOG_MSG(LVL_ERROR, "No %s component in %s", prov_name, sc->geom->name); continue; } compbak = sc->components; newcomp = malloc(sc->n_components * sizeof(*sc->components), M_GVIRSTOR, M_WAITOK | M_ZERO); bcopy(sc->components, newcomp, found * sizeof(*sc->components)); bcopy(&sc->components[found + 1], newcomp + found, (sc->n_components - found - 1) * sizeof(*sc->components)); if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) != 0) { LOG_MSG(LVL_ERROR, "Allocated provider %s cannot be " "removed from %s", prov_name, sc->geom->name); free(newcomp, M_GVIRSTOR); /* We'll consider this a non-fatal error */ continue; } /* Renumber the remaining unallocated components */ for (j = 0; j < sc->n_components-1; j++) { if ((newcomp[j].flags & VIRSTOR_PROVIDER_ALLOCATED) == 0) { newcomp[j].index = j; } } /* This is the critical section. If a component allocation * event happens while both variables are not yet set, * there will be trouble. Something will panic on encountering * NULL sc->components[x].gcons member. * Luckily, component allocation happens very rarely and * removing components is an abnormal action in any case. */ sc->components = newcomp; sc->n_components--; /* End critical section */ g_topology_lock(); if (clear_metadata(&compbak[found]) != 0) { LOG_MSG(LVL_WARNING, "Trouble ahead: cannot clear " "metadata on %s", prov_name); } g_detach(compbak[found].gcons); g_destroy_consumer(compbak[found].gcons); g_topology_unlock(); free(compbak, M_GVIRSTOR); removed++; } /* This call to update_metadata() is critical.
In case there's a * power failure in the middle of it and some components are updated * while others are not, there will be trouble on next .taste() iff * a non-updated component is detected first */ g_topology_lock(); update_metadata(sc); g_topology_unlock(); LOG_MSG(LVL_INFO, "Removed %d component(s) from %s", removed, sc->geom->name); } /* * Clear metadata sector on component */ static int clear_metadata(struct g_virstor_component *comp) { char *buf; int error; LOG_MSG(LVL_INFO, "Clearing metadata on %s", comp->gcons->provider->name); g_topology_assert(); error = g_access(comp->gcons, 0, 1, 0); if (error != 0) return (error); buf = malloc(comp->gcons->provider->sectorsize, M_GVIRSTOR, M_WAITOK | M_ZERO); error = g_write_data(comp->gcons, comp->gcons->provider->mediasize - comp->gcons->provider->sectorsize, buf, comp->gcons->provider->sectorsize); free(buf, M_GVIRSTOR); g_access(comp->gcons, 0, -1, 0); return (error); } /* * Destroy geom forcibly. */ static int g_virstor_destroy_geom(struct gctl_req *req __unused, struct g_class *mp, struct g_geom *gp) { struct g_virstor_softc *sc; int exitval; sc = gp->softc; KASSERT(sc != NULL, ("%s: NULL sc", __func__)); exitval = 0; LOG_MSG(LVL_DEBUG, "%s called for %s, sc=%p", __func__, gp->name, gp->softc); if (sc != NULL) { #ifdef INVARIANTS char *buf; int error; off_t off; int isclean, count; int n; LOG_MSG(LVL_INFO, "INVARIANTS detected"); LOG_MSG(LVL_INFO, "Verifying allocation " "table for %s", sc->geom->name); count = 0; for (n = 0; n < sc->chunk_count; n++) { if ((sc->map[n].flags & VIRSTOR_MAP_ALLOCATED) != 0) count++; } LOG_MSG(LVL_INFO, "Device %s has %d allocated chunks", sc->geom->name, count); n = off = count = 0; isclean = 1; if (virstor_valid_components(sc) != sc->n_components) { /* This is an incomplete virstor device (not all * components have been found) */ LOG_MSG(LVL_ERROR, "Device %s is incomplete", sc->geom->name); goto bailout; } error = g_access(sc->components[0].gcons, 1, 0, 0); KASSERT(error == 0, ("%s: g_access failed (%d)", __func__, error)); /* Compare the whole on-disk allocation table with what's * currently in memory */ while (n < sc->chunk_count) { buf = g_read_data(sc->components[0].gcons, off, sc->sectorsize, &error); KASSERT(buf != NULL, ("g_read_data returned NULL (%d) " "for read at %jd", error, off)); if (bcmp(buf, &sc->map[n], sc->sectorsize) != 0) { LOG_MSG(LVL_ERROR, "ERROR in allocation table, " "entry %d, offset %jd", n, off); isclean = 0; count++; } n += sc->me_per_sector; off += sc->sectorsize; g_free(buf); } error = g_access(sc->components[0].gcons, -1, 0, 0); KASSERT(error == 0, ("%s: g_access failed (%d) on exit", __func__, error)); if (isclean != 1) { LOG_MSG(LVL_ERROR, "ALLOCATION TABLE CORRUPTED FOR %s " "(%d sectors don't match, max %zu allocations)", sc->geom->name, count, count * sc->me_per_sector); } else { LOG_MSG(LVL_INFO, "Allocation table ok for %s", sc->geom->name); } bailout: #endif update_metadata(sc); virstor_geom_destroy(sc, FALSE, FALSE); exitval = EAGAIN; } else exitval = 0; return (exitval); } /* * Taste event (per-class callback) * Examines a provider and creates geom instances if needed */ static struct g_geom * g_virstor_taste(struct g_class *mp, struct g_provider *pp, int flags) { struct g_virstor_metadata md; struct g_geom *gp; struct g_consumer *cp; struct g_virstor_softc *sc; int error; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); LOG_MSG(LVL_DEBUG, "Tasting %s", pp->name); /* We need a dummy geom to attach a consumer to the given
provider */ gp = g_new_geomf(mp, "virstor:taste.helper"); gp->start = (void *)invalid_call; /* XXX: hacked up so the */ gp->access = (void *)invalid_call; /* compiler doesn't complain. */ gp->orphan = (void *)invalid_call; /* I really want these to fail. */ cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error == 0) { error = read_metadata(cp, &md); g_detach(cp); } g_destroy_consumer(cp); g_destroy_geom(gp); if (error != 0) return (NULL); if (strcmp(md.md_magic, G_VIRSTOR_MAGIC) != 0) return (NULL); if (md.md_version != G_VIRSTOR_VERSION) { LOG_MSG(LVL_ERROR, "Kernel module version invalid " "to handle %s (%s) : %d should be %d", md.md_name, pp->name, md.md_version, G_VIRSTOR_VERSION); return (NULL); } if (md.provsize != pp->mediasize) return (NULL); /* If the provider name is hardcoded, use the offered provider only * if it's been offered with its proper name (the one used in * the label command). */ if (md.provider[0] != '\0' && !g_compare_names(md.provider, pp->name)) return (NULL); /* Iterate all geoms this class already knows about to see if a new * geom instance of this class needs to be created (in case the provider * is first from a (possibly) multi-consumer geom) or it just needs * to be added to an existing instance. */ sc = NULL; gp = NULL; LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc == NULL) continue; if (strcmp(md.md_name, sc->geom->name) != 0) continue; if (md.md_id != sc->id) continue; break; } if (gp != NULL) { /* We found an existing geom instance; add to it */ LOG_MSG(LVL_INFO, "Adding %s to %s", pp->name, md.md_name); error = add_provider_to_geom(sc, pp, &md); if (error != 0) { LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)", pp->name, md.md_name, error); return (NULL); } } else { /* New geom instance needs to be created */ gp = create_virstor_geom(mp, &md); if (gp == NULL) { LOG_MSG(LVL_ERROR, "Error creating new instance of " "class %s: %s", mp->name, md.md_name); LOG_MSG(LVL_DEBUG, "Error creating %s at %s", md.md_name, pp->name); return (NULL); } sc = gp->softc; LOG_MSG(LVL_INFO, "Adding %s to %s (first found)", pp->name, md.md_name); error = add_provider_to_geom(sc, pp, &md); if (error != 0) { LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)", pp->name, md.md_name, error); virstor_geom_destroy(sc, TRUE, FALSE); return (NULL); } } return (gp); } /* * Destroys the consumer passed to it as an argument. Used as a callback * on the g_event queue.
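 * Deferring the destruction through g_post_event() lets the consumer
 * outlive the context that decided to drop it; see remove_component()
 * below, which queues it as
 *
 *	g_post_event(delay_destroy_consumer, c, M_WAITOK, NULL);
 *
 * so the teardown runs later from the g_event thread, with the
 * topology lock held.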
*/ static void delay_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *c = arg; KASSERT(c != NULL, ("%s: invalid consumer", __func__)); LOG_MSG(LVL_DEBUG, "Consumer %s destroyed with delay", c->provider->name); g_detach(c); g_destroy_consumer(c); } /* * Remove a component (consumer) from geom instance; If it's the first * component being removed, orphan the provider to announce geom's being * dismantled */ static void remove_component(struct g_virstor_softc *sc, struct g_virstor_component *comp, boolean_t delay) { struct g_consumer *c; KASSERT(comp->gcons != NULL, ("Component with no consumer in %s", sc->geom->name)); c = comp->gcons; comp->gcons = NULL; KASSERT(c->provider != NULL, ("%s: no provider", __func__)); LOG_MSG(LVL_DEBUG, "Component %s removed from %s", c->provider->name, sc->geom->name); if (sc->provider != NULL) { LOG_MSG(LVL_INFO, "Removing provider %s", sc->provider->name); g_wither_provider(sc->provider, ENXIO); sc->provider = NULL; } if (c->acr > 0 || c->acw > 0 || c->ace > 0) return; if (delay) { /* Destroy consumer after it's tasted */ g_post_event(delay_destroy_consumer, c, M_WAITOK, NULL); } else { g_detach(c); g_destroy_consumer(c); } } /* * Destroy geom - called internally * See g_virstor_destroy_geom for the other one */ static int virstor_geom_destroy(struct g_virstor_softc *sc, boolean_t force, boolean_t delay) { struct g_provider *pp; struct g_geom *gp; u_int n; g_topology_assert(); if (sc == NULL) return (ENXIO); pp = sc->provider; if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { LOG_MSG(force ? LVL_WARNING : LVL_ERROR, "Device %s is still open.", pp->name); if (!force) return (EBUSY); } for (n = 0; n < sc->n_components; n++) { if (sc->components[n].gcons != NULL) remove_component(sc, &sc->components[n], delay); } gp = sc->geom; gp->softc = NULL; KASSERT(sc->provider == NULL, ("Provider still exists for %s", gp->name)); /* XXX: This might or might not work, since we're called with * the topology lock held. Also, it might panic the kernel if * the error'd BIO is in softupdates code. */ mtx_lock(&sc->delayed_bio_q_mtx); while (!STAILQ_EMPTY(&sc->delayed_bio_q)) { struct g_virstor_bio_q *bq; bq = STAILQ_FIRST(&sc->delayed_bio_q); bq->bio->bio_error = ENOSPC; g_io_deliver(bq->bio, EIO); STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage); free(bq, M_GVIRSTOR); } mtx_unlock(&sc->delayed_bio_q_mtx); mtx_destroy(&sc->delayed_bio_q_mtx); free(sc->map, M_GVIRSTOR); free(sc->components, M_GVIRSTOR); bzero(sc, sizeof *sc); free(sc, M_GVIRSTOR); pp = LIST_FIRST(&gp->provider); /* We only offer one provider */ if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)) LOG_MSG(LVL_DEBUG, "Device %s destroyed", gp->name); g_wither_geom(gp, ENXIO); return (0); } /* * Utility function: read metadata & decode. Wants topology lock to be * held. */ static int read_metadata(struct g_consumer *cp, struct g_virstor_metadata *md) { struct g_provider *pp; char *buf; int error; g_topology_assert(); error = g_access(cp, 1, 0, 0); if (error != 0) return (error); pp = cp->provider; g_topology_unlock(); buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); g_topology_lock(); g_access(cp, -1, 0, 0); if (buf == NULL) return (error); virstor_metadata_decode(buf, md); g_free(buf); return (0); } /** * Utility function: encode & write metadata. Assumes topology lock is * held. * * There is no useful way of recovering from errors in this function, * not involving panicking the kernel. 
If the metadata cannot be written * the most we can do is notify the operator and hope he spots it and * replaces the broken drive. */ static void write_metadata(struct g_consumer *cp, struct g_virstor_metadata *md) { struct g_provider *pp; char *buf; int error; KASSERT(cp != NULL && md != NULL && cp->provider != NULL, ("Something's fishy in %s", __func__)); LOG_MSG(LVL_DEBUG, "Writing metadata on %s", cp->provider->name); g_topology_assert(); error = g_access(cp, 0, 1, 0); if (error != 0) { LOG_MSG(LVL_ERROR, "g_access(0,1,0) failed for %s: %d", cp->provider->name, error); return; } pp = cp->provider; buf = malloc(pp->sectorsize, M_GVIRSTOR, M_WAITOK); bzero(buf, pp->sectorsize); virstor_metadata_encode(md, buf); g_topology_unlock(); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); g_topology_lock(); g_access(cp, 0, -1, 0); free(buf, M_GVIRSTOR); if (error != 0) LOG_MSG(LVL_ERROR, "Error %d writing metadata to %s", error, cp->provider->name); } /* * Creates a new instance of this GEOM class, initialise softc */ static struct g_geom * create_virstor_geom(struct g_class *mp, struct g_virstor_metadata *md) { struct g_geom *gp; struct g_virstor_softc *sc; LOG_MSG(LVL_DEBUG, "Creating geom instance for %s (id=%u)", md->md_name, md->md_id); if (md->md_count < 1 || md->md_chunk_size < 1 || md->md_virsize < md->md_chunk_size) { /* This is a bogus configuration, and probably means the data is * somehow corrupted. Panic, maybe? */ LOG_MSG(LVL_ERROR, "Nonsensical metadata information for %s", md->md_name); return (NULL); } /* Check if it's already created */ LIST_FOREACH(gp, &mp->geom, geom) { sc = gp->softc; if (sc != NULL && strcmp(sc->geom->name, md->md_name) == 0) { LOG_MSG(LVL_WARNING, "Geom %s already exists", md->md_name); if (sc->id != md->md_id) { LOG_MSG(LVL_ERROR, "Some stale or invalid components " "exist for virstor device named %s. " "You will need to remove all stale " "components and maybe reconfigure " "the virstor device. 
Tune " "kern.geom.virstor.debug sysctl up " "for more information.", sc->geom->name); } return (NULL); } } gp = g_new_geomf(mp, "%s", md->md_name); gp->softc = NULL; /* to circumvent races that test softc */ gp->start = g_virstor_start; gp->spoiled = g_virstor_orphan; gp->orphan = g_virstor_orphan; gp->access = g_virstor_access; gp->dumpconf = g_virstor_dumpconf; sc = malloc(sizeof(*sc), M_GVIRSTOR, M_WAITOK | M_ZERO); sc->id = md->md_id; sc->n_components = md->md_count; sc->components = malloc(sizeof(struct g_virstor_component) * md->md_count, M_GVIRSTOR, M_WAITOK | M_ZERO); sc->chunk_size = md->md_chunk_size; sc->virsize = md->md_virsize; STAILQ_INIT(&sc->delayed_bio_q); mtx_init(&sc->delayed_bio_q_mtx, "gvirstor_delayed_bio_q_mtx", "gvirstor", MTX_DEF | MTX_RECURSE); sc->geom = gp; sc->provider = NULL; /* virstor_check_and_run will create it */ gp->softc = sc; LOG_MSG(LVL_ANNOUNCE, "Device %s created", sc->geom->name); return (gp); } /* * Add provider to a GEOM class instance */ static int add_provider_to_geom(struct g_virstor_softc *sc, struct g_provider *pp, struct g_virstor_metadata *md) { struct g_virstor_component *component; struct g_consumer *cp, *fcp; struct g_geom *gp; int error; if (md->no >= sc->n_components) return (EINVAL); /* "Current" component */ component = &(sc->components[md->no]); if (component->gcons != NULL) return (EEXIST); gp = sc->geom; fcp = LIST_FIRST(&gp->consumer); cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { g_destroy_consumer(cp); return (error); } if (fcp != NULL) { if (fcp->provider->sectorsize != pp->sectorsize) { /* TODO: this can be made to work */ LOG_MSG(LVL_ERROR, "Provider %s of %s has invalid " "sector size (%d)", pp->name, sc->geom->name, pp->sectorsize); return (EINVAL); } if (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0) { /* Replicate access permissions from first "live" consumer * to the new one */ error = g_access(cp, fcp->acr, fcp->acw, fcp->ace); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } } } /* Bring up a new component */ cp->private = component; component->gcons = cp; component->sc = sc; component->index = md->no; component->chunk_count = md->chunk_count; component->chunk_next = md->chunk_next; component->chunk_reserved = md->chunk_reserved; component->flags = md->flags; LOG_MSG(LVL_DEBUG, "%s attached to %s", pp->name, sc->geom->name); virstor_check_and_run(sc); return (0); } /* * Check if everything's ready to create the geom provider & device entry, * create and start provider. 
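 * On the way it sizes and reads the on-disk allocation map; the
 * arithmetic, spelled out (numbers illustrative, assuming a 512-byte
 * sector and an 8-byte map entry):
 *
 *	chunk_count   = virsize / chunk_size
 *	map_size      = chunk_count * sizeof(*sc->map)
 *	map_sectors   = map_size / sectorsize
 *	me_per_sector = sectorsize / sizeof(*sc->map) = 64
 *
 * e.g. a 1 TB virtual size with 4 MB chunks gives 262144 chunks and
 * a 2 MB map.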
* Called ultimately by .taste, from g_event thread */ static void virstor_check_and_run(struct g_virstor_softc *sc) { off_t off; size_t n, count; int index; int error; if (virstor_valid_components(sc) != sc->n_components) return; if (virstor_valid_components(sc) == 0) { /* This is actually a candidate for panic() */ LOG_MSG(LVL_ERROR, "No valid components for %s?", sc->geom->name); return; } sc->sectorsize = sc->components[0].gcons->provider->sectorsize; /* Initialise allocation map from the first consumer */ sc->chunk_count = sc->virsize / sc->chunk_size; if (sc->chunk_count * (off_t)sc->chunk_size != sc->virsize) { LOG_MSG(LVL_WARNING, "Device %s truncated to %ju bytes", sc->geom->name, sc->chunk_count * (off_t)sc->chunk_size); } sc->map_size = sc->chunk_count * sizeof *(sc->map); /* The following allocation is in order of 4MB - 8MB */ sc->map = malloc(sc->map_size, M_GVIRSTOR, M_WAITOK); KASSERT(sc->map != NULL, ("%s: Memory allocation error (%zu bytes) for %s", __func__, sc->map_size, sc->geom->name)); sc->map_sectors = sc->map_size / sc->sectorsize; count = 0; for (n = 0; n < sc->n_components; n++) count += sc->components[n].chunk_count; LOG_MSG(LVL_INFO, "Device %s has %zu physical chunks and %zu virtual " "(%zu KB chunks)", sc->geom->name, count, sc->chunk_count, sc->chunk_size / 1024); error = g_access(sc->components[0].gcons, 1, 0, 0); if (error != 0) { LOG_MSG(LVL_ERROR, "Cannot acquire read access for %s to " "read allocation map for %s", sc->components[0].gcons->provider->name, sc->geom->name); return; } /* Read in the allocation map */ LOG_MSG(LVL_DEBUG, "Reading map for %s from %s", sc->geom->name, sc->components[0].gcons->provider->name); off = count = n = 0; while (count < sc->map_size) { struct g_virstor_map_entry *mapbuf; size_t bs; bs = MIN(maxphys, sc->map_size - count); if (bs % sc->sectorsize != 0) { /* Check for alignment errors */ bs = rounddown(bs, sc->sectorsize); if (bs == 0) break; LOG_MSG(LVL_ERROR, "Trouble: map is not sector-aligned " "for %s on %s", sc->geom->name, sc->components[0].gcons->provider->name); } mapbuf = g_read_data(sc->components[0].gcons, off, bs, &error); if (mapbuf == NULL) { free(sc->map, M_GVIRSTOR); LOG_MSG(LVL_ERROR, "Error reading allocation map " "for %s from %s (offset %ju) (error %d)", sc->geom->name, sc->components[0].gcons->provider->name, off, error); return; } bcopy(mapbuf, &sc->map[n], bs); off += bs; count += bs; n += bs / sizeof *(sc->map); g_free(mapbuf); } g_access(sc->components[0].gcons, -1, 0, 0); LOG_MSG(LVL_DEBUG, "Read map for %s", sc->geom->name); /* find first component with allocatable chunks */ index = -1; for (n = 0; n < sc->n_components; n++) { if (sc->components[n].chunk_next < sc->components[n].chunk_count) { index = n; break; } } if (index == -1) /* not found? 
set it to the last component and handle it * later */ index = sc->n_components - 1; if (index >= sc->n_components - g_virstor_component_watermark - 1) { LOG_MSG(LVL_WARNING, "Device %s running out of components " "(%d/%u: %s)", sc->geom->name, index+1, sc->n_components, sc->components[index].gcons->provider->name); } sc->curr_component = index; if (sc->components[index].chunk_next >= sc->components[index].chunk_count - g_virstor_chunk_watermark) { LOG_MSG(LVL_WARNING, "Component %s of %s is running out of free space " "(%u chunks left)", sc->components[index].gcons->provider->name, sc->geom->name, sc->components[index].chunk_count - sc->components[index].chunk_next); } sc->me_per_sector = sc->sectorsize / sizeof *(sc->map); if (sc->sectorsize % sizeof *(sc->map) != 0) { LOG_MSG(LVL_ERROR, "%s: Map entries don't fit exactly in a sector (%s)", __func__, sc->geom->name); return; } /* Recalculate allocated chunks in components & at the same time * verify map data is sane. We could trust metadata on this, but * we want to make sure. */ for (n = 0; n < sc->n_components; n++) sc->components[n].chunk_next = sc->components[n].chunk_reserved; for (n = 0; n < sc->chunk_count; n++) { if (sc->map[n].provider_no >= sc->n_components || sc->map[n].provider_chunk >= sc->components[sc->map[n].provider_no].chunk_count) { LOG_MSG(LVL_ERROR, "%s: Invalid entry %u in map for %s", __func__, (u_int)n, sc->geom->name); LOG_MSG(LVL_ERROR, "%s: provider_no: %u, n_components: %u" " provider_chunk: %u, chunk_count: %u", __func__, sc->map[n].provider_no, sc->n_components, sc->map[n].provider_chunk, sc->components[sc->map[n].provider_no].chunk_count); return; } if (sc->map[n].flags & VIRSTOR_MAP_ALLOCATED) sc->components[sc->map[n].provider_no].chunk_next++; } sc->provider = g_new_providerf(sc->geom, "virstor/%s", sc->geom->name); sc->provider->sectorsize = sc->sectorsize; sc->provider->mediasize = sc->virsize; g_error_provider(sc->provider, 0); LOG_MSG(LVL_INFO, "%s activated", sc->provider->name); LOG_MSG(LVL_DEBUG, "%s starting with current component %u, starting " "chunk %u", sc->provider->name, sc->curr_component, sc->components[sc->curr_component].chunk_next); } /* * Returns count of active providers in this geom instance */ static u_int virstor_valid_components(struct g_virstor_softc *sc) { unsigned int nc, i; nc = 0; KASSERT(sc != NULL, ("%s: softc is NULL", __func__)); KASSERT(sc->components != NULL, ("%s: sc->components is NULL", __func__)); for (i = 0; i < sc->n_components; i++) if (sc->components[i].gcons != NULL) nc++; return (nc); } /* * Called when the consumer gets orphaned (?) */ static void g_virstor_orphan(struct g_consumer *cp) { struct g_virstor_softc *sc; struct g_virstor_component *comp; struct g_geom *gp; g_topology_assert(); gp = cp->geom; sc = gp->softc; if (sc == NULL) return; comp = cp->private; KASSERT(comp != NULL, ("%s: No component in private part of consumer", __func__)); remove_component(sc, comp, FALSE); if (LIST_EMPTY(&gp->consumer)) virstor_geom_destroy(sc, TRUE, FALSE); } /* * Called to notify geom when it's been opened, and for what intent */ static int g_virstor_access(struct g_provider *pp, int dr, int dw, int de) { struct g_consumer *c, *c2, *tmp; struct g_virstor_softc *sc; struct g_geom *gp; int error; KASSERT(pp != NULL, ("%s: NULL provider", __func__)); gp = pp->geom; KASSERT(gp != NULL, ("%s: NULL geom", __func__)); sc = gp->softc; /* Grab an exclusive bit to propagate on our consumers on first open */ if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0) de++; /* ... 
drop it on close */ if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) { de--; if (sc != NULL) update_metadata(sc); } error = ENXIO; LIST_FOREACH_SAFE(c, &gp->consumer, consumer, tmp) { error = g_access(c, dr, dw, de); if (error != 0) goto fail; if (c->acr == 0 && c->acw == 0 && c->ace == 0 && c->flags & G_CF_ORPHAN) { g_detach(c); g_destroy_consumer(c); } } if (sc != NULL && LIST_EMPTY(&gp->consumer)) virstor_geom_destroy(sc, TRUE, FALSE); return (error); fail: /* Backout earlier changes */ LIST_FOREACH(c2, &gp->consumer, consumer) { if (c2 == c) break; g_access(c2, -dr, -dw, -de); } return (error); } /* * Generate XML dump of current state */ static void g_virstor_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_virstor_softc *sc; g_topology_assert(); sc = gp->softc; if (sc == NULL || pp != NULL) return; if (cp != NULL) { /* For each component */ struct g_virstor_component *comp; comp = cp->private; if (comp == NULL) return; sbuf_printf(sb, "%s%u\n", indent, comp->index); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_count); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_next); sbuf_printf(sb, "%s%u\n", indent, comp->chunk_reserved); sbuf_printf(sb, "%s%u%%\n", indent, comp->chunk_next > 0 ? 100 - ((comp->chunk_next + comp->chunk_reserved) * 100) / comp->chunk_count : 100); } else { /* For the whole thing */ u_int count, used, i; off_t size; count = used = size = 0; for (i = 0; i < sc->n_components; i++) { if (sc->components[i].gcons != NULL) { count += sc->components[i].chunk_count; used += sc->components[i].chunk_next + sc->components[i].chunk_reserved; size += sc->components[i].gcons-> provider->mediasize; } } sbuf_printf(sb, "%s" "Components=%u, Online=%u\n", indent, sc->n_components, virstor_valid_components(sc)); sbuf_printf(sb, "%s%u%% physical free\n", indent, 100-(used * 100) / count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_size); sbuf_printf(sb, "%s%u%%\n", indent, used > 0 ? 
100 - (used * 100) / count : 100); sbuf_printf(sb, "%s%u\n", indent, count); sbuf_printf(sb, "%s%zu\n", indent, sc->chunk_count); sbuf_printf(sb, "%s%zu%%\n", indent, (count * 100) / sc->chunk_count); sbuf_printf(sb, "%s%jd\n", indent, size); sbuf_printf(sb, "%s%jd\n", indent, sc->virsize); } } /* * GEOM .done handler * Can't use standard handler because one requested IO may * fork into additional data IOs */ static void g_virstor_done(struct bio *b) { - struct g_virstor_softc *sc; struct bio *parent_b; parent_b = b->bio_parent; - sc = parent_b->bio_to->geom->softc; if (b->bio_error != 0) { LOG_MSG(LVL_ERROR, "Error %d for offset=%ju, length=%ju, %s", b->bio_error, b->bio_offset, b->bio_length, b->bio_to->name); if (parent_b->bio_error == 0) parent_b->bio_error = b->bio_error; } parent_b->bio_inbed++; parent_b->bio_completed += b->bio_completed; if (parent_b->bio_children == parent_b->bio_inbed) { parent_b->bio_completed = parent_b->bio_length; g_io_deliver(parent_b, parent_b->bio_error); } g_destroy_bio(b); } /* * I/O starts here * Called in g_down thread */ static void g_virstor_start(struct bio *b) { struct g_virstor_softc *sc; struct g_virstor_component *comp; struct bio *cb; struct g_provider *pp; char *addr; off_t offset, length; struct bio_queue_head bq; size_t chunk_size; /* cached for convenience */ u_int count; pp = b->bio_to; sc = pp->geom->softc; KASSERT(sc != NULL, ("%s: no softc (error=%d, device=%s)", __func__, b->bio_to->error, b->bio_to->name)); LOG_REQ(LVL_MOREDEBUG, b, "%s", __func__); switch (b->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; default: g_io_deliver(b, EOPNOTSUPP); return; } LOG_MSG(LVL_DEBUG2, "BIO arrived, size=%ju", b->bio_length); bioq_init(&bq); chunk_size = sc->chunk_size; addr = b->bio_data; offset = b->bio_offset; /* virtual offset and length */ length = b->bio_length; while (length > 0) { size_t chunk_index, in_chunk_offset, in_chunk_length; struct virstor_map_entry *me; chunk_index = offset / chunk_size; /* round downwards */ in_chunk_offset = offset % chunk_size; in_chunk_length = min(length, chunk_size - in_chunk_offset); LOG_MSG(LVL_DEBUG, "Mapped %s(%ju, %ju) to (%zu,%zu,%zu)", b->bio_cmd == BIO_READ ? "R" : "W", offset, length, chunk_index, in_chunk_offset, in_chunk_length); me = &sc->map[chunk_index]; if (b->bio_cmd == BIO_READ || b->bio_cmd == BIO_DELETE) { if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) { /* Reads from unallocated chunks return zeroed * buffers */ if (b->bio_cmd == BIO_READ) bzero(addr, in_chunk_length); } else { comp = &sc->components[me->provider_no]; cb = g_clone_bio(b); if (cb == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } cb->bio_to = comp->gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = (off_t)me->provider_chunk * (off_t)chunk_size + in_chunk_offset; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = comp; bioq_disksort(&bq, cb); } } else { /* handle BIO_WRITE */ KASSERT(b->bio_cmd == BIO_WRITE, ("%s: Unknown command %d", __func__, b->bio_cmd)); if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) { /* We have a virtual chunk, represented by * the "me" entry, but it's not yet allocated * (tied to) a physical chunk. So do it now. */ struct virstor_map_entry *data_me; u_int phys_chunk, comp_no; off_t s_offset; int error; error = allocate_chunk(sc, &comp, &comp_no, &phys_chunk); if (error != 0) { /* We cannot allocate a physical chunk * to satisfy this request, so we'll * delay it to when we can... 
* XXX: this will prevent the fs from * being umounted! */ struct g_virstor_bio_q *biq; biq = malloc(sizeof *biq, M_GVIRSTOR, M_NOWAIT); if (biq == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } biq->bio = b; mtx_lock(&sc->delayed_bio_q_mtx); STAILQ_INSERT_TAIL(&sc->delayed_bio_q, biq, linkage); mtx_unlock(&sc->delayed_bio_q_mtx); LOG_MSG(LVL_WARNING, "Delaying BIO " "(size=%ju) until free physical " "space can be found on %s", b->bio_length, sc->provider->name); return; } LOG_MSG(LVL_DEBUG, "Allocated chunk %u on %s " "for %s", phys_chunk, comp->gcons->provider->name, sc->provider->name); me->provider_no = comp_no; me->provider_chunk = phys_chunk; me->flags |= VIRSTOR_MAP_ALLOCATED; cb = g_clone_bio(b); if (cb == NULL) { me->flags &= ~VIRSTOR_MAP_ALLOCATED; me->provider_no = 0; me->provider_chunk = 0; bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } /* The allocation table is stored contiguously * at the start of the drive. We need to * calculate the offset of the sector that holds * this map entry both on the drive and in the * map array. * s_offset will end up pointing to the drive * sector. */ s_offset = chunk_index * sizeof *me; s_offset = rounddown(s_offset, sc->sectorsize); /* data_me points to map entry sector * in memory (analogous to offset) */ data_me = &sc->map[rounddown(chunk_index, sc->me_per_sector)]; /* Commit sector with map entry to storage */ cb->bio_to = sc->components[0].gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = s_offset; cb->bio_data = (char *)data_me; cb->bio_length = sc->sectorsize; cb->bio_caller1 = &sc->components[0]; bioq_disksort(&bq, cb); } comp = &sc->components[me->provider_no]; cb = g_clone_bio(b); if (cb == NULL) { bioq_dismantle(&bq); if (b->bio_error == 0) b->bio_error = ENOMEM; g_io_deliver(b, b->bio_error); return; } /* Finally, handle the data */ cb->bio_to = comp->gcons->provider; cb->bio_done = g_virstor_done; cb->bio_offset = (off_t)me->provider_chunk*(off_t)chunk_size + in_chunk_offset; cb->bio_length = in_chunk_length; cb->bio_data = addr; cb->bio_caller1 = comp; bioq_disksort(&bq, cb); } addr += in_chunk_length; length -= in_chunk_length; offset += in_chunk_length; } /* Fire off bio's here */ count = 0; for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) { bioq_remove(&bq, cb); LOG_REQ(LVL_MOREDEBUG, cb, "Firing request"); comp = cb->bio_caller1; cb->bio_caller1 = NULL; LOG_MSG(LVL_DEBUG, " firing bio, offset=%ju, length=%ju", cb->bio_offset, cb->bio_length); g_io_request(cb, comp->gcons); count++; } if (count == 0) { /* We handled everything locally */ b->bio_completed = b->bio_length; g_io_deliver(b, 0); } } /* * Allocate a chunk from a physical provider. Returns physical component, * chunk index relative to the component and the component's index. */ static int allocate_chunk(struct g_virstor_softc *sc, struct g_virstor_component **comp, u_int *comp_no_p, u_int *chunk) { u_int comp_no; KASSERT(sc->curr_component < sc->n_components, ("%s: Invalid curr_component: %u", __func__, sc->curr_component)); comp_no = sc->curr_component; *comp = &sc->components[comp_no]; dump_component(*comp); if ((*comp)->chunk_next >= (*comp)->chunk_count) { /* This component is full. 
Allocate next component */ if (comp_no >= sc->n_components-1) { LOG_MSG(LVL_ERROR, "All physical space allocated for %s", sc->geom->name); return (-1); } (*comp)->flags &= ~VIRSTOR_PROVIDER_CURRENT; sc->curr_component = ++comp_no; *comp = &sc->components[comp_no]; if (comp_no >= sc->n_components - g_virstor_component_watermark-1) LOG_MSG(LVL_WARNING, "Device %s running out of components " "(switching to %u/%u: %s)", sc->geom->name, comp_no+1, sc->n_components, (*comp)->gcons->provider->name); /* Take care not to overwrite reserved chunks */ if ( (*comp)->chunk_reserved > 0 && (*comp)->chunk_next < (*comp)->chunk_reserved) (*comp)->chunk_next = (*comp)->chunk_reserved; (*comp)->flags |= VIRSTOR_PROVIDER_ALLOCATED | VIRSTOR_PROVIDER_CURRENT; dump_component(*comp); *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } else { *comp_no_p = comp_no; *chunk = (*comp)->chunk_next++; } return (0); } /* Dump a component */ static void dump_component(struct g_virstor_component *comp) { if (g_virstor_debug < LVL_DEBUG2) return; printf("Component %d: %s\n", comp->index, comp->gcons->provider->name); printf(" chunk_count: %u\n", comp->chunk_count); printf(" chunk_next: %u\n", comp->chunk_next); printf(" flags: %u\n", comp->flags); } #if 0 /* Dump a map entry */ static void dump_me(struct virstor_map_entry *me, unsigned int nr) { if (g_virstor_debug < LVL_DEBUG) return; printf("VIRT. CHUNK #%d: ", nr); if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) printf("(unallocated)\n"); else printf("allocated at provider %u, provider_chunk %u\n", me->provider_no, me->provider_chunk); } #endif /* * Dismantle bio_queue and destroy its components */ static void bioq_dismantle(struct bio_queue_head *bq) { struct bio *b; for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) { bioq_remove(bq, b); g_destroy_bio(b); } } /* * The function that shouldn't be called. * When this is called, the stack is already garbled because of * argument mismatch. There's nothing to do now but panic, which is * accidentally the whole purpose of this function. * Motivation: to guard from accidentally calling geom methods when * they shouldn't be called. (see g_..._taste) */ static void invalid_call(void) { panic("invalid_call() has just been called. Something's fishy here."); } DECLARE_GEOM_CLASS(g_virstor_class, g_virstor); /* Let there be light */ MODULE_VERSION(geom_virstor, 0);
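
For reference, the virtual-to-physical decomposition that drives the main loop of g_virstor_start() can be exercised in isolation. The following is a minimal userland sketch (not part of the driver; the 4 MB chunk size and all names are illustrative) of how one request is split into per-chunk pieces, mirroring the chunk_index / in_chunk_offset / in_chunk_length calculation above:

	#include <stdio.h>
	#include <stdint.h>

	#define CHUNK_SIZE	(4 * 1024 * 1024)	/* illustrative chunk size */

	static uint64_t
	min64(uint64_t a, uint64_t b)
	{
		return (a < b ? a : b);
	}

	int
	main(void)
	{
		/* A 6 MB request starting 1 MB before a chunk boundary. */
		uint64_t offset = 3 * 1024 * 1024;
		uint64_t length = 6 * 1024 * 1024;

		while (length > 0) {
			uint64_t chunk_index = offset / CHUNK_SIZE;
			uint64_t in_chunk_offset = offset % CHUNK_SIZE;
			uint64_t in_chunk_length =
			    min64(length, CHUNK_SIZE - in_chunk_offset);

			/* In the driver this piece becomes one cloned BIO. */
			printf("chunk %ju: offset %ju, length %ju\n",
			    (uintmax_t)chunk_index, (uintmax_t)in_chunk_offset,
			    (uintmax_t)in_chunk_length);

			offset += in_chunk_length;
			length -= in_chunk_length;
		}
		return (0);
	}

Running it splits the request into three pieces (1 MB tail of chunk 0, all of chunk 1, 1 MB head of chunk 2), exactly the fan-out g_virstor_start() produces before bioq_disksort() and g_io_request() fire the cloned BIOs.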