Index: head/sys/geom/sched/g_sched.c =================================================================== --- head/sys/geom/sched/g_sched.c (revision 296605) +++ head/sys/geom/sched/g_sched.c (revision 296606) @@ -1,1725 +1,1725 @@ /*- * Copyright (c) 2009-2010 Fabio Checconi * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * Main control module for geom-based disk schedulers ('sched'). * * USER VIEW * A 'sched' node is typically inserted transparently between * an existing provider pp and its original geom gp * * [pp --> gp ..] * * using the command "geom sched insert " and * resulting in the following topology * * [pp --> sched_gp --> cp] [new_pp --> gp ... ] * * Deletion "geom sched destroy .sched." restores the * original chain. The normal "geom sched create " * is also supported. * * INTERNALS * Internally, the 'sched' uses the following data structures * * geom{} g_sched_softc{} g_gsched{} * +----------+ +---------------+ +-------------+ * | softc *-|--->| sc_gsched *-|-->| gs_init | * | ... | | | | gs_fini | * | | | [ hash table] | | gs_start | * +----------+ | | | ... | * | | +-------------+ * | | * | | g_*_softc{} * | | +-------------+ * | sc_data *-|-->| | * +---------------+ | algorithm- | * | specific | * +-------------+ * * A g_sched_softc{} is created with a "geom sched insert" call. * In turn this instantiates a specific scheduling algorithm, * which sets sc_gsched to point to the algorithm callbacks, * and calls gs_init() to create the g_*_softc{} . * The other callbacks (gs_start, gs_next, ...) are invoked * as needed * * g_sched_softc{} is defined in g_sched.h and mostly used here; * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h; * g_*_softc{} is defined/implemented by each algorithm (gs_*.c) * * DATA MOVING * When a bio is received on the provider, it goes to the * g_sched_start() which calls gs_start() to initially queue it; * then we call g_sched_dispatch() that loops around gs_next() * to select zero or more bio's to be sent downstream. * * g_sched_dispatch() can also be called as a result of a timeout, * e.g. when doing anticipation or pacing requests. * * When a bio comes back, it goes to g_sched_done() which in turn * calls gs_done(). 
The latter does any necessary housekeeping in * the scheduling algorithm, and may decide to call g_sched_dispatch() * to send more bio's downstream. * * If an algorithm needs per-flow queues, these are created * calling gs_init_class() and destroyed with gs_fini_class(), * and they are also inserted in the hash table implemented in * the g_sched_softc{} * * If an algorithm is replaced, or a transparently-inserted node is * removed with "geom sched destroy", we need to remove all references * to the g_*_softc{} and g_sched_softc from the bio's still in * the scheduler. g_sched_forced_dispatch() helps doing this. * XXX need to explain better. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* we access curthread */ #include #include "gs_scheduler.h" #include "g_sched.h" /* geom hooks */ /* * Size of the per-geom hash table storing traffic classes. * We may decide to change it at a later time, it has no ABI * implications as it is only used for run-time allocations. */ #define G_SCHED_HASH_SIZE 32 static int g_sched_destroy(struct g_geom *gp, boolean_t force); static int g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static void g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb); static struct g_geom *g_sched_taste(struct g_class *mp, struct g_provider *pp, int flags __unused); static void g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_sched_init(struct g_class *mp); static void g_sched_fini(struct g_class *mp); static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td); struct g_class g_sched_class = { .name = G_SCHED_CLASS_NAME, .version = G_VERSION, .ctlreq = g_sched_config, .taste = g_sched_taste, .destroy_geom = g_sched_destroy_geom, .init = g_sched_init, .ioctl = g_sched_ioctl, .fini = g_sched_fini }; MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures"); /* * Global variables describing the state of the geom_sched module. * There is only one static instance of this structure. */ LIST_HEAD(gs_list, g_gsched); /* type, link field */ struct geom_sched_vars { struct mtx gs_mtx; struct gs_list gs_scheds; /* list of algorithms */ u_int gs_debug; u_int gs_sched_count; /* how many algorithms ? */ u_int gs_patched; /* g_io_request was patched */ u_int gs_initialized; u_int gs_expire_secs; /* expiration of hash entries */ struct bio_queue_head gs_pending; u_int gs_npending; /* The following are for stats, usually protected by gs_mtx. 
*/ u_long gs_requests; /* total requests */ u_long gs_done; /* total done */ u_int gs_in_flight; /* requests in flight */ u_int gs_writes_in_flight; u_int gs_bytes_in_flight; u_int gs_write_bytes_in_flight; char gs_names[256]; /* names of schedulers */ }; static struct geom_sched_vars me = { .gs_expire_secs = 10, }; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0, "GEOM_SCHED stuff"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD, &me.gs_write_bytes_in_flight, 0, "Write bytes in flight"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD, &me.gs_bytes_in_flight, 0, "Bytes in flight"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD, &me.gs_writes_in_flight, 0, "Write Requests in flight"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD, &me.gs_in_flight, 0, "Requests in flight"); SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD, &me.gs_done, 0, "Total done"); SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD, &me.gs_requests, 0, "Total requests"); SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD, &me.gs_names, 0, "Algorithm names"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD, &me.gs_sched_count, 0, "Number of algorithms"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW, &me.gs_debug, 0, "Debug level"); SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW, &me.gs_expire_secs, 0, "Expire time in seconds"); /* * g_sched calls the scheduler algorithms with this lock held. * The locking functions are exposed so the scheduler algorithms can also * protect themselves e.g. when running a callout handler. */ void g_sched_lock(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; mtx_lock(&sc->sc_mtx); } void g_sched_unlock(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; mtx_unlock(&sc->sc_mtx); } /* * Support functions to handle references to the module, * which are coming from devices using this scheduler. */ static inline void g_gsched_ref(struct g_gsched *gsp) { atomic_add_int(&gsp->gs_refs, 1); } static inline void g_gsched_unref(struct g_gsched *gsp) { atomic_add_int(&gsp->gs_refs, -1); } /* * Update the stats when this request is done. */ static void g_sched_update_stats(struct bio *bio) { me.gs_done++; me.gs_in_flight--; me.gs_bytes_in_flight -= bio->bio_length; - if (bio->bio_cmd & BIO_WRITE) { + if (bio->bio_cmd == BIO_WRITE) { me.gs_writes_in_flight--; me.gs_write_bytes_in_flight -= bio->bio_length; } } /* * Dispatch any pending request. */ static void g_sched_forced_dispatch(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; struct bio *bp; KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during forced dispatch")); while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL) g_io_request(bp, LIST_FIRST(&gp->consumer)); } /* * The main dispatch loop, called either here after the start * routine, or by scheduling algorithms when they receive a timeout * or a 'done' notification. Does not share code with the forced * dispatch path, since the gs_done() callback can call us. 
*/ void g_sched_dispatch(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; struct bio *bp; KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch")); if ((sc->sc_flags & G_SCHED_FLUSHING)) return; while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL) g_io_request(bp, LIST_FIRST(&gp->consumer)); } /* * Recent (8.0 and above) versions of FreeBSD have support to * register classifiers of disk requests. The classifier is * invoked by g_io_request(), and stores the information into * bp->bio_classifier1. * * Support for older versions, which is left here only for * documentation purposes, relies on two hacks: * 1. classification info is written into the bio_caller1 * field of the topmost node in the bio chain. This field * is rarely used, but this module is incompatible with * those that use bio_caller1 for other purposes, * such as ZFS and gjournal; * 2. g_io_request() is patched in-memory when the module is * loaded, so that the function calls a classifier as its * first thing. g_io_request() is restored when the module * is unloaded. This functionality is only supported for * x86 and amd64, other architectures need source code changes. */ /* * Lookup the identity of the issuer of the original request. * In the current implementation we use the curthread of the * issuer, but different mechanisms may be implemented later * so we do not make assumptions on the return value which for * us is just an opaque identifier. */ static inline u_long g_sched_classify(struct bio *bp) { /* we have classifier fields in the struct bio */ return ((u_long)bp->bio_classifier1); } /* Return the hash chain for the given key. */ static inline struct g_hash * g_sched_hash(struct g_sched_softc *sc, u_long key) { return (&sc->sc_hash[key & sc->sc_mask]); } /* * Helper function for the children classes, which takes * a geom and a bio and returns the private descriptor * associated to the request. This involves fetching * the classification field and [al]locating the * corresponding entry in the hash table. */ void * g_sched_get_class(struct g_geom *gp, struct bio *bp) { struct g_sched_softc *sc; struct g_sched_class *gsc; struct g_gsched *gsp; struct g_hash *bucket; u_long key; sc = gp->softc; key = g_sched_classify(bp); bucket = g_sched_hash(sc, key); LIST_FOREACH(gsc, bucket, gsc_clist) { if (key == gsc->gsc_key) { gsc->gsc_refs++; return (gsc->gsc_priv); } } gsp = sc->sc_gsched; gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size, M_GEOM_SCHED, M_NOWAIT | M_ZERO); if (!gsc) return (NULL); if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) { free(gsc, M_GEOM_SCHED); return (NULL); } gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. 
*/ gsc->gsc_key = key; LIST_INSERT_HEAD(bucket, gsc, gsc_clist); gsc->gsc_expire = ticks + me.gs_expire_secs * hz; return (gsc->gsc_priv); } /* * Release a reference to the per-client descriptor, */ void g_sched_put_class(struct g_geom *gp, void *priv) { struct g_sched_class *gsc; struct g_sched_softc *sc; gsc = g_sched_priv2class(priv); gsc->gsc_expire = ticks + me.gs_expire_secs * hz; if (--gsc->gsc_refs > 0) return; sc = gp->softc; sc->sc_gsched->gs_fini_class(sc->sc_data, priv); LIST_REMOVE(gsc, gsc_clist); free(gsc, M_GEOM_SCHED); } static void g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, struct g_gsched *gsp, void *data) { struct g_sched_class *cp, *cp2; int i; if (!hp) return; if (data && gsp->gs_hash_unref) gsp->gs_hash_unref(data); for (i = 0; i < G_SCHED_HASH_SIZE; i++) { LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) g_sched_put_class(gp, cp->gsc_priv); } hashdestroy(hp, M_GEOM_SCHED, mask); } static struct g_hash * g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) { struct g_hash *hash; if (gsp->gs_priv_size == 0) return (NULL); hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); return (hash); } static void g_sched_flush_classes(struct g_geom *gp) { struct g_sched_softc *sc; struct g_sched_class *cp, *cp2; int i; sc = gp->softc; if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) return; for (i = 0; i < G_SCHED_HASH_SIZE; i++) { LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) g_sched_put_class(gp, cp->gsc_priv); } } sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; } /* * Wait for the completion of any outstanding request. To ensure * that this does not take forever the caller has to make sure that * no new request enter the scehduler before calling us. * * Must be called with the gp mutex held and topology locked. */ static int g_sched_wait_pending(struct g_geom *gp) { struct g_sched_softc *sc = gp->softc; int endticks = ticks + hz; g_topology_assert(); while (sc->sc_pending && endticks - ticks >= 0) msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); return (sc->sc_pending ? ETIMEDOUT : 0); } static int g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp) { struct g_sched_softc *sc = gp->softc; int error; /* Set the flushing flag: new bios will not enter the scheduler. */ sc->sc_flags |= G_SCHED_FLUSHING; g_sched_forced_dispatch(gp); error = g_sched_wait_pending(gp); if (error) goto failed; /* No more requests pending or in flight from the old gsp. */ g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); sc->sc_hash = NULL; /* * Avoid deadlock here by releasing the gp mutex and reacquiring * it once done. It should be safe, since no reconfiguration or * destruction can take place due to the geom topology lock; no * new request can use the current sc_data since we flagged the * geom as being flushed. */ g_sched_unlock(gp); gsp->gs_fini(sc->sc_data); g_sched_lock(gp); sc->sc_gsched = NULL; sc->sc_data = NULL; g_gsched_unref(gsp); failed: sc->sc_flags &= ~G_SCHED_FLUSHING; return (error); } static int g_sched_remove(struct g_geom *gp, struct g_gsched *gsp) { int error; g_sched_lock(gp); error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */ g_sched_unlock(gp); return (error); } /* * Support function for create/taste -- locate the desired * algorithm and grab a reference to it. 
*/ static struct g_gsched * g_gsched_find(const char *name) { struct g_gsched *gsp = NULL; mtx_lock(&me.gs_mtx); LIST_FOREACH(gsp, &me.gs_scheds, glist) { if (strcmp(name, gsp->gs_name) == 0) { g_gsched_ref(gsp); break; } } mtx_unlock(&me.gs_mtx); return (gsp); } /* * Rebuild the list of scheduler names. * To be called with me.gs_mtx lock held. */ static void g_gsched_build_names(struct g_gsched *gsp) { int pos, l; struct g_gsched *cur; pos = 0; LIST_FOREACH(cur, &me.gs_scheds, glist) { l = strlen(cur->gs_name); if (l + pos + 1 + 1 < sizeof(me.gs_names)) { if (pos != 0) me.gs_names[pos++] = ' '; strcpy(me.gs_names + pos, cur->gs_name); pos += l; } } me.gs_names[pos] = '\0'; } /* * Register or unregister individual scheduling algorithms. */ static int g_gsched_register(struct g_gsched *gsp) { struct g_gsched *cur; int error = 0; mtx_lock(&me.gs_mtx); LIST_FOREACH(cur, &me.gs_scheds, glist) { if (strcmp(gsp->gs_name, cur->gs_name) == 0) break; } if (cur != NULL) { G_SCHED_DEBUG(0, "A scheduler named %s already" "exists.", gsp->gs_name); error = EEXIST; } else { LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist); gsp->gs_refs = 1; me.gs_sched_count++; g_gsched_build_names(gsp); } mtx_unlock(&me.gs_mtx); return (error); } struct g_gsched_unregparm { struct g_gsched *gup_gsp; int gup_error; }; static void g_gsched_unregister(void *arg, int flag) { struct g_gsched_unregparm *parm = arg; struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp; struct g_sched_softc *sc; struct g_geom *gp, *gp_tmp; int error; parm->gup_error = 0; g_topology_assert(); if (flag == EV_CANCEL) return; mtx_lock(&me.gs_mtx); LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) { if (gp->class != &g_sched_class) continue; /* Should not happen. */ sc = gp->softc; if (sc->sc_gsched == gsp) { error = g_sched_remove(gp, gsp); if (error) goto failed; } } LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) { if (cur != gsp) continue; if (gsp->gs_refs != 1) { G_SCHED_DEBUG(0, "%s still in use.", gsp->gs_name); parm->gup_error = EBUSY; } else { LIST_REMOVE(gsp, glist); me.gs_sched_count--; g_gsched_build_names(gsp); } break; } if (cur == NULL) { G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name); parm->gup_error = ENOENT; } failed: mtx_unlock(&me.gs_mtx); } static inline void g_gsched_global_init(void) { if (!me.gs_initialized) { G_SCHED_DEBUG(0, "Initializing global data."); mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF); LIST_INIT(&me.gs_scheds); bioq_init(&me.gs_pending); me.gs_initialized = 1; } } /* * Module event called when a scheduling algorithm module is loaded or * unloaded. */ int g_gsched_modevent(module_t mod, int cmd, void *arg) { struct g_gsched *gsp = arg; struct g_gsched_unregparm parm; int error; G_SCHED_DEBUG(0, "Modevent %d.", cmd); /* * If the module is loaded at boot, the geom thread that calls * g_sched_init() might actually run after g_gsched_modevent(), * so make sure that the module is properly initialized. 
*/ g_gsched_global_init(); error = EOPNOTSUPP; switch (cmd) { case MOD_LOAD: error = g_gsched_register(gsp); G_SCHED_DEBUG(0, "Loaded module %s error %d.", gsp->gs_name, error); if (error == 0) g_retaste(&g_sched_class); break; case MOD_UNLOAD: parm.gup_gsp = gsp; parm.gup_error = 0; error = g_waitfor_event(g_gsched_unregister, &parm, M_WAITOK, NULL); if (error == 0) error = parm.gup_error; G_SCHED_DEBUG(0, "Unloaded module %s error %d.", gsp->gs_name, error); break; }; return (error); } #ifdef KTR #define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp) static inline char g_sched_type(struct bio *bp) { - if (0 != (bp->bio_cmd & BIO_READ)) + if (bp->bio_cmd == BIO_READ) return ('R'); - else if (0 != (bp->bio_cmd & BIO_WRITE)) + else if (bp->bio_cmd == BIO_WRITE) return ('W'); return ('U'); } static inline void g_sched_trace_bio_START(struct bio *bp) { CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp), g_sched_type(bp), bp->bio_offset / ULONG_MAX, bp->bio_offset, bp->bio_length); } static inline void g_sched_trace_bio_DONE(struct bio *bp) { CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp), g_sched_type(bp), bp->bio_offset / ULONG_MAX, bp->bio_offset, bp->bio_length); } #else /* !KTR */ #define TRC_BIO_EVENT(e, bp) #endif /* !KTR */ /* * g_sched_done() and g_sched_start() dispatch the geom requests to * the scheduling algorithm in use. */ static void g_sched_done(struct bio *bio) { struct g_geom *gp = bio->bio_caller2; struct g_sched_softc *sc = gp->softc; TRC_BIO_EVENT(DONE, bio); KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done")); g_sched_lock(gp); g_sched_update_stats(bio); sc->sc_gsched->gs_done(sc->sc_data, bio); if (!--sc->sc_pending) wakeup(gp); g_sched_flush_classes(gp); g_sched_unlock(gp); g_std_done(bio); } static void g_sched_start(struct bio *bp) { struct g_geom *gp = bp->bio_to->geom; struct g_sched_softc *sc = gp->softc; struct bio *cbp; TRC_BIO_EVENT(START, bp); G_SCHED_LOGREQ(bp, "Request received."); cbp = g_clone_bio(bp); if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } cbp->bio_done = g_sched_done; cbp->bio_to = LIST_FIRST(&gp->provider); KASSERT(cbp->bio_to != NULL, ("NULL provider")); /* We only schedule reads and writes. */ - if (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE))) + if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE) goto bypass; G_SCHED_LOGREQ(cbp, "Sending request."); g_sched_lock(gp); /* * Call the algorithm's gs_start to queue the request in the * scheduler. If gs_start fails then pass the request down, * otherwise call g_sched_dispatch() which tries to push * one or more requests down. */ if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) || sc->sc_gsched->gs_start(sc->sc_data, cbp)) { g_sched_unlock(gp); goto bypass; } /* * We use bio_caller1 to mark requests that are scheduled * so make sure it is not NULL. */ if (cbp->bio_caller1 == NULL) cbp->bio_caller1 = &me; /* anything not NULL */ cbp->bio_caller2 = gp; sc->sc_pending++; /* Update general stats. */ me.gs_in_flight++; me.gs_requests++; me.gs_bytes_in_flight += bp->bio_length; - if (bp->bio_cmd & BIO_WRITE) { + if (bp->bio_cmd == BIO_WRITE) { me.gs_writes_in_flight++; me.gs_write_bytes_in_flight += bp->bio_length; } g_sched_dispatch(gp); g_sched_unlock(gp); return; bypass: cbp->bio_done = g_std_done; cbp->bio_caller1 = NULL; /* not scheduled */ g_io_request(cbp, LIST_FIRST(&gp->consumer)); } /* * The next few functions are the geom glue. 
*/ static void g_sched_orphan(struct g_consumer *cp) { g_topology_assert(); g_sched_destroy(cp->geom, 1); } static int g_sched_access(struct g_provider *pp, int dr, int dw, int de) { struct g_geom *gp; struct g_consumer *cp; int error; gp = pp->geom; cp = LIST_FIRST(&gp->consumer); error = g_access(cp, dr, dw, de); return (error); } static void g_sched_temporary_start(struct bio *bio) { mtx_lock(&me.gs_mtx); me.gs_npending++; bioq_disksort(&me.gs_pending, bio); mtx_unlock(&me.gs_mtx); } static void g_sched_flush_pending(g_start_t *start) { struct bio *bp; while ((bp = bioq_takefirst(&me.gs_pending))) start(bp); } static int g_insert_proxy(struct g_geom *gp, struct g_provider *newpp, struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp) { struct g_sched_softc *sc = gp->softc; g_start_t *saved_start, *flush = g_sched_start; int error = 0, endticks = ticks + hz; g_cancel_event(newpp); /* prevent taste() */ /* copy private fields */ newpp->private = pp->private; newpp->index = pp->index; /* Queue all the early requests coming for us. */ me.gs_npending = 0; saved_start = pp->geom->start; dstgp->start = g_sched_temporary_start; while (pp->nstart - pp->nend != me.gs_npending && endticks - ticks >= 0) tsleep(pp, PRIBIO, "-", hz/10); if (pp->nstart - pp->nend != me.gs_npending) { flush = saved_start; error = ETIMEDOUT; goto fail; } /* link pp to this geom */ LIST_REMOVE(pp, provider); pp->geom = gp; LIST_INSERT_HEAD(&gp->provider, pp, provider); /* * replicate the counts from the parent in the * new provider and consumer nodes */ cp->acr = newpp->acr = pp->acr; cp->acw = newpp->acw = pp->acw; cp->ace = newpp->ace = pp->ace; sc->sc_flags |= G_SCHED_PROXYING; fail: dstgp->start = saved_start; g_sched_flush_pending(flush); return (error); } /* * Create a geom node for the device passed as *pp. * If successful, add a reference to this gsp. */ static int g_sched_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, struct g_gsched *gsp, int proxy) { struct g_sched_softc *sc = NULL; struct g_geom *gp, *dstgp; struct g_provider *newpp = NULL; struct g_consumer *cp = NULL; char name[64]; int error; g_topology_assert(); snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX); LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) { gctl_error(req, "Geom %s already exists.", name); return (EEXIST); } } gp = g_new_geomf(mp, "%s", name); dstgp = proxy ? pp->geom : gp; /* where do we link the provider */ sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); sc->sc_gsched = gsp; sc->sc_data = gsp->gs_init(gp); if (sc->sc_data == NULL) { error = ENOMEM; goto fail; } sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK); /* * Do not initialize the flush mechanism, will be initialized * on the first insertion on the hash table. */ mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF); gp->softc = sc; gp->start = g_sched_start; gp->orphan = g_sched_orphan; gp->access = g_sched_access; gp->dumpconf = g_sched_dumpconf; newpp = g_new_providerf(dstgp, "%s", gp->name); newpp->mediasize = pp->mediasize; newpp->sectorsize = pp->sectorsize; cp = g_new_consumer(gp); error = g_attach(cp, proxy ? 
newpp : pp); if (error != 0) { gctl_error(req, "Cannot attach to provider %s.", pp->name); goto fail; } g_error_provider(newpp, 0); if (proxy) { error = g_insert_proxy(gp, newpp, dstgp, pp, cp); if (error) goto fail; } G_SCHED_DEBUG(0, "Device %s created.", gp->name); g_gsched_ref(gsp); return (0); fail: if (cp != NULL) { if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); } if (newpp != NULL) g_destroy_provider(newpp); if (sc->sc_hash) g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); if (sc->sc_data) gsp->gs_fini(sc->sc_data); g_free(gp->softc); g_destroy_geom(gp); return (error); } /* * Support for dynamic switching of scheduling algorithms. * First initialize the data structures for the new algorithm, * then call g_sched_remove_locked() to flush all references * to the old one, finally link the new algorithm. */ static int g_sched_change_algo(struct gctl_req *req, struct g_class *mp, struct g_provider *pp, struct g_gsched *gsp) { struct g_sched_softc *sc; struct g_geom *gp; struct g_hash *newh; void *data; u_long mask; int error = 0; gp = pp->geom; sc = gp->softc; data = gsp->gs_init(gp); if (data == NULL) return (ENOMEM); newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK); if (gsp->gs_priv_size && !newh) { error = ENOMEM; goto fail; } g_sched_lock(gp); if (sc->sc_gsched) { /* can be NULL in some cases */ error = g_sched_remove_locked(gp, sc->sc_gsched); if (error) goto fail; } g_gsched_ref(gsp); sc->sc_gsched = gsp; sc->sc_data = data; sc->sc_hash = newh; sc->sc_mask = mask; g_sched_unlock(gp); return (0); fail: if (newh) g_sched_hash_fini(gp, newh, mask, gsp, data); if (data) gsp->gs_fini(data); g_sched_unlock(gp); return (error); } /* * Stop the request flow directed to the proxy, redirecting the new * requests to the me.gs_pending queue. */ static struct g_provider * g_detach_proxy(struct g_geom *gp) { struct g_consumer *cp; struct g_provider *pp, *newpp; do { pp = LIST_FIRST(&gp->provider); if (pp == NULL) break; cp = LIST_FIRST(&gp->consumer); if (cp == NULL) break; newpp = cp->provider; if (newpp == NULL) break; me.gs_npending = 0; pp->geom->start = g_sched_temporary_start; return (pp); } while (0); printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name); return (NULL); } static void g_sched_blackhole(struct bio *bp) { g_io_deliver(bp, ENXIO); } static inline void g_reparent_provider(struct g_provider *pp, struct g_geom *gp, struct g_provider *newpp) { LIST_REMOVE(pp, provider); if (newpp) { pp->private = newpp->private; pp->index = newpp->index; } pp->geom = gp; LIST_INSERT_HEAD(&gp->provider, pp, provider); } static inline void g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp) { struct g_geom *gp = oldpp->geom; g_reparent_provider(oldpp, newpp->geom, newpp); /* * Hackish: let the system destroy the old provider for us, just * in case someone attached a consumer to it, in which case a * direct call to g_destroy_provider() would not work. */ g_reparent_provider(newpp, gp, NULL); } /* * Complete the proxy destruction, linking the old provider to its * original geom, and destroying the proxy provider. Also take care * of issuing the pending requests collected in me.gs_pending (if any). */ static int g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp) { struct g_consumer *cp; struct g_provider *newpp; do { cp = LIST_FIRST(&gp->consumer); if (cp == NULL) break; newpp = cp->provider; if (newpp == NULL) break; /* Relink the provider to its original geom. 
*/ g_unproxy_provider(oldpp, newpp); /* Detach consumer from provider, and destroy provider. */ cp->acr = newpp->acr = 0; cp->acw = newpp->acw = 0; cp->ace = newpp->ace = 0; g_detach(cp); /* Send the pending bios through the right start function. */ g_sched_flush_pending(oldpp->geom->start); return (0); } while (0); printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name); /* We cannot send the pending bios anywhere... */ g_sched_flush_pending(g_sched_blackhole); return (EINVAL); } static int g_sched_destroy(struct g_geom *gp, boolean_t force) { struct g_provider *pp, *oldpp = NULL; struct g_sched_softc *sc; struct g_gsched *gsp; int error; g_topology_assert(); sc = gp->softc; if (sc == NULL) return (ENXIO); if (!(sc->sc_flags & G_SCHED_PROXYING)) { pp = LIST_FIRST(&gp->provider); if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { const char *msg = force ? "but we force removal" : "cannot remove"; G_SCHED_DEBUG(!force, "Device %s is still open (r%dw%de%d), %s.", pp->name, pp->acr, pp->acw, pp->ace, msg); if (!force) return (EBUSY); } else { G_SCHED_DEBUG(0, "Device %s removed.", gp->name); } } else oldpp = g_detach_proxy(gp); gsp = sc->sc_gsched; if (gsp) { /* * XXX bad hack here: force a dispatch to release * any reference to the hash table still held by * the scheduler. */ g_sched_lock(gp); /* * We are dying here, no new requests should enter * the scheduler. This is granted by the topolgy, * either in case we were proxying (new bios are * being redirected) or not (see the access check * above). */ g_sched_forced_dispatch(gp); error = g_sched_wait_pending(gp); if (error) { /* * Not all the requests came home: this might happen * under heavy load, or if we were waiting for any * bio which is served in the event path (see * geom_slice.c for an example of how this can * happen). Try to restore a working configuration * if we can fail. */ if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { g_sched_flush_pending(force ? g_sched_blackhole : g_sched_start); } /* * In the forced destroy case there is not so much * we can do, we have pending bios that will call * g_sched_done() somehow, and we don't want them * to crash the system using freed memory. We tell * the user that something went wrong, and leak some * memory here. * Note: the callers using force = 1 ignore the * return value. */ if (force) { G_SCHED_DEBUG(0, "Pending requests while " " destroying geom, some memory leaked."); } return (error); } g_sched_unlock(gp); g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); sc->sc_hash = NULL; gsp->gs_fini(sc->sc_data); g_gsched_unref(gsp); sc->sc_gsched = NULL; } if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { error = g_destroy_proxy(gp, oldpp); if (error) { if (force) { G_SCHED_DEBUG(0, "Unrecoverable error while " "destroying a proxy geom, leaking some " " memory."); } return (error); } } mtx_destroy(&sc->sc_mtx); g_free(gp->softc); gp->softc = NULL; g_wither_geom(gp, ENXIO); return (error); } static int g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { return (g_sched_destroy(gp, 0)); } /* * Functions related to the classification of requests. * * On recent FreeBSD versions (8.0 and above), we store a reference * to the issuer of a request in bp->bio_classifier1 as soon * as the bio is posted to the geom queue (and not later, because * requests are managed by the g_down thread afterwards). */ /* * Classifier support for recent FreeBSD versions: we use * a very simple classifier, only use curthread to tag a request. 
* The classifier is registered at module load, and unregistered * at module unload. */ static int g_sched_tag(void *arg, struct bio *bp) { bp->bio_classifier1 = curthread; return (1); } static struct g_classifier_hook g_sched_classifier = { .func = g_sched_tag, }; static inline void g_classifier_ini(void) { g_register_classifier(&g_sched_classifier); } static inline void g_classifier_fini(void) { g_unregister_classifier(&g_sched_classifier); } static void g_sched_init(struct g_class *mp) { g_gsched_global_init(); G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.", mp, &g_sched_class); /* Patch g_io_request to store classification info in the bio. */ g_classifier_ini(); } static void g_sched_fini(struct g_class *mp) { g_classifier_fini(); G_SCHED_DEBUG(0, "Unloading..."); KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers")); mtx_destroy(&me.gs_mtx); } static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td) { struct g_consumer *cp; struct g_geom *gp; cp = LIST_FIRST(&pp->geom->consumer); if (cp == NULL) return (ENOIOCTL); gp = cp->provider->geom; if (gp->ioctl == NULL) return (ENOIOCTL); return (gp->ioctl(cp->provider, cmd, data, fflag, td)); } /* * Read the i-th argument for a request, skipping the /dev/ * prefix if present. */ static const char * g_sched_argi(struct gctl_req *req, int i) { static const char *dev_prefix = "/dev/"; const char *name; char param[16]; int l = strlen(dev_prefix); snprintf(param, sizeof(param), "arg%d", i); name = gctl_get_asciiparam(req, param); if (name == NULL) gctl_error(req, "No 'arg%d' argument", i); else if (strncmp(name, dev_prefix, l) == 0) name += l; return (name); } /* * Fetch nargs and do appropriate checks. */ static int g_sched_get_nargs(struct gctl_req *req) { int *nargs; nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); if (nargs == NULL) { gctl_error(req, "No 'nargs' argument"); return (0); } if (*nargs <= 0) gctl_error(req, "Missing device(s)."); return (*nargs); } /* * Check whether we should add the class on certain volumes when * this geom is created. Right now this is under control of a kenv * variable containing the names of all devices that we care about. * Probably we should only support transparent insertion as the * preferred mode of operation. */ static struct g_geom * g_sched_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_gsched *gsp = NULL; /* the . algorithm we want */ const char *s; /* generic string pointer */ const char *taste_names; /* devices we like */ int l; g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); g_topology_assert(); G_SCHED_DEBUG(2, "Tasting %s.", pp->name); do { /* do not taste on ourselves */ if (pp->geom->class == mp) break; taste_names = kern_getenv("geom.sched.taste"); if (taste_names == NULL) break; l = strlen(pp->name); for (s = taste_names; *s && (s = strstr(s, pp->name)); s++) { /* further checks for an exact match */ if ( (s == taste_names || s[-1] == ' ') && (s[l] == '\0' || s[l] == ' ') ) break; } if (s == NULL) break; G_SCHED_DEBUG(0, "Attach device %s match [%s]\n", pp->name, s); /* look up the provider name in the list */ s = kern_getenv("geom.sched.algo"); if (s == NULL) s = "rr"; gsp = g_gsched_find(s); /* also get a reference */ if (gsp == NULL) { G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s); break; } /* XXX create with 1 as last argument ? 
*/ g_sched_create(NULL, mp, pp, gsp, 0); g_gsched_unref(gsp); } while (0); return NULL; } static void g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy) { struct g_provider *pp; struct g_gsched *gsp; const char *name; int i, nargs; g_topology_assert(); name = gctl_get_asciiparam(req, "algo"); if (name == NULL) { gctl_error(req, "No '%s' argument", "algo"); return; } gsp = g_gsched_find(name); /* also get a reference */ if (gsp == NULL) { gctl_error(req, "Bad algorithm '%s'", name); return; } nargs = g_sched_get_nargs(req); /* * Run on the arguments, and break on any error. * We look for a device name, but skip the /dev/ prefix if any. */ for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; pp = g_provider_by_name(name); if (pp == NULL) { G_SCHED_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); break; } if (g_sched_create(req, mp, pp, gsp, proxy) != 0) break; } g_gsched_unref(gsp); } static void g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp) { struct g_provider *pp; struct g_gsched *gsp; const char *name; int i, nargs; g_topology_assert(); name = gctl_get_asciiparam(req, "algo"); if (name == NULL) { gctl_error(req, "No '%s' argument", "algo"); return; } gsp = g_gsched_find(name); /* also get a reference */ if (gsp == NULL) { gctl_error(req, "Bad algorithm '%s'", name); return; } nargs = g_sched_get_nargs(req); /* * Run on the arguments, and break on any error. * We look for a device name, but skip the /dev/ prefix if any. */ for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; pp = g_provider_by_name(name); if (pp == NULL || pp->geom->class != mp) { G_SCHED_DEBUG(1, "Provider %s is invalid.", name); gctl_error(req, "Provider %s is invalid.", name); break; } if (g_sched_change_algo(req, mp, pp, gsp) != 0) break; } g_gsched_unref(gsp); } static struct g_geom * g_sched_find_geom(struct g_class *mp, const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &mp->geom, geom) { if (strcmp(gp->name, name) == 0) return (gp); } return (NULL); } static void g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp) { int nargs, *force, error, i; struct g_geom *gp; const char *name; g_topology_assert(); nargs = g_sched_get_nargs(req); force = gctl_get_paraml(req, "force", sizeof(*force)); if (force == NULL) { gctl_error(req, "No 'force' argument"); return; } for (i = 0; i < nargs; i++) { name = g_sched_argi(req, i); if (name == NULL) break; gp = g_sched_find_geom(mp, name); if (gp == NULL) { G_SCHED_DEBUG(1, "Device %s is invalid.", name); gctl_error(req, "Device %s is invalid.", name); break; } error = g_sched_destroy(gp, *force); if (error != 0) { gctl_error(req, "Cannot destroy device %s (error=%d).", gp->name, error); break; } } } static void g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb) { uint32_t *version; g_topology_assert(); version = gctl_get_paraml(req, "version", sizeof(*version)); if (version == NULL) { gctl_error(req, "No '%s' argument.", "version"); return; } if (*version != G_SCHED_VERSION) { gctl_error(req, "Userland and kernel parts are " "out of sync."); return; } if (strcmp(verb, "create") == 0) { g_sched_ctl_create(req, mp, 0); return; } else if (strcmp(verb, "insert") == 0) { g_sched_ctl_create(req, mp, 1); return; } else if (strcmp(verb, "configure") == 0) { g_sched_ctl_configure(req, mp); return; } else if (strcmp(verb, "destroy") == 0) { g_sched_ctl_destroy(req, mp); return; } gctl_error(req, "Unknown 
verb."); } static void g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_sched_softc *sc = gp->softc; struct g_gsched *gsp = sc->sc_gsched; if (indent == NULL) { /* plaintext */ sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--"); } if (gsp != NULL && gsp->gs_dumpconf) gsp->gs_dumpconf(sb, indent, gp, cp, pp); } DECLARE_GEOM_CLASS(g_sched_class, g_sched); MODULE_VERSION(geom_sched, 0); Index: head/sys/geom/sched/gs_rr.c =================================================================== --- head/sys/geom/sched/gs_rr.c (revision 296605) +++ head/sys/geom/sched/gs_rr.c (revision 296606) @@ -1,699 +1,699 @@ /*- * Copyright (c) 2009-2010 Fabio Checconi * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * $FreeBSD$ * * A round-robin (RR) anticipatory scheduler, with per-client queues. * * The goal of this implementation is to improve throughput compared * to the pure elevator algorithm, and insure some fairness among * clients. * * Requests coming from the same client are put in the same queue. * We use anticipation to help reducing seeks, and each queue * is never served continuously for more than a given amount of * time or data. Queues are then served in a round-robin fashion. * * Each queue can be in any of the following states: * READY immediately serve the first pending request; * BUSY one request is under service, wait for completion; * IDLING do not serve incoming requests immediately, unless * they are "eligible" as defined later. * * Scheduling is made looking at the status of all queues, * and the first one in round-robin order is privileged. */ #include #include #include #include #include #include #include #include #include #include #include #include "gs_scheduler.h" /* possible states of the scheduler */ enum g_rr_state { G_QUEUE_READY = 0, /* Ready to dispatch. */ G_QUEUE_BUSY, /* Waiting for a completion. */ G_QUEUE_IDLING /* Waiting for a new request. */ }; /* possible queue flags */ enum g_rr_flags { /* G_FLAG_COMPLETED means that the field q_slice_end is valid. */ G_FLAG_COMPLETED = 1, /* Completed a req. in the current budget. 
*/ }; struct g_rr_softc; /* * Queue descriptor, containing reference count, scheduling * state, a queue of pending requests, configuration parameters. * Queues with pending request(s) and not under service are also * stored in a Round Robin (RR) list. */ struct g_rr_queue { struct g_rr_softc *q_sc; /* link to the parent */ enum g_rr_state q_status; unsigned int q_service; /* service received so far */ int q_slice_end; /* actual slice end time, in ticks */ enum g_rr_flags q_flags; /* queue flags */ struct bio_queue_head q_bioq; /* Scheduling parameters */ unsigned int q_budget; /* slice size in bytes */ unsigned int q_slice_duration; /* slice size in ticks */ unsigned int q_wait_ticks; /* wait time for anticipation */ /* Stats to drive the various heuristics. */ struct g_savg q_thinktime; /* Thinktime average. */ struct g_savg q_seekdist; /* Seek distance average. */ int q_bionum; /* Number of requests. */ off_t q_lastoff; /* Last submitted req. offset. */ int q_lastsub; /* Last submitted req. time. */ /* Expiration deadline for an empty queue. */ int q_expire; TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */ }; /* List types. */ TAILQ_HEAD(g_rr_tailq, g_rr_queue); /* list of scheduler instances */ LIST_HEAD(g_scheds, g_rr_softc); /* Default quantum for RR between queues. */ #define G_RR_DEFAULT_BUDGET 0x00800000 /* * Per device descriptor, holding the Round Robin list of queues * accessing the disk, a reference to the geom, and the timer. */ struct g_rr_softc { struct g_geom *sc_geom; /* * sc_active is the queue we are anticipating for. * It is set only in gs_rr_next(), and possibly cleared * only in gs_rr_next() or on a timeout. * The active queue is never in the Round Robin list * even if it has requests queued. */ struct g_rr_queue *sc_active; struct callout sc_wait; /* timer for sc_active */ struct g_rr_tailq sc_rr_tailq; /* the round-robin list */ int sc_nqueues; /* number of queues */ /* Statistics */ int sc_in_flight; /* requests in the driver */ LIST_ENTRY(g_rr_softc) sc_next; }; /* Descriptor for bounded values, min and max are constant. */ struct x_bound { const int x_min; int x_cur; const int x_max; }; /* * parameters, config and stats */ struct g_rr_params { int queues; /* total number of queues */ int w_anticipate; /* anticipate writes */ int bypass; /* bypass scheduling writes */ int units; /* how many instances */ /* sc_head is used for debugging */ struct g_scheds sc_head; /* first scheduler instance */ struct x_bound queue_depth; /* max parallel requests */ struct x_bound wait_ms; /* wait time, milliseconds */ struct x_bound quantum_ms; /* quantum size, milliseconds */ struct x_bound quantum_kb; /* quantum size, Kb (1024 bytes) */ /* statistics */ int wait_hit; /* success in anticipation */ int wait_miss; /* failure in anticipation */ }; /* * Default parameters for the scheduler. The quantum sizes target * a 80MB/s disk; if the hw is faster or slower the minimum of the * two will have effect: the clients will still be isolated but * the fairness may be limited. A complete solution would involve * the on-line measurement of the actual disk throughput to derive * these parameters. Or we may just choose to ignore service domain * fairness and accept what can be achieved with time-only budgets. 
*/ static struct g_rr_params me = { .sc_head = LIST_HEAD_INITIALIZER(&me.sc_head), .w_anticipate = 1, .queue_depth = { 1, 1, 50 }, .wait_ms = { 1, 10, 30 }, .quantum_ms = { 1, 100, 500 }, .quantum_kb = { 16, 8192, 65536 }, }; struct g_rr_params *gs_rr_me = &me; SYSCTL_DECL(_kern_geom_sched); static SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0, "GEOM_SCHED ROUND ROBIN stuff"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD, &me.units, 0, "Scheduler instances"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD, &me.queues, 0, "Total rr queues"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW, &me.wait_ms.x_cur, 0, "Wait time milliseconds"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW, &me.quantum_ms.x_cur, 0, "Quantum size milliseconds"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW, &me.bypass, 0, "Bypass scheduler"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW, &me.w_anticipate, 0, "Do anticipation on writes"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW, &me.quantum_kb.x_cur, 0, "Quantum size Kbytes"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW, &me.queue_depth.x_cur, 0, "Maximum simultaneous requests"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW, &me.wait_hit, 0, "Hits in anticipation"); SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW, &me.wait_miss, 0, "Misses in anticipation"); #ifdef DEBUG_QUEUES /* print the status of a queue */ static void gs_rr_dump_q(struct g_rr_queue *qp, int index) { int l = 0; struct bio *bp; TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) { l++; } printf("--- rr queue %d %p status %d len %d ---\n", index, qp, qp->q_status, l); } /* * Dump the scheduler status when writing to this sysctl variable. * XXX right now we only dump the status of the last instance created. * not a severe issue because this is only for debugging */ static int gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS) { int error, val = 0; struct g_rr_softc *sc; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); printf("called %s\n", __FUNCTION__); LIST_FOREACH(sc, &me.sc_head, sc_next) { int i, tot = 0; printf("--- sc %p active %p nqueues %d " "callout %d in_flight %d ---\n", sc, sc->sc_active, sc->sc_nqueues, callout_active(&sc->sc_wait), sc->sc_in_flight); for (i = 0; i < G_RR_HASH_SIZE; i++) { struct g_rr_queue *qp; LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) { gs_rr_dump_q(qp, tot); tot++; } } } return (0); } SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(int), gs_rr_sysctl_status, "I", "status"); #endif /* DEBUG_QUEUES */ /* * Get a bounded value, optionally convert to a min of t_min ticks. */ static int get_bounded(struct x_bound *v, int t_min) { int x; x = v->x_cur; if (x < v->x_min) x = v->x_min; else if (x > v->x_max) x = v->x_max; if (t_min) { x = x * hz / 1000; /* convert to ticks */ if (x < t_min) x = t_min; } return x; } /* * Get a reference to the queue for bp, using the generic * classification mechanism. */ static struct g_rr_queue * g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp) { return (g_sched_get_class(sc->sc_geom, bp)); } static int g_rr_init_class(void *data, void *priv) { struct g_rr_softc *sc = data; struct g_rr_queue *qp = priv; bioq_init(&qp->q_bioq); /* * Set the initial parameters for the client: * slice size in bytes and ticks, and wait ticks. 
* Right now these are constant, but we could have * autoconfiguration code to adjust the values based on * the actual workload. */ qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0); qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); qp->q_sc = sc; /* link to the parent */ qp->q_sc->sc_nqueues++; me.queues++; return (0); } /* * Release a reference to the queue. */ static void g_rr_queue_put(struct g_rr_queue *qp) { g_sched_put_class(qp->q_sc->sc_geom, qp); } static void g_rr_fini_class(void *data, void *priv) { struct g_rr_queue *qp = priv; KASSERT(bioq_first(&qp->q_bioq) == NULL, ("released nonempty queue")); qp->q_sc->sc_nqueues--; me.queues--; } static inline int g_rr_queue_expired(struct g_rr_queue *qp) { if (qp->q_service >= qp->q_budget) return (1); if ((qp->q_flags & G_FLAG_COMPLETED) && ticks - qp->q_slice_end >= 0) return (1); return (0); } static inline int g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp) { int wait = get_bounded(&me.wait_ms, 2); - if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE)) + if (!me.w_anticipate && (bp->bio_cmd == BIO_WRITE)) return (0); if (g_savg_valid(&qp->q_thinktime) && g_savg_read(&qp->q_thinktime) > wait) return (0); if (g_savg_valid(&qp->q_seekdist) && g_savg_read(&qp->q_seekdist) > 8192) return (0); return (1); } /* * Called on a request arrival, timeout or completion. * Try to serve a request among those queued. */ static struct bio * g_rr_next(void *data, int force) { struct g_rr_softc *sc = data; struct g_rr_queue *qp; struct bio *bp, *next; int expired; qp = sc->sc_active; if (me.bypass == 0 && !force) { if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0)) return (NULL); /* Try with the queue under service first. */ if (qp != NULL && qp->q_status != G_QUEUE_READY) { /* * Queue is anticipating, ignore request. * We should check that we are not past * the timeout, but in that case the timeout * will fire immediately afterwards so we * don't bother. */ return (NULL); } } else if (qp != NULL && qp->q_status != G_QUEUE_READY) { g_rr_queue_put(qp); sc->sc_active = qp = NULL; } /* * No queue under service, look for the first in RR order. * If we find it, select if as sc_active, clear service * and record the end time of the slice. */ if (qp == NULL) { qp = TAILQ_FIRST(&sc->sc_rr_tailq); if (qp == NULL) return (NULL); /* no queues at all, return */ /* otherwise select the new queue for service. */ TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq); sc->sc_active = qp; qp->q_service = 0; qp->q_flags &= ~G_FLAG_COMPLETED; } bp = bioq_takefirst(&qp->q_bioq); /* surely not NULL */ qp->q_service += bp->bio_length; /* charge the service */ /* * The request at the head of the active queue is always * dispatched, and gs_rr_next() will be called again * immediately. * We need to prepare for what to do next: * * 1. have we reached the end of the (time or service) slice ? * If so, clear sc_active and possibly requeue the previous * active queue if it has more requests pending; * 2. do we have more requests in sc_active ? * If yes, do not anticipate, as gs_rr_next() will run again; * if no, decide whether or not to anticipate depending * on read or writes (e.g., anticipate only on reads). */ expired = g_rr_queue_expired(qp); /* are we expired ? */ next = bioq_first(&qp->q_bioq); /* do we have one more ? */ if (expired) { sc->sc_active = NULL; /* Either requeue or release reference. 
*/ if (next != NULL) TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq); else g_rr_queue_put(qp); } else if (next != NULL) { qp->q_status = G_QUEUE_READY; } else { if (!force && g_rr_should_anticipate(qp, bp)) { /* anticipate */ qp->q_status = G_QUEUE_BUSY; } else { /* do not anticipate, release reference */ g_rr_queue_put(qp); sc->sc_active = NULL; } } /* If sc_active != NULL, its q_status is always correct. */ sc->sc_in_flight++; return (bp); } static inline void g_rr_update_thinktime(struct g_rr_queue *qp) { int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2); if (qp->q_sc->sc_active != qp) return; qp->q_lastsub = ticks; delta = (delta > 2 * wait) ? 2 * wait : delta; if (qp->q_bionum > 7) g_savg_add_sample(&qp->q_thinktime, delta); } static inline void g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp) { off_t dist; if (qp->q_lastoff > bp->bio_offset) dist = qp->q_lastoff - bp->bio_offset; else dist = bp->bio_offset - qp->q_lastoff; if (dist > (8192 * 8)) dist = 8192 * 8; qp->q_lastoff = bp->bio_offset + bp->bio_length; if (qp->q_bionum > 7) g_savg_add_sample(&qp->q_seekdist, dist); } /* * Called when a real request for disk I/O arrives. * Locate the queue associated with the client. * If the queue is the one we are anticipating for, reset its timeout; * if the queue is not in the round robin list, insert it in the list. * On any error, do not queue the request and return -1, the caller * will take care of this request. */ static int g_rr_start(void *data, struct bio *bp) { struct g_rr_softc *sc = data; struct g_rr_queue *qp; if (me.bypass) return (-1); /* bypass the scheduler */ /* Get the queue for the request. */ qp = g_rr_queue_get(sc, bp); if (qp == NULL) return (-1); /* allocation failed, tell upstream */ if (bioq_first(&qp->q_bioq) == NULL) { /* * We are inserting into an empty queue. * Reset its state if it is sc_active, * otherwise insert it in the RR list. */ if (qp == sc->sc_active) { qp->q_status = G_QUEUE_READY; callout_stop(&sc->sc_wait); } else { g_sched_priv_ref(qp); TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq); } } qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3); g_rr_update_thinktime(qp); g_rr_update_seekdist(qp, bp); /* Inherit the reference returned by g_rr_queue_get(). */ bp->bio_caller1 = qp; bioq_disksort(&qp->q_bioq, bp); return (0); } /* * Callout executed when a queue times out anticipating a new request. */ static void g_rr_wait_timeout(void *data) { struct g_rr_softc *sc = data; struct g_geom *geom = sc->sc_geom; g_sched_lock(geom); /* * We can race with other events, so check if * sc_active is still valid. */ if (sc->sc_active != NULL) { /* Release the reference to the queue. */ g_rr_queue_put(sc->sc_active); sc->sc_active = NULL; me.wait_hit--; me.wait_miss++; /* record the miss */ } g_sched_dispatch(geom); g_sched_unlock(geom); } /* * Module glue: allocate descriptor, initialize its fields. */ static void * g_rr_init(struct g_geom *geom) { struct g_rr_softc *sc; /* XXX check whether we can sleep */ sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO); sc->sc_geom = geom; TAILQ_INIT(&sc->sc_rr_tailq); callout_init(&sc->sc_wait, 1); LIST_INSERT_HEAD(&me.sc_head, sc, sc_next); me.units++; return (sc); } /* * Module glue -- drain the callout structure, destroy the * hash table and its element, and free the descriptor. 
*/ static void g_rr_fini(void *data) { struct g_rr_softc *sc = data; callout_drain(&sc->sc_wait); KASSERT(sc->sc_active == NULL, ("still a queue under service")); KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues")); LIST_REMOVE(sc, sc_next); me.units--; free(sc, M_GEOM_SCHED); } /* * Called when the request under service terminates. * Start the anticipation timer if needed. */ static void g_rr_done(void *data, struct bio *bp) { struct g_rr_softc *sc = data; struct g_rr_queue *qp; sc->sc_in_flight--; qp = bp->bio_caller1; /* * When the first request for this queue completes, update the * duration and end of the slice. We do not do it when the * slice starts to avoid charging to the queue the time for * the first seek. */ if (!(qp->q_flags & G_FLAG_COMPLETED)) { qp->q_flags |= G_FLAG_COMPLETED; /* * recompute the slice duration, in case we want * to make it adaptive. This is not used right now. * XXX should we do the same for q_quantum and q_wait_ticks ? */ qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); qp->q_slice_end = ticks + qp->q_slice_duration; } if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) { /* The queue is trying anticipation, start the timer. */ qp->q_status = G_QUEUE_IDLING; /* may make this adaptive */ qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); me.wait_hit++; callout_reset(&sc->sc_wait, qp->q_wait_ticks, g_rr_wait_timeout, sc); } else g_sched_dispatch(sc->sc_geom); /* Release a reference to the queue. */ g_rr_queue_put(qp); } static void g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { if (indent == NULL) { /* plaintext */ sbuf_printf(sb, " units %d queues %d", me.units, me.queues); } } static struct g_gsched g_rr = { .gs_name = "rr", .gs_priv_size = sizeof(struct g_rr_queue), .gs_init = g_rr_init, .gs_fini = g_rr_fini, .gs_start = g_rr_start, .gs_done = g_rr_done, .gs_next = g_rr_next, .gs_dumpconf = g_rr_dumpconf, .gs_init_class = g_rr_init_class, .gs_fini_class = g_rr_fini_class, }; DECLARE_GSCHED_MODULE(rr, &g_rr);
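
Editorial illustration, not part of revision 296606: the substantive change in the hunks above is that tests such as (bp->bio_cmd & BIO_WRITE) and (0 == (bp->bio_cmd & (BIO_READ | BIO_WRITE))) become equality comparisons (bp->bio_cmd == BIO_WRITE, and bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE). The small standalone program below sketches why the bitwise form stops being safe once bio_cmd carries a single command code rather than a set of independent flag bits. The numeric values are hypothetical stand-ins chosen to show the failure mode; they are not the real sys/bio.h definitions.

/*
 * Standalone sketch (userland, compiles with any C99 compiler).
 * XBIO_* values are hypothetical command codes, not the sys/bio.h ones.
 */
#include <stdio.h>

#define	XBIO_READ	1
#define	XBIO_WRITE	2
#define	XBIO_DELETE	3	/* not a power of two: its bits overlap READ and WRITE */

static const char *
classify_bitwise(int cmd)
{
	/* Old style: treats the command field as a bitmask. */
	if (cmd & XBIO_READ)
		return ("read");
	if (cmd & XBIO_WRITE)
		return ("write");
	return ("other");
}

static const char *
classify_equality(int cmd)
{
	/* New style, as in the diff: compare against one command code. */
	if (cmd == XBIO_READ)
		return ("read");
	if (cmd == XBIO_WRITE)
		return ("write");
	return ("other");
}

int
main(void)
{
	int cmds[] = { XBIO_READ, XBIO_WRITE, XBIO_DELETE };

	for (size_t i = 0; i < sizeof(cmds) / sizeof(cmds[0]); i++)
		printf("cmd %d: bitwise=%s equality=%s\n", cmds[i],
		    classify_bitwise(cmds[i]), classify_equality(cmds[i]));
	/* cmd 3 is misclassified as "read" by the bitwise test. */
	return (0);
}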
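
For context on the callback interface the diff exercises (gs_init/gs_fini/gs_start/gs_next/gs_done registered through struct g_gsched and DECLARE_GSCHED_MODULE), here is a minimal hypothetical "fifo" algorithm modeled on gs_rr.c above. It is an assumption-laden sketch, not code from the repository: the gs_fifo_* names are invented, the include list is abbreviated and assumed, and per-client classes are deliberately unused (gs_priv_size = 0), so the gs_init_class/gs_fini_class callbacks are omitted.

/*
 * Hypothetical minimal "fifo" scheduler sketch for the gs_scheduler.h
 * callback interface.  One global queue, no anticipation, no per-client
 * classes.  Include list is an assumption.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/malloc.h>

#include "gs_scheduler.h"

struct gs_fifo_softc {
	struct g_geom		*sc_geom;	/* owning geom */
	struct bio_queue_head	 sc_queue;	/* single FIFO of pending bios */
};

static void *
gs_fifo_init(struct g_geom *geom)
{
	struct gs_fifo_softc *sc;

	sc = malloc(sizeof(*sc), M_GEOM_SCHED, M_NOWAIT | M_ZERO);
	if (sc == NULL)
		return (NULL);
	sc->sc_geom = geom;
	bioq_init(&sc->sc_queue);
	return (sc);
}

static void
gs_fifo_fini(void *data)
{

	free(data, M_GEOM_SCHED);
}

/* Queue the request; a non-zero return would make g_sched bypass us. */
static int
gs_fifo_start(void *data, struct bio *bp)
{
	struct gs_fifo_softc *sc = data;

	bioq_insert_tail(&sc->sc_queue, bp);
	return (0);
}

/* Hand the oldest pending request back to g_sched_dispatch(). */
static struct bio *
gs_fifo_next(void *data, int force __unused)
{
	struct gs_fifo_softc *sc = data;

	return (bioq_takefirst(&sc->sc_queue));
}

/* No accounting to do; just try to push more requests down. */
static void
gs_fifo_done(void *data, struct bio *bp __unused)
{
	struct gs_fifo_softc *sc = data;

	g_sched_dispatch(sc->sc_geom);
}

static struct g_gsched gs_fifo = {
	.gs_name = "fifo",
	.gs_priv_size = 0,	/* no per-client class state */
	.gs_init = gs_fifo_init,
	.gs_fini = gs_fifo_fini,
	.gs_start = gs_fifo_start,
	.gs_done = gs_fifo_done,
	.gs_next = gs_fifo_next,
};

DECLARE_GSCHED_MODULE(fifo, &gs_fifo);

Because gs_fifo_next() always hands back the head of the queue, g_sched_dispatch() drains it immediately, so this degenerates into a pass-through; a real algorithm such as the round-robin one above withholds bios in gs_next() to implement anticipation and per-client fairness.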