Index: head/lib/geom/sched/gsched.8
===================================================================
--- head/lib/geom/sched/gsched.8	(revision 356184)
+++ head/lib/geom/sched/gsched.8	(nonexistent)
@@ -1,162 +0,0 @@
-.\" Copyright (c) 2009-2010 Fabio Checconi
-.\" Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
-.\" All rights reserved.
-.\"
-.\" Redistribution and use in source and binary forms, with or without
-.\" modification, are permitted provided that the following conditions
-.\" are met:
-.\" 1. Redistributions of source code must retain the above copyright
-.\"    notice, this list of conditions and the following disclaimer.
-.\" 2. Redistributions in binary form must reproduce the above copyright
-.\"    notice, this list of conditions and the following disclaimer in the
-.\"    documentation and/or other materials provided with the distribution.
-.\"
-.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
-.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
-.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-.\" SUCH DAMAGE.
-.\"
-.\" $FreeBSD$
-.\"
-.Dd July 26, 2012
-.Dt GSCHED 8
-.Os
-.Sh NAME
-.Nm gsched
-.Nd "control utility for disk scheduler GEOM class"
-.Sh SYNOPSIS
-.Nm
-.Cm create
-.Op Fl v
-.Op Fl a Ar algorithm
-.Ar provider ...
-.Nm
-.Cm insert
-.Op Fl v
-.Op Fl a Ar algorithm
-.Ar provider ...
-.Nm
-.Cm configure
-.Op Fl v
-.Op Fl a Ar algorithm
-.Ar node ...
-.Nm
-.Cm destroy
-.Op Fl fv
-.Ar node ...
-.Nm
-.Cm reset
-.Op Fl v
-.Ar node ...
-.Nm
-.Cm { list | status | load | unload }
-.Sh DESCRIPTION
-The
-.Nm
-utility (also callable as
-.Nm geom sched ... )
-changes the scheduling policy of the requests going to a provider.
-.Pp
-The first argument to
-.Nm
-indicates an action to be performed:
-.Bl -tag -width ".Cm configure"
-.It Cm create
-Create a new provider and geom node using the specified scheduling algorithm.
-.Ar algorithm
-is the name of the scheduling algorithm used for the provider.
-Available algorithms include:
-.Ar rr ,
-which implements anticipatory scheduling with round robin service
-among clients;
-.Ar as ,
-which implements a simple form of anticipatory scheduling with
-no per-client queue.
-.Pp
-If the operation succeeds, the new provider should appear with name
-.Pa /dev/ Ns Ao Ar dev Ac Ns Pa .sched. .
-The kernel module
-.Pa geom_sched.ko
-will be loaded if it is not loaded already.
-.It Cm insert
-Operates as "create", but the insertion is "transparent",
-i.e., the existing provider is rerouted to the newly created geom,
-which in turn forwards requests to the existing geom.
-This operation allows one to start or stop a scheduling service
-on an already existing provider.
-.Pp
-A subsequent "destroy" will remove the newly created geom and
-hook the provider back to the original geom.
-.It Cm configure
-Configure an existing scheduling provider.
-It supports the same options as the
-.Cm create
-command.
-.It Cm destroy
-Destroy the geom specified in the parameter.
-.It Cm reset
-Do nothing.
-.It Cm list | status | load | unload
-See
-.Xr geom 8 .
-.El
-.Pp
-Additional options:
-.Bl -tag -width ".Fl f"
-.It Fl f
-Force the removal of the specified provider.
-.It Fl v
-Be more verbose.
-.El
-.Sh SYSCTL VARIABLES
-The following
-.Xr sysctl 8
-variables can be used to control the behavior of the
-.Nm SCHED
-GEOM class.
-The default value is shown next to each variable.
-.Bl -tag -width indent
-.It Va kern.geom.sched.debug : No 0
-Debug level of the
-.Nm SCHED
-GEOM class.
-This can be set to a number between 0 and 2 inclusive.
-If set to 0, minimal debug information is printed; if set to 2, the
-maximum amount of debug information is printed.
-.El
-.Sh EXIT STATUS
-Exit status is 0 on success, and 1 if the command fails.
-.Sh EXAMPLES
-The following example shows how to create a scheduling provider for disk
-.Pa /dev/ada0 ,
-and how to destroy it.
-.Bd -literal -offset indent
-# Load the geom_sched module:
-kldload geom_sched
-# Load some scheduler classes used by geom_sched:
-kldload gsched_rr
-# Configure device ada0 to use scheduler "rr":
-geom sched insert -a rr ada0
-# Now provider ada0 uses the "rr" algorithm;
-# the new geom is ada0.sched.
-# Remove the scheduler on the device:
-geom sched destroy -v ada0.sched.
-.Ed
-.Sh SEE ALSO
-.Xr geom 4 ,
-.Xr geom 8
-.Sh HISTORY
-The
-.Nm
-utility first appeared in
-.Fx 8.1 .
-.Sh AUTHORS
-.An Fabio Checconi Aq Mt fabio@FreeBSD.org
-.An Luigi Rizzo Aq Mt luigi@FreeBSD.org

Property changes on: head/lib/geom/sched/gsched.8
___________________________________________________________________
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property

Index: head/lib/geom/sched/Makefile.depend
===================================================================
--- head/lib/geom/sched/Makefile.depend	(revision 356184)
+++ head/lib/geom/sched/Makefile.depend	(nonexistent)
@@ -1,19 +0,0 @@
-# $FreeBSD$
-# Autogenerated - do NOT edit!
-
-DIRDEPS = \
-	gnu/lib/csu \
-	include \
-	include/xlocale \
-	lib/${CSU_DIR} \
-	lib/libc \
-	lib/libcompiler_rt \
-	lib/libgeom \
-	sbin/geom/core \
-
-
-.include <dirdeps.mk>
-
-.if ${DEP_RELDIR} == ${_DEP_RELDIR}
-# local dependencies - needed for -jN in clean tree
-.endif

Property changes on: head/lib/geom/sched/Makefile.depend
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property

Index: head/lib/geom/sched/geom_sched.c
===================================================================
--- head/lib/geom/sched/geom_sched.c	(revision 356184)
+++ head/lib/geom/sched/geom_sched.c	(nonexistent)
@@ -1,128 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2009 Fabio Checconi
- * Copyright (c) 2010 Luigi Rizzo, Universita` di Pisa
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2.
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $Id$ - * $FreeBSD$ - * - * This file implements the userspace library used by the 'geom' - * command to load and manipulate disk schedulers. - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include "core/geom.h" -#include "misc/subr.h" - -#define G_SCHED_VERSION 0 - -uint32_t lib_version = G_LIB_VERSION; -uint32_t version = G_SCHED_VERSION; - -/* - * storage for parameters used by this geom class. - * Right now only the scheduler name is used. - */ -#define GSCHED_ALGO "rr" /* default scheduler */ - -/* - * Adapt to differences in geom library. - * in V1 struct g_command misses gc_argname, eld, and G_BOOL is undefined - */ -#if G_LIB_VERSION <= 1 -#define G_TYPE_BOOL G_TYPE_NUMBER -#endif -#if G_LIB_VERSION >= 3 && G_LIB_VERSION <= 4 -#define G_ARGNAME NULL, -#else -#define G_ARGNAME -#endif - -static void -gcmd_createinsert(struct gctl_req *req, unsigned flags __unused) -{ - const char *reqalgo; - char name[64]; - - if (gctl_has_param(req, "algo")) - reqalgo = gctl_get_ascii(req, "algo"); - else - reqalgo = GSCHED_ALGO; - - snprintf(name, sizeof(name), "gsched_%s", reqalgo); - /* - * Do not complain about errors here, gctl_issue() - * will fail anyway. - */ - if (modfind(name) < 0) - kldload(name); - gctl_issue(req); -} - -struct g_command class_commands[] = { - { "create", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert, - { - { 'a', "algo", GSCHED_ALGO, G_TYPE_STRING }, - G_OPT_SENTINEL - }, - G_ARGNAME "[-v] [-a algorithm_name] dev ..." - }, - { "insert", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert, - { - { 'a', "algo", GSCHED_ALGO, G_TYPE_STRING }, - G_OPT_SENTINEL - }, - G_ARGNAME "[-v] [-a algorithm_name] dev ..." - }, - { "configure", G_FLAG_VERBOSE, NULL, - { - { 'a', "algo", GSCHED_ALGO, G_TYPE_STRING }, - G_OPT_SENTINEL - }, - G_ARGNAME "[-v] [-a algorithm_name] prov ..." - }, - { "destroy", G_FLAG_VERBOSE, NULL, - { - { 'f', "force", NULL, G_TYPE_BOOL }, - G_OPT_SENTINEL - }, - G_ARGNAME "[-fv] prov ..." - }, - { "reset", G_FLAG_VERBOSE, NULL, G_NULL_OPTS, - G_ARGNAME "[-v] prov ..." 
-	},
-	G_CMD_SENTINEL
-};

Property changes on: head/lib/geom/sched/geom_sched.c
___________________________________________________________________
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property

Index: head/lib/geom/sched/Makefile
===================================================================
--- head/lib/geom/sched/Makefile	(revision 356184)
+++ head/lib/geom/sched/Makefile	(nonexistent)
@@ -1,9 +0,0 @@
-# GEOM_LIBRARY_PATH
-# $FreeBSD$
-
-PACKAGE=runtime
-.PATH: ${.CURDIR:H:H}/misc
-
-GEOM_CLASS=	sched
-
-.include <bsd.lib.mk>

Property changes on: head/lib/geom/sched/Makefile
___________________________________________________________________
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property

Index: head/lib/geom/Makefile.classes
===================================================================
--- head/lib/geom/Makefile.classes	(revision 356184)
+++ head/lib/geom/Makefile.classes	(revision 356185)
@@ -1,26 +1,25 @@
 # $FreeBSD$
 
 .if !defined(COMPAT_32BIT)
 GEOM_CLASS_DIR?=/lib/geom
 .else
 GEOM_CLASS_DIR?=/usr/lib32/geom
 .endif
 
 GEOM_CLASSES=	cache
 GEOM_CLASSES+=	concat
 .if ${MK_OPENSSL} != "no"
 GEOM_CLASSES+=	eli
 .endif
 GEOM_CLASSES+=	journal
 GEOM_CLASSES+=	label
 GEOM_CLASSES+=	mirror
 GEOM_CLASSES+=	mountver
 GEOM_CLASSES+=	multipath
 GEOM_CLASSES+=	nop
 GEOM_CLASSES+=	part
 GEOM_CLASSES+=	raid
 GEOM_CLASSES+=	raid3
-GEOM_CLASSES+=	sched
 GEOM_CLASSES+=	shsec
 GEOM_CLASSES+=	stripe
 GEOM_CLASSES+=	virstor

Index: head/sys/geom/sched/g_sched.h
===================================================================
--- head/sys/geom/sched/g_sched.h	(revision 356184)
+++ head/sys/geom/sched/g_sched.h	(nonexistent)
@@ -1,111 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2009-2010 Fabio Checconi
- * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _G_SCHED_H_
-#define _G_SCHED_H_
-
-/*
- * $Id$
- * $FreeBSD$
- *
- * Header for the geom_sched class (userland library and kernel part).
- * See g_sched.c for documentation.
- * The userland code only needs the three G_SCHED_* values below.
- */
-
-#define	G_SCHED_CLASS_NAME	"SCHED"
-#define	G_SCHED_VERSION		0
-#define	G_SCHED_SUFFIX		".sched."
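[The three definitions above are the whole userland contract: the class name, its
version, and the suffix behind the /dev/<dev>.sched. names documented in gsched.8.
A minimal sketch, editorial and not part of the commit (the helper name is
invented), of how a userland consumer could derive the scheduled provider path:]

	#include <stdio.h>

	#define G_SCHED_SUFFIX	".sched."	/* as defined in g_sched.h above */

	/* Hypothetical helper: map "ada0" to "/dev/ada0.sched.". */
	static void
	sched_provider_path(const char *dev, char *buf, size_t len)
	{
		snprintf(buf, len, "/dev/%s%s", dev, G_SCHED_SUFFIX);
	}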
-
-#ifdef _KERNEL
-#define	G_SCHED_DEBUG(lvl, ...)	\
-	_GEOM_DEBUG("GEOM_SCHED", me.gs_debug, (lvl), NULL, __VA_ARGS__)
-#define	G_SCHED_LOGREQ(bp, ...)	\
-	_GEOM_DEBUG("GEOM_SCHED", me.gs_debug, 2, (bp), __VA_ARGS__)
-
-LIST_HEAD(g_hash, g_sched_class);
-
-/*
- * Descriptor of a scheduler.
- * In addition to the obvious fields, sc_flushing and sc_pending
- * support dynamic switching of scheduling algorithm.
- * Normally, sc_flushing is 0, and requests that are scheduled are
- * also added to the sc_pending queue, and removed when we receive
- * the 'done' event.
- *
- * When we are transparently inserted on an existing provider,
- * sc_proxying is set. The detach procedure is slightly different.
- *
- * When switching schedulers, sc_flushing is set so requests bypass us,
- * and at the same time we update the pointer in the pending bios
- * to ignore us when they return up.
- * XXX it would be more efficient to implement sc_pending with
- * a generation number: the softc generation is increased when
- * we change scheduling algorithm, we store the current generation
- * number in the pending bios, and when they come back we ignore
- * the done() call if the generation numbers do not match.
- */
-struct g_sched_softc {
-	/*
-	 * Generic fields used by any scheduling algorithm:
-	 * a mutex, the class descriptor, flags, list of pending
-	 * requests (used when flushing the module) and support
-	 * for hash tables where we store per-flow queues.
-	 */
-	struct mtx	sc_mtx;
-	struct g_gsched	*sc_gsched;	/* Scheduler descriptor. */
-	int		sc_pending;	/* Pending requests. */
-	int		sc_flags;	/* Various flags. */
-
-	/*
-	 * Hash tables to store per-flow queues are generally useful
-	 * so we handle them in the common code.
-	 * sc_hash and sc_mask are parameters of the hash table,
-	 * the last two fields are used to periodically remove
-	 * expired items from the hash table.
-	 */
-	struct g_hash	*sc_hash;
-	u_long		sc_mask;
-	int		sc_flush_ticks;	/* Next tick for a flush. */
-	int		sc_flush_bucket; /* Next bucket to flush. */
-
-	/*
-	 * Pointer to the algorithm's private data, which is the value
-	 * returned by sc_gsched->gs_init() . A NULL here means failure.
-	 * XXX intptr_t might be more appropriate.
-	 */
-	void		*sc_data;
-};
-
-#define	G_SCHED_PROXYING	1
-#define	G_SCHED_FLUSHING	2
-
-#endif	/* _KERNEL */
-
-#endif	/* _G_SCHED_H_ */

Property changes on: head/sys/geom/sched/g_sched.h
___________________________________________________________________
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property

Index: head/sys/geom/sched/README
===================================================================
--- head/sys/geom/sched/README	(revision 356184)
+++ head/sys/geom/sched/README	(nonexistent)
@@ -1,142 +0,0 @@
-
-	--- GEOM BASED DISK SCHEDULERS FOR FREEBSD ---
-
-This code contains a framework for GEOM-based disk schedulers and a
-couple of sample scheduling algorithms that use the framework and
-implement two forms of "anticipatory scheduling" (see below for more
-details).
-
-As a quick example of what this code can give you, try to run "dd",
-"tar", or some other program with highly SEQUENTIAL access patterns,
-together with "cvs", "cvsup", "svn" or other highly RANDOM access patterns
-(this is not a made-up example: it is pretty common for developers
-to have one or more apps doing random accesses, and others that do
-sequential accesses, e.g., loading large binaries from disk, checking
-the integrity of tarballs, watching media streams and so on).
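[Returning to the XXX note in g_sched.h above: the suggested generation-number
scheme would replace the sc_pending counter. A rough editorial sketch of that
idea follows; the struct, field, and function names are invented, and it assumes
a spare per-bio pointer field (bio_caller2 here) is free for the purpose.]

	/* Softc fragment: one counter, bumped on every algorithm switch. */
	struct gen_softc {
		unsigned int	sc_gen;
	};

	/* On dispatch, tag the bio with the current generation. */
	static void
	gen_tag(struct gen_softc *sc, struct bio *bp)
	{
		bp->bio_caller2 = (void *)(uintptr_t)sc->sc_gen;
	}

	/* On completion, ignore the done() call if generations differ. */
	static int
	gen_is_stale(struct gen_softc *sc, struct bio *bp)
	{
		return ((uintptr_t)bp->bio_caller2 != (uintptr_t)sc->sc_gen);
	}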
-
-These are the results we get on a local machine (AMD BE2400 dual
-core CPU, SATA 250GB disk):
-
-    /mnt is a partition mounted on /dev/ad0s1f
-
-    cvs:	cvs -d /mnt/home/ncvs-local update -Pd /mnt/ports
-    dd-read:	dd bs=128k of=/dev/null if=/dev/ad0 (or ad0-sched-)
-    dd-write:	dd bs=128k if=/dev/zero of=/mnt/largefile
-
-			NO SCHEDULER		RR SCHEDULER
-			dd	cvs		dd	cvs
-
-    dd-read only	72 MB/s	----		72 MB/s	---
-    dd-write only	55 MB/s	---		55 MB/s	---
-    dd-read+cvs		 6 MB/s	ok		30 MB/s	ok
-    dd-write+cvs	55 MB/s	slooow		14 MB/s	ok
-
-As you can see, when a cvs is running concurrently with dd, the
-performance drops dramatically, and depending on read or write mode,
-one of the two is severely penalized. The use of the RR scheduler
-in this example makes the dd-reader go much faster when competing
-with cvs, and lets cvs progress when competing with a writer.
-
-To try it out:
-
-1. PLEASE MAKE SURE THAT THE DISK THAT YOU WILL BE USING FOR TESTS
-   DOES NOT CONTAIN PRECIOUS DATA.
-   This is experimental code, so we make no guarantees, though
-   I am routinely using it on my desktop and laptop.
-
-2. EXTRACT AND BUILD THE PROGRAMS
-   A 'make install' in the directory should work (with root privs),
-   or you can even try the binary modules.
-   If you want to build the modules yourself, look at the Makefile.
-
-3. LOAD THE MODULE, CREATE A GEOM NODE, RUN TESTS
-
-   The scheduler's module must be loaded first:
-
-   # kldload gsched_rr
-
-   substitute with gsched_as to test AS. Then, supposing that you are
-   using /dev/ad0 for testing, a scheduler can be attached to it with:
-
-   # geom sched insert ad0
-
-   The scheduler is inserted transparently in the geom chain, so
-   mounted partitions and filesystems will keep working, but
-   now requests will go through the scheduler.
-
-   To change scheduler on-the-fly, you can reconfigure the geom:
-
-   # geom sched configure -a as ad0.sched.
-
-   assuming that gsched_as was loaded previously.
-
-4. SCHEDULER REMOVAL
-
-   In principle it is possible to remove the scheduler module
-   even on an active chain by doing
-
-   # geom sched destroy ad0.sched.
-
-   However, there is some race in the geom subsystem which makes
-   the removal unsafe if there are active requests on a chain.
-   So, in order to reduce the risk of data losses, make sure
-   you don't remove a scheduler from a chain with ongoing transactions.
-
---- NOTES ON THE SCHEDULERS ---
-
-The important contribution of this code is the framework to experiment
-with different scheduling algorithms. 'Anticipatory scheduling'
-is a very powerful technique based on the following reasoning:
-
-    The disk throughput is much better if it serves sequential requests.
-    If we have a mix of sequential and random requests, and we see a
-    non-sequential request, do not serve it immediately but instead wait
-    a little bit (2..5ms) to see if there is another one coming that
-    the disk can serve more efficiently.
-
-There are many details that should be added to make sure that the
-mechanism is effective with different workloads and systems, to
-gain a few extra percent in performance, to improve fairness,
-insulation among processes etc. A discussion of the vast literature
-on the subject is beyond the purpose of this short note.
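[The rr scheduler below (gs_rr.c) turns this reasoning into two per-client
statistics: an average "thinktime" (how quickly the client issues its next
request) and an average seek distance. A condensed editorial sketch of that
decision follows; g_rr_should_anticipate() in gs_rr.c is the real version, and
the 8192 seek-distance threshold is taken from it.]

	/*
	 * Anticipate only if the client historically comes back quickly
	 * (thinktime below the anticipation window) and stays nearby
	 * (small seek distance); otherwise idling the disk is a loss.
	 */
	static int
	should_anticipate(unsigned thinktime_ticks, off_t seekdist,
	    unsigned wait_ticks)
	{
		if (thinktime_ticks > wait_ticks)
			return (0);	/* client thinks too long */
		if (seekdist > 8192)
			return (0);	/* accesses not sequential enough */
		return (1);		/* worth waiting 2..5 ms */
	}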
-
---------------------------------------------------------------------------
-
-TRANSPARENT INSERT/DELETE
-
-geom_sched is an ordinary geom module; however, it is convenient
-to plug it transparently into the geom graph, so that one can
-enable or disable scheduling on a mounted filesystem, and the
-names in /etc/fstab do not depend on the presence of the scheduler.
-
-To understand how this works in practice, remember that in GEOM
-we have "providers" and "geom" objects.
-Say that we want to hook a scheduler on provider "ad0",
-accessible through pointer 'pp'. Originally, pp is attached to
-geom "ad0" (same name, different object) accessible through pointer old_gp
-
-  BEFORE	---> [ pp --> old_gp ...]
-
-A normal "geom sched create ad0" call would create a new geom node
-on top of provider ad0/pp, and export a newly created provider
-("ad0.sched." accessible through pointer newpp).
-
-  AFTER create	---> [ newpp --> gp --> cp ]   ---> [ pp --> old_gp ... ]
-
-On top of newpp, a whole tree will be created automatically, and we
-can e.g. mount partitions on /dev/ad0.sched.s1d, and those requests
-will go through the scheduler, whereas any partition mounted on
-the pre-existing device entries will not go through the scheduler.
-
-With the transparent insert mechanism, the original provider "ad0"/pp
-is hooked to the newly created geom, as follows:
-
-  AFTER insert	---> [ pp --> gp --> cp ]  ---> [ newpp --> old_gp ... ]
-
-so anything that was previously using provider pp will now have
-the requests routed through the scheduler node.
-
-A removal ("geom sched destroy ad0.sched.") will restore the original
-configuration.
-
-# $FreeBSD$

Property changes on: head/sys/geom/sched/README
___________________________________________________________________
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property

Index: head/sys/geom/sched/gs_scheduler.h
===================================================================
--- head/sys/geom/sched/gs_scheduler.h	(revision 356184)
+++ head/sys/geom/sched/gs_scheduler.h	(nonexistent)
@@ -1,239 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2009-2010 Fabio Checconi
- * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $Id$ - * $FreeBSD$ - * - * Prototypes for GEOM-based disk scheduling algorithms. - * See g_sched.c for generic documentation. - * - * This file is used by the kernel modules implementing the various - * scheduling algorithms. They should provide all the methods - * defined in struct g_gsched, and also invoke the macro - * DECLARE_GSCHED_MODULE - * which registers the scheduling algorithm with the geom_sched module. - * - * The various scheduling algorithms do not need to know anything - * about geom, they only need to handle the 'bio' requests they - * receive, pass them down when needed, and use the locking interface - * defined below. - */ - -#ifndef _G_GSCHED_H_ -#define _G_GSCHED_H_ - -#ifdef _KERNEL -#include -#include -#include -#include -#include -#include -#include "g_sched.h" - -/* - * This is the interface exported to scheduling modules. - * - * gs_init() is called when our scheduling algorithm - * starts being used by a geom 'sched' - * - * gs_fini() is called when the algorithm is released. - * - * gs_start() is called when a new request comes in. It should - * enqueue the request and return 0 if success, or return non-zero - * in case of failure (meaning the request is passed down). - * The scheduler can use bio->bio_caller1 to store a non-null - * pointer meaning the request is under its control. - * - * gs_next() is called in a loop by g_sched_dispatch(), right after - * gs_start(), or on timeouts or 'done' events. It should return - * immediately, either a pointer to the bio to be served or NULL - * if no bio should be served now. If force is specified, a - * work-conserving behavior is expected. - * - * gs_done() is called when a request under service completes. - * In turn the scheduler may decide to call the dispatch loop - * to serve other pending requests (or make sure there is a pending - * timeout to avoid stalls). - * - * gs_init_class() is called when a new client (as determined by - * the classifier) starts being used. - * - * gs_hash_unref() is called right before the class hashtable is - * destroyed; after this call, the scheduler is supposed to hold no - * more references to the elements in the table. - */ - -/* Forward declarations for prototypes. 
*/ -struct g_geom; -struct g_sched_class; - -typedef void *gs_init_t (struct g_geom *geom); -typedef void gs_fini_t (void *data); -typedef int gs_start_t (void *data, struct bio *bio); -typedef void gs_done_t (void *data, struct bio *bio); -typedef struct bio *gs_next_t (void *data, int force); -typedef int gs_init_class_t (void *data, void *priv); -typedef void gs_fini_class_t (void *data, void *priv); -typedef void gs_hash_unref_t (void *data); - -struct g_gsched { - const char *gs_name; - int gs_refs; - int gs_priv_size; - - gs_init_t *gs_init; - gs_fini_t *gs_fini; - gs_start_t *gs_start; - gs_done_t *gs_done; - gs_next_t *gs_next; - g_dumpconf_t *gs_dumpconf; - - gs_init_class_t *gs_init_class; - gs_fini_class_t *gs_fini_class; - gs_hash_unref_t *gs_hash_unref; - - LIST_ENTRY(g_gsched) glist; -}; - -#define KTR_GSCHED KTR_SPARE4 - -MALLOC_DECLARE(M_GEOM_SCHED); - -/* - * Basic classification mechanism. Each request is associated to - * a g_sched_class, and each scheduler has the opportunity to set - * its own private data for the given (class, geom) pair. The - * private data have a base type of g_sched_private, and are - * extended at the end with the actual private fields of each - * scheduler. - */ -struct g_sched_class { - int gsc_refs; - int gsc_expire; - u_long gsc_key; - LIST_ENTRY(g_sched_class) gsc_clist; - - void *gsc_priv[0]; -}; - -/* - * Manipulate the classifier's data. g_sched_get_class() gets a reference - * to the class corresponding to bp in gp, allocating and initializing - * it if necessary. g_sched_put_class() releases the reference. - * The returned value points to the private data for the class. - */ -void *g_sched_get_class(struct g_geom *gp, struct bio *bp); -void g_sched_put_class(struct g_geom *gp, void *priv); - -static inline struct g_sched_class * -g_sched_priv2class(void *priv) -{ - - return ((struct g_sched_class *)((u_long)priv - - offsetof(struct g_sched_class, gsc_priv))); -} - -static inline void -g_sched_priv_ref(void *priv) -{ - struct g_sched_class *gsc; - - gsc = g_sched_priv2class(priv); - gsc->gsc_refs++; -} - -/* - * Locking interface. When each operation registered with the - * scheduler is invoked, a per-instance lock is taken to protect - * the data associated with it. If the scheduler needs something - * else to access the same data (e.g., a callout) it must use - * these functions. - */ -void g_sched_lock(struct g_geom *gp); -void g_sched_unlock(struct g_geom *gp); - -/* - * Restart request dispatching. Must be called with the per-instance - * mutex held. - */ -void g_sched_dispatch(struct g_geom *geom); - -/* - * Simple gathering of statistical data, used by schedulers to collect - * info on process history. Just keep an exponential average of the - * samples, with some extra bits of precision. - */ -struct g_savg { - uint64_t gs_avg; - unsigned int gs_smpl; -}; - -static inline void -g_savg_add_sample(struct g_savg *ss, uint64_t sample) -{ - - /* EMA with alpha = 0.125, fixed point, 3 bits of precision. */ - ss->gs_avg = sample + ss->gs_avg - (ss->gs_avg >> 3); - ss->gs_smpl = 1 + ss->gs_smpl - (ss->gs_smpl >> 3); -} - -static inline int -g_savg_valid(struct g_savg *ss) -{ - - /* We want at least 8 samples to deem an average as valid. */ - return (ss->gs_smpl > 7); -} - -static inline uint64_t -g_savg_read(struct g_savg *ss) -{ - - return (ss->gs_avg / ss->gs_smpl); -} - -/* - * Declaration of a scheduler module. 
- */ -int g_gsched_modevent(module_t mod, int cmd, void *arg); - -#define DECLARE_GSCHED_MODULE(name, gsched) \ - static moduledata_t name##_mod = { \ - #name, \ - g_gsched_modevent, \ - gsched, \ - }; \ - DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); \ - MODULE_DEPEND(name, geom_sched, 0, 0, 0); - -#endif /* _KERNEL */ - -#endif /* _G_GSCHED_H_ */ Property changes on: head/sys/geom/sched/gs_scheduler.h ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: head/sys/geom/sched/gs_rr.c =================================================================== --- head/sys/geom/sched/gs_rr.c (revision 356184) +++ head/sys/geom/sched/gs_rr.c (nonexistent) @@ -1,701 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2009-2010 Fabio Checconi - * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $Id$ - * $FreeBSD$ - * - * A round-robin (RR) anticipatory scheduler, with per-client queues. - * - * The goal of this implementation is to improve throughput compared - * to the pure elevator algorithm, and insure some fairness among - * clients. - * - * Requests coming from the same client are put in the same queue. - * We use anticipation to help reducing seeks, and each queue - * is never served continuously for more than a given amount of - * time or data. Queues are then served in a round-robin fashion. - * - * Each queue can be in any of the following states: - * READY immediately serve the first pending request; - * BUSY one request is under service, wait for completion; - * IDLING do not serve incoming requests immediately, unless - * they are "eligible" as defined later. - * - * Scheduling is made looking at the status of all queues, - * and the first one in round-robin order is privileged. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "gs_scheduler.h" - -/* possible states of the scheduler */ -enum g_rr_state { - G_QUEUE_READY = 0, /* Ready to dispatch. */ - G_QUEUE_BUSY, /* Waiting for a completion. */ - G_QUEUE_IDLING /* Waiting for a new request. 
*/ -}; - -/* possible queue flags */ -enum g_rr_flags { - /* G_FLAG_COMPLETED means that the field q_slice_end is valid. */ - G_FLAG_COMPLETED = 1, /* Completed a req. in the current budget. */ -}; - -struct g_rr_softc; - -/* - * Queue descriptor, containing reference count, scheduling - * state, a queue of pending requests, configuration parameters. - * Queues with pending request(s) and not under service are also - * stored in a Round Robin (RR) list. - */ -struct g_rr_queue { - struct g_rr_softc *q_sc; /* link to the parent */ - - enum g_rr_state q_status; - unsigned int q_service; /* service received so far */ - int q_slice_end; /* actual slice end time, in ticks */ - enum g_rr_flags q_flags; /* queue flags */ - struct bio_queue_head q_bioq; - - /* Scheduling parameters */ - unsigned int q_budget; /* slice size in bytes */ - unsigned int q_slice_duration; /* slice size in ticks */ - unsigned int q_wait_ticks; /* wait time for anticipation */ - - /* Stats to drive the various heuristics. */ - struct g_savg q_thinktime; /* Thinktime average. */ - struct g_savg q_seekdist; /* Seek distance average. */ - - int q_bionum; /* Number of requests. */ - - off_t q_lastoff; /* Last submitted req. offset. */ - int q_lastsub; /* Last submitted req. time. */ - - /* Expiration deadline for an empty queue. */ - int q_expire; - - TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */ -}; - -/* List types. */ -TAILQ_HEAD(g_rr_tailq, g_rr_queue); - -/* list of scheduler instances */ -LIST_HEAD(g_scheds, g_rr_softc); - -/* Default quantum for RR between queues. */ -#define G_RR_DEFAULT_BUDGET 0x00800000 - -/* - * Per device descriptor, holding the Round Robin list of queues - * accessing the disk, a reference to the geom, and the timer. - */ -struct g_rr_softc { - struct g_geom *sc_geom; - - /* - * sc_active is the queue we are anticipating for. - * It is set only in gs_rr_next(), and possibly cleared - * only in gs_rr_next() or on a timeout. - * The active queue is never in the Round Robin list - * even if it has requests queued. - */ - struct g_rr_queue *sc_active; - struct callout sc_wait; /* timer for sc_active */ - - struct g_rr_tailq sc_rr_tailq; /* the round-robin list */ - int sc_nqueues; /* number of queues */ - - /* Statistics */ - int sc_in_flight; /* requests in the driver */ - - LIST_ENTRY(g_rr_softc) sc_next; -}; - -/* Descriptor for bounded values, min and max are constant. */ -struct x_bound { - const int x_min; - int x_cur; - const int x_max; -}; - -/* - * parameters, config and stats - */ -struct g_rr_params { - int queues; /* total number of queues */ - int w_anticipate; /* anticipate writes */ - int bypass; /* bypass scheduling writes */ - - int units; /* how many instances */ - /* sc_head is used for debugging */ - struct g_scheds sc_head; /* first scheduler instance */ - - struct x_bound queue_depth; /* max parallel requests */ - struct x_bound wait_ms; /* wait time, milliseconds */ - struct x_bound quantum_ms; /* quantum size, milliseconds */ - struct x_bound quantum_kb; /* quantum size, Kb (1024 bytes) */ - - /* statistics */ - int wait_hit; /* success in anticipation */ - int wait_miss; /* failure in anticipation */ -}; - -/* - * Default parameters for the scheduler. The quantum sizes target - * a 80MB/s disk; if the hw is faster or slower the minimum of the - * two will have effect: the clients will still be isolated but - * the fairness may be limited. A complete solution would involve - * the on-line measurement of the actual disk throughput to derive - * these parameters. 
Or we may just choose to ignore service domain - * fairness and accept what can be achieved with time-only budgets. - */ -static struct g_rr_params me = { - .sc_head = LIST_HEAD_INITIALIZER(&me.sc_head), - .w_anticipate = 1, - .queue_depth = { 1, 1, 50 }, - .wait_ms = { 1, 10, 30 }, - .quantum_ms = { 1, 100, 500 }, - .quantum_kb = { 16, 8192, 65536 }, -}; - -struct g_rr_params *gs_rr_me = &me; - -SYSCTL_DECL(_kern_geom_sched); -static SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0, - "GEOM_SCHED ROUND ROBIN stuff"); -SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD, - &me.units, 0, "Scheduler instances"); -SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD, - &me.queues, 0, "Total rr queues"); -SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW, - &me.wait_ms.x_cur, 0, "Wait time milliseconds"); -SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW, - &me.quantum_ms.x_cur, 0, "Quantum size milliseconds"); -SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW, - &me.bypass, 0, "Bypass scheduler"); -SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW, - &me.w_anticipate, 0, "Do anticipation on writes"); -SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW, - &me.quantum_kb.x_cur, 0, "Quantum size Kbytes"); -SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW, - &me.queue_depth.x_cur, 0, "Maximum simultaneous requests"); -SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW, - &me.wait_hit, 0, "Hits in anticipation"); -SYSCTL_INT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW, - &me.wait_miss, 0, "Misses in anticipation"); - -#ifdef DEBUG_QUEUES -/* print the status of a queue */ -static void -gs_rr_dump_q(struct g_rr_queue *qp, int index) -{ - int l = 0; - struct bio *bp; - - TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) { - l++; - } - printf("--- rr queue %d %p status %d len %d ---\n", - index, qp, qp->q_status, l); -} - -/* - * Dump the scheduler status when writing to this sysctl variable. - * XXX right now we only dump the status of the last instance created. - * not a severe issue because this is only for debugging - */ -static int -gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS) -{ - int error, val = 0; - struct g_rr_softc *sc; - - error = sysctl_handle_int(oidp, &val, 0, req); - if (error || !req->newptr ) - return (error); - - printf("called %s\n", __FUNCTION__); - - LIST_FOREACH(sc, &me.sc_head, sc_next) { - int i, tot = 0; - printf("--- sc %p active %p nqueues %d " - "callout %d in_flight %d ---\n", - sc, sc->sc_active, sc->sc_nqueues, - callout_active(&sc->sc_wait), - sc->sc_in_flight); - for (i = 0; i < G_RR_HASH_SIZE; i++) { - struct g_rr_queue *qp; - LIST_FOREACH(qp, &sc->sc_hash[i], q_hash) { - gs_rr_dump_q(qp, tot); - tot++; - } - } - } - return (0); -} - -SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status, - CTLTYPE_UINT | CTLFLAG_RW, - 0, sizeof(int), gs_rr_sysctl_status, "I", "status"); - -#endif /* DEBUG_QUEUES */ - -/* - * Get a bounded value, optionally convert to a min of t_min ticks. - */ -static int -get_bounded(struct x_bound *v, int t_min) -{ - int x; - - x = v->x_cur; - if (x < v->x_min) - x = v->x_min; - else if (x > v->x_max) - x = v->x_max; - if (t_min) { - x = x * hz / 1000; /* convert to ticks */ - if (x < t_min) - x = t_min; - } - return x; -} - -/* - * Get a reference to the queue for bp, using the generic - * classification mechanism. 
- */ -static struct g_rr_queue * -g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp) -{ - - return (g_sched_get_class(sc->sc_geom, bp)); -} - -static int -g_rr_init_class(void *data, void *priv) -{ - struct g_rr_softc *sc = data; - struct g_rr_queue *qp = priv; - - bioq_init(&qp->q_bioq); - - /* - * Set the initial parameters for the client: - * slice size in bytes and ticks, and wait ticks. - * Right now these are constant, but we could have - * autoconfiguration code to adjust the values based on - * the actual workload. - */ - qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0); - qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); - qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); - - qp->q_sc = sc; /* link to the parent */ - qp->q_sc->sc_nqueues++; - me.queues++; - - return (0); -} - -/* - * Release a reference to the queue. - */ -static void -g_rr_queue_put(struct g_rr_queue *qp) -{ - - g_sched_put_class(qp->q_sc->sc_geom, qp); -} - -static void -g_rr_fini_class(void *data, void *priv) -{ - struct g_rr_queue *qp = priv; - - KASSERT(bioq_first(&qp->q_bioq) == NULL, - ("released nonempty queue")); - qp->q_sc->sc_nqueues--; - me.queues--; -} - -static inline int -g_rr_queue_expired(struct g_rr_queue *qp) -{ - - if (qp->q_service >= qp->q_budget) - return (1); - - if ((qp->q_flags & G_FLAG_COMPLETED) && - ticks - qp->q_slice_end >= 0) - return (1); - - return (0); -} - -static inline int -g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp) -{ - int wait = get_bounded(&me.wait_ms, 2); - - if (!me.w_anticipate && (bp->bio_cmd == BIO_WRITE)) - return (0); - - if (g_savg_valid(&qp->q_thinktime) && - g_savg_read(&qp->q_thinktime) > wait) - return (0); - - if (g_savg_valid(&qp->q_seekdist) && - g_savg_read(&qp->q_seekdist) > 8192) - return (0); - - return (1); -} - -/* - * Called on a request arrival, timeout or completion. - * Try to serve a request among those queued. - */ -static struct bio * -g_rr_next(void *data, int force) -{ - struct g_rr_softc *sc = data; - struct g_rr_queue *qp; - struct bio *bp, *next; - int expired; - - qp = sc->sc_active; - if (me.bypass == 0 && !force) { - if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0)) - return (NULL); - - /* Try with the queue under service first. */ - if (qp != NULL && qp->q_status != G_QUEUE_READY) { - /* - * Queue is anticipating, ignore request. - * We should check that we are not past - * the timeout, but in that case the timeout - * will fire immediately afterwards so we - * don't bother. - */ - return (NULL); - } - } else if (qp != NULL && qp->q_status != G_QUEUE_READY) { - g_rr_queue_put(qp); - sc->sc_active = qp = NULL; - } - - /* - * No queue under service, look for the first in RR order. - * If we find it, select if as sc_active, clear service - * and record the end time of the slice. - */ - if (qp == NULL) { - qp = TAILQ_FIRST(&sc->sc_rr_tailq); - if (qp == NULL) - return (NULL); /* no queues at all, return */ - /* otherwise select the new queue for service. */ - TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq); - sc->sc_active = qp; - qp->q_service = 0; - qp->q_flags &= ~G_FLAG_COMPLETED; - } - - bp = bioq_takefirst(&qp->q_bioq); /* surely not NULL */ - qp->q_service += bp->bio_length; /* charge the service */ - - /* - * The request at the head of the active queue is always - * dispatched, and gs_rr_next() will be called again - * immediately. - * We need to prepare for what to do next: - * - * 1. have we reached the end of the (time or service) slice ? 
- * If so, clear sc_active and possibly requeue the previous - * active queue if it has more requests pending; - * 2. do we have more requests in sc_active ? - * If yes, do not anticipate, as gs_rr_next() will run again; - * if no, decide whether or not to anticipate depending - * on read or writes (e.g., anticipate only on reads). - */ - expired = g_rr_queue_expired(qp); /* are we expired ? */ - next = bioq_first(&qp->q_bioq); /* do we have one more ? */ - if (expired) { - sc->sc_active = NULL; - /* Either requeue or release reference. */ - if (next != NULL) - TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq); - else - g_rr_queue_put(qp); - } else if (next != NULL) { - qp->q_status = G_QUEUE_READY; - } else { - if (!force && g_rr_should_anticipate(qp, bp)) { - /* anticipate */ - qp->q_status = G_QUEUE_BUSY; - } else { - /* do not anticipate, release reference */ - g_rr_queue_put(qp); - sc->sc_active = NULL; - } - } - /* If sc_active != NULL, its q_status is always correct. */ - - sc->sc_in_flight++; - - return (bp); -} - -static inline void -g_rr_update_thinktime(struct g_rr_queue *qp) -{ - int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2); - - if (qp->q_sc->sc_active != qp) - return; - - qp->q_lastsub = ticks; - delta = (delta > 2 * wait) ? 2 * wait : delta; - if (qp->q_bionum > 7) - g_savg_add_sample(&qp->q_thinktime, delta); -} - -static inline void -g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp) -{ - off_t dist; - - if (qp->q_lastoff > bp->bio_offset) - dist = qp->q_lastoff - bp->bio_offset; - else - dist = bp->bio_offset - qp->q_lastoff; - - if (dist > (8192 * 8)) - dist = 8192 * 8; - - qp->q_lastoff = bp->bio_offset + bp->bio_length; - - if (qp->q_bionum > 7) - g_savg_add_sample(&qp->q_seekdist, dist); -} - -/* - * Called when a real request for disk I/O arrives. - * Locate the queue associated with the client. - * If the queue is the one we are anticipating for, reset its timeout; - * if the queue is not in the round robin list, insert it in the list. - * On any error, do not queue the request and return -1, the caller - * will take care of this request. - */ -static int -g_rr_start(void *data, struct bio *bp) -{ - struct g_rr_softc *sc = data; - struct g_rr_queue *qp; - - if (me.bypass) - return (-1); /* bypass the scheduler */ - - /* Get the queue for the request. */ - qp = g_rr_queue_get(sc, bp); - if (qp == NULL) - return (-1); /* allocation failed, tell upstream */ - - if (bioq_first(&qp->q_bioq) == NULL) { - /* - * We are inserting into an empty queue. - * Reset its state if it is sc_active, - * otherwise insert it in the RR list. - */ - if (qp == sc->sc_active) { - qp->q_status = G_QUEUE_READY; - callout_stop(&sc->sc_wait); - } else { - g_sched_priv_ref(qp); - TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq); - } - } - - qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3); - - g_rr_update_thinktime(qp); - g_rr_update_seekdist(qp, bp); - - /* Inherit the reference returned by g_rr_queue_get(). */ - bp->bio_caller1 = qp; - bioq_disksort(&qp->q_bioq, bp); - - return (0); -} - -/* - * Callout executed when a queue times out anticipating a new request. - */ -static void -g_rr_wait_timeout(void *data) -{ - struct g_rr_softc *sc = data; - struct g_geom *geom = sc->sc_geom; - - g_sched_lock(geom); - /* - * We can race with other events, so check if - * sc_active is still valid. - */ - if (sc->sc_active != NULL) { - /* Release the reference to the queue. 
*/ - g_rr_queue_put(sc->sc_active); - sc->sc_active = NULL; - me.wait_hit--; - me.wait_miss++; /* record the miss */ - } - g_sched_dispatch(geom); - g_sched_unlock(geom); -} - -/* - * Module glue: allocate descriptor, initialize its fields. - */ -static void * -g_rr_init(struct g_geom *geom) -{ - struct g_rr_softc *sc; - - /* XXX check whether we can sleep */ - sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO); - sc->sc_geom = geom; - TAILQ_INIT(&sc->sc_rr_tailq); - callout_init(&sc->sc_wait, 1); - LIST_INSERT_HEAD(&me.sc_head, sc, sc_next); - me.units++; - - return (sc); -} - -/* - * Module glue -- drain the callout structure, destroy the - * hash table and its element, and free the descriptor. - */ -static void -g_rr_fini(void *data) -{ - struct g_rr_softc *sc = data; - - callout_drain(&sc->sc_wait); - KASSERT(sc->sc_active == NULL, ("still a queue under service")); - KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues")); - - LIST_REMOVE(sc, sc_next); - me.units--; - free(sc, M_GEOM_SCHED); -} - -/* - * Called when the request under service terminates. - * Start the anticipation timer if needed. - */ -static void -g_rr_done(void *data, struct bio *bp) -{ - struct g_rr_softc *sc = data; - struct g_rr_queue *qp; - - sc->sc_in_flight--; - - qp = bp->bio_caller1; - - /* - * When the first request for this queue completes, update the - * duration and end of the slice. We do not do it when the - * slice starts to avoid charging to the queue the time for - * the first seek. - */ - if (!(qp->q_flags & G_FLAG_COMPLETED)) { - qp->q_flags |= G_FLAG_COMPLETED; - /* - * recompute the slice duration, in case we want - * to make it adaptive. This is not used right now. - * XXX should we do the same for q_quantum and q_wait_ticks ? - */ - qp->q_slice_duration = get_bounded(&me.quantum_ms, 2); - qp->q_slice_end = ticks + qp->q_slice_duration; - } - - if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) { - /* The queue is trying anticipation, start the timer. */ - qp->q_status = G_QUEUE_IDLING; - /* may make this adaptive */ - qp->q_wait_ticks = get_bounded(&me.wait_ms, 2); - me.wait_hit++; - callout_reset(&sc->sc_wait, qp->q_wait_ticks, - g_rr_wait_timeout, sc); - } else - g_sched_dispatch(sc->sc_geom); - - /* Release a reference to the queue. */ - g_rr_queue_put(qp); -} - -static void -g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, - struct g_consumer *cp, struct g_provider *pp) -{ - if (indent == NULL) { /* plaintext */ - sbuf_printf(sb, " units %d queues %d", - me.units, me.queues); - } -} - -static struct g_gsched g_rr = { - .gs_name = "rr", - .gs_priv_size = sizeof(struct g_rr_queue), - .gs_init = g_rr_init, - .gs_fini = g_rr_fini, - .gs_start = g_rr_start, - .gs_done = g_rr_done, - .gs_next = g_rr_next, - .gs_dumpconf = g_rr_dumpconf, - .gs_init_class = g_rr_init_class, - .gs_fini_class = g_rr_fini_class, -}; - -DECLARE_GSCHED_MODULE(rr, &g_rr); Property changes on: head/sys/geom/sched/gs_rr.c ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: head/sys/geom/sched/gs_delay.c =================================================================== --- head/sys/geom/sched/gs_delay.c (revision 356184) +++ head/sys/geom/sched/gs_delay.c (nonexistent) @@ -1,264 +0,0 @@ -/*- - * Copyright (c) 2015 Netflix, Inc. 
- * - * Derived from gs_rr.c: - * Copyright (c) 2009-2010 Fabio Checconi - * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $Id$ - * $FreeBSD$ - * - * A simple scheduler that just delays certain transactions by a certain - * amount. We collect all the transactions that are 'done' and put them on - * a queue. The queue is run through every so often and the transactions that - * have taken longer than the threshold delay are completed. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "gs_scheduler.h" - -/* Useful constants */ -#define BTFRAC_1US 18446744073709ULL /* 2^64 / 1000000 */ - -/* list of scheduler instances */ -LIST_HEAD(g_scheds, g_delay_softc); - -/* - * Per device descriptor, holding the Round Robin list of queues - * accessing the disk, a reference to the geom, and the timer. 
 */
-struct g_delay_softc {
-	struct g_geom	*sc_geom;
-
-	struct bio_queue_head sc_bioq;	/* queue of pending requests */
-	struct callout	sc_wait;	/* timer for completing with delays */
-
-	/* Statistics */
-	int		sc_in_flight;	/* requests in the driver */
-};
-
-/*
- * parameters, config and stats
- */
-struct g_delay_params {
-	uint64_t io;
-	int	bypass;		/* bypass scheduling */
-	int	units;		/* how many instances */
-	int	latency;	/* How big a latency we are hoping for */
-};
-
-static struct g_delay_params me = {
-	.bypass = 0,
-	.units = 0,
-	.latency = 0,
-	.io = 0,
-};
-struct g_delay_params *gs_delay_me = &me;
-
-SYSCTL_DECL(_kern_geom_sched);
-static SYSCTL_NODE(_kern_geom_sched, OID_AUTO, delay, CTLFLAG_RW, 0,
-    "GEOM_SCHED DELAY stuff");
-SYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, bypass, CTLFLAG_RD,
-    &me.bypass, 0, "Scheduler bypass");
-SYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, units, CTLFLAG_RD,
-    &me.units, 0, "Scheduler instances");
-SYSCTL_INT(_kern_geom_sched_delay, OID_AUTO, latency, CTLFLAG_RW,
-    &me.latency, 0, "Minimum latency for requests, in microseconds (1/hz resolution)");
-SYSCTL_QUAD(_kern_geom_sched_delay, OID_AUTO, io, CTLFLAG_RW,
-    &me.io, 0, "I/Os delayed\n");
-
-static int
-g_delay_init_class(void *data, void *priv)
-{
-	return (0);
-}
-
-static void
-g_delay_fini_class(void *data, void *priv)
-{
-}
-
-/*
- * Called on a request arrival, timeout or completion.
- * Try to serve a request among those queued.
- */
-static struct bio *
-g_delay_next(void *data, int force)
-{
-	struct g_delay_softc *sc = data;
-	struct bio *bp;
-	struct bintime bt;
-
-	bp = bioq_first(&sc->sc_bioq);
-	if (bp == NULL)
-		return (NULL);
-
-	/*
-	 * If the time isn't yet ripe for this bp to be let loose,
-	 * then the time isn't ripe for any of its friends either
-	 * since we insert in-order. Terminate if the bio hasn't
-	 * aged appropriately. Note that there's pathology here
-	 * such that we may be up to one tick early in releasing
-	 * this I/O. We could implement this up to a tick late too
-	 * but choose not to.
-	 */
-	getbinuptime(&bt);	/* BIO's bio_t0 is uptime */
-	if (bintime_cmp(&bp->bio_t0, &bt, >))
-		return (NULL);
-	me.io++;
-
-	/*
-	 * The bp has mellowed enough, let it through and update stats.
-	 * If there's others, we'll catch them next time we get called.
-	 */
-	sc->sc_in_flight++;
-
-	bp = bioq_takefirst(&sc->sc_bioq);
-	return (bp);
-}
-
-/*
- * Called when a real request for disk I/O arrives.
- * Locate the queue associated with the client.
- * If the queue is the one we are anticipating for, reset its timeout;
- * if the queue is not in the round robin list, insert it in the list.
- * On any error, do not queue the request and return -1, the caller
- * will take care of this request.
- */
-static int
-g_delay_start(void *data, struct bio *bp)
-{
-	struct g_delay_softc *sc = data;
-
-	if (me.bypass)
-		return (-1);	/* bypass the scheduler */
-
-	bp->bio_caller1 = sc;
-	getbinuptime(&bp->bio_t0);	/* BIO's bio_t0 is uptime */
-	bintime_addx(&bp->bio_t0, BTFRAC_1US * me.latency);
-
-	/*
-	 * Keep the I/Os ordered. Lower layers will reorder as we release them down.
-	 * We rely on this in g_delay_next() so that we delay all things equally. Even
-	 * if we move to multiple queues to push stuff down the stack, we'll want to
-	 * insert in order and let the lower layers do whatever reordering they want.
- */ - bioq_insert_tail(&sc->sc_bioq, bp); - - return (0); -} - -static void -g_delay_timeout(void *data) -{ - struct g_delay_softc *sc = data; - - g_sched_lock(sc->sc_geom); - g_sched_dispatch(sc->sc_geom); - g_sched_unlock(sc->sc_geom); - callout_reset(&sc->sc_wait, 1, g_delay_timeout, sc); -} - -/* - * Module glue: allocate descriptor, initialize its fields. - */ -static void * -g_delay_init(struct g_geom *geom) -{ - struct g_delay_softc *sc; - - sc = malloc(sizeof *sc, M_GEOM_SCHED, M_WAITOK | M_ZERO); - sc->sc_geom = geom; - bioq_init(&sc->sc_bioq); - callout_init(&sc->sc_wait, CALLOUT_MPSAFE); - callout_reset(&sc->sc_wait, 1, g_delay_timeout, sc); - me.units++; - - return (sc); -} - -/* - * Module glue -- drain the callout structure, destroy the - * hash table and its element, and free the descriptor. - */ -static void -g_delay_fini(void *data) -{ - struct g_delay_softc *sc = data; - - /* We're force drained before getting here */ - - /* Kick out timers */ - callout_drain(&sc->sc_wait); - me.units--; - free(sc, M_GEOM_SCHED); -} - -/* - * Called when the request under service terminates. - * Start the anticipation timer if needed. - */ -static void -g_delay_done(void *data, struct bio *bp) -{ - struct g_delay_softc *sc = data; - - sc->sc_in_flight--; - - g_sched_dispatch(sc->sc_geom); -} - -static void -g_delay_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, - struct g_consumer *cp, struct g_provider *pp) -{ -} - -static struct g_gsched g_delay = { - .gs_name = "delay", - .gs_priv_size = 0, - .gs_init = g_delay_init, - .gs_fini = g_delay_fini, - .gs_start = g_delay_start, - .gs_done = g_delay_done, - .gs_next = g_delay_next, - .gs_dumpconf = g_delay_dumpconf, - .gs_init_class = g_delay_init_class, - .gs_fini_class = g_delay_fini_class, -}; - -DECLARE_GSCHED_MODULE(delay, &g_delay); Property changes on: head/sys/geom/sched/gs_delay.c ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: head/sys/geom/sched/g_sched.c =================================================================== --- head/sys/geom/sched/g_sched.c (revision 356184) +++ head/sys/geom/sched/g_sched.c (nonexistent) @@ -1,1729 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2009-2010 Fabio Checconi - * Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $Id$ - * $FreeBSD$ - * - * Main control module for geom-based disk schedulers ('sched'). - * - * USER VIEW - * A 'sched' node is typically inserted transparently between - * an existing provider pp and its original geom gp - * - * [pp --> gp ..] - * - * using the command "geom sched insert <provider>" and - * resulting in the following topology - * - * [pp --> sched_gp --> cp] [new_pp --> gp ... ] - * - * Deletion "geom sched destroy <provider>.sched." restores the - * original chain. The normal "geom sched create <provider>" - * is also supported. - * - * INTERNALS - * Internally, the 'sched' uses the following data structures - * - * geom{} g_sched_softc{} g_gsched{} - * +----------+ +---------------+ +-------------+ - * | softc *-|--->| sc_gsched *-|-->| gs_init | - * | ... | | | | gs_fini | - * | | | [ hash table] | | gs_start | - * +----------+ | | | ... | - * | | +-------------+ - * | | - * | | g_*_softc{} - * | | +-------------+ - * | sc_data *-|-->| | - * +---------------+ | algorithm- | - * | specific | - * +-------------+ - * - * A g_sched_softc{} is created with a "geom sched insert" call. - * In turn this instantiates a specific scheduling algorithm, - * which sets sc_gsched to point to the algorithm callbacks, - * and calls gs_init() to create the g_*_softc{} . - * The other callbacks (gs_start, gs_next, ...) are invoked - * as needed - * - * g_sched_softc{} is defined in g_sched.h and mostly used here; - * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h; - * g_*_softc{} is defined/implemented by each algorithm (gs_*.c) - * - * DATA MOVING - * When a bio is received on the provider, it goes to the - * g_sched_start() which calls gs_start() to initially queue it; - * then we call g_sched_dispatch() that loops around gs_next() - * to select zero or more bio's to be sent downstream. - * - * g_sched_dispatch() can also be called as a result of a timeout, - * e.g. when doing anticipation or pacing requests. - * - * When a bio comes back, it goes to g_sched_done() which in turn - * calls gs_done(). The latter does any necessary housekeeping in - * the scheduling algorithm, and may decide to call g_sched_dispatch() - * to send more bio's downstream. - * - * If an algorithm needs per-flow queues, these are created - * calling gs_init_class() and destroyed with gs_fini_class(), - * and they are also inserted in the hash table implemented in - * the g_sched_softc{} - * - * If an algorithm is replaced, or a transparently-inserted node is - * removed with "geom sched destroy", we need to remove all references - * to the g_*_softc{} and g_sched_softc from the bio's still in - * the scheduler. g_sched_forced_dispatch() helps doing this. - * XXX need to explain better.
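The DATA MOVING paragraph above describes a small state machine: gs_start() queues an incoming request, then g_sched_dispatch() loops around gs_next() until it returns NULL, and a completion may kick the loop again. A self-contained userland sketch of that control flow follows; every name here is illustrative (the real code operates on struct bio through the gs_* callbacks):

#include <stddef.h>
#include <stdio.h>

struct req { struct req *next; int id; };

struct sched {
	struct req *head, **tail;	/* FIFO of queued requests */
};

/* Like gs_start(): queue an incoming request. */
static int
sched_start(struct sched *s, struct req *r)
{
	r->next = NULL;
	*s->tail = r;
	s->tail = &r->next;
	return (0);
}

/* Like gs_next(): pick the next request to push downstream, if any. */
static struct req *
sched_next(struct sched *s)
{
	struct req *r = s->head;

	if (r != NULL) {
		s->head = r->next;
		if (s->head == NULL)
			s->tail = &s->head;
	}
	return (r);
}

/* Like g_sched_dispatch(): loop around sched_next() until it runs dry. */
static void
dispatch(struct sched *s)
{
	struct req *r;

	while ((r = sched_next(s)) != NULL)
		printf("sending request %d downstream\n", r->id);
}

int
main(void)
{
	struct sched s = { .head = NULL, .tail = &s.head };
	struct req a = { .id = 1 }, b = { .id = 2 };

	sched_start(&s, &a);
	sched_start(&s, &b);
	dispatch(&s);	/* prints 1 then 2 */
	return (0);
}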
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* we access curthread */ -#include -#include -#include "gs_scheduler.h" -#include "g_sched.h" /* geom hooks */ - -/* - * Size of the per-geom hash table storing traffic classes. - * We may decide to change it at a later time, it has no ABI - * implications as it is only used for run-time allocations. - */ -#define G_SCHED_HASH_SIZE 32 - -static int g_sched_destroy(struct g_geom *gp, boolean_t force); -static int g_sched_destroy_geom(struct gctl_req *req, - struct g_class *mp, struct g_geom *gp); -static void g_sched_config(struct gctl_req *req, struct g_class *mp, - const char *verb); -static struct g_geom *g_sched_taste(struct g_class *mp, - struct g_provider *pp, int flags __unused); -static void g_sched_dumpconf(struct sbuf *sb, const char *indent, - struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); -static void g_sched_init(struct g_class *mp); -static void g_sched_fini(struct g_class *mp); -static int g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, - int fflag, struct thread *td); - -struct g_class g_sched_class = { - .name = G_SCHED_CLASS_NAME, - .version = G_VERSION, - .ctlreq = g_sched_config, - .taste = g_sched_taste, - .destroy_geom = g_sched_destroy_geom, - .init = g_sched_init, - .ioctl = g_sched_ioctl, - .fini = g_sched_fini -}; - -MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures"); - -/* - * Global variables describing the state of the geom_sched module. - * There is only one static instance of this structure. - */ -LIST_HEAD(gs_list, g_gsched); /* type, link field */ -struct geom_sched_vars { - struct mtx gs_mtx; - struct gs_list gs_scheds; /* list of algorithms */ - u_int gs_debug; - u_int gs_sched_count; /* how many algorithms ? */ - u_int gs_patched; /* g_io_request was patched */ - - u_int gs_initialized; - u_int gs_expire_secs; /* expiration of hash entries */ - - struct bio_queue_head gs_pending; - u_int gs_npending; - - /* The following are for stats, usually protected by gs_mtx. 
*/ - u_long gs_requests; /* total requests */ - u_long gs_done; /* total done */ - u_int gs_in_flight; /* requests in flight */ - u_int gs_writes_in_flight; - u_int gs_bytes_in_flight; - u_int gs_write_bytes_in_flight; - - char gs_names[256]; /* names of schedulers */ -}; - -static struct geom_sched_vars me = { - .gs_expire_secs = 10, -}; - -SYSCTL_DECL(_kern_geom); -SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0, - "GEOM_SCHED stuff"); - -SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD, - &me.gs_write_bytes_in_flight, 0, "Write bytes in flight"); - -SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD, - &me.gs_bytes_in_flight, 0, "Bytes in flight"); - -SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD, - &me.gs_writes_in_flight, 0, "Write Requests in flight"); - -SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD, - &me.gs_in_flight, 0, "Requests in flight"); - -SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD, - &me.gs_done, 0, "Total done"); - -SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD, - &me.gs_requests, 0, "Total requests"); - -SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD, - &me.gs_names, 0, "Algorithm names"); - -SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD, - &me.gs_sched_count, 0, "Number of algorithms"); - -SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW, - &me.gs_debug, 0, "Debug level"); - -SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW, - &me.gs_expire_secs, 0, "Expire time in seconds"); - -/* - * g_sched calls the scheduler algorithms with this lock held. - * The locking functions are exposed so the scheduler algorithms can also - * protect themselves e.g. when running a callout handler. - */ -void -g_sched_lock(struct g_geom *gp) -{ - struct g_sched_softc *sc = gp->softc; - - mtx_lock(&sc->sc_mtx); -} - -void -g_sched_unlock(struct g_geom *gp) -{ - struct g_sched_softc *sc = gp->softc; - - mtx_unlock(&sc->sc_mtx); -} - -/* - * Support functions to handle references to the module, - * which are coming from devices using this scheduler. - */ -static inline void -g_gsched_ref(struct g_gsched *gsp) -{ - - atomic_add_int(&gsp->gs_refs, 1); -} - -static inline void -g_gsched_unref(struct g_gsched *gsp) -{ - - atomic_add_int(&gsp->gs_refs, -1); -} - -/* - * Update the stats when this request is done. - */ -static void -g_sched_update_stats(struct bio *bio) -{ - - me.gs_done++; - me.gs_in_flight--; - me.gs_bytes_in_flight -= bio->bio_length; - if (bio->bio_cmd == BIO_WRITE) { - me.gs_writes_in_flight--; - me.gs_write_bytes_in_flight -= bio->bio_length; - } -} - -/* - * Dispatch any pending request. - */ -static void -g_sched_forced_dispatch(struct g_geom *gp) -{ - struct g_sched_softc *sc = gp->softc; - struct g_gsched *gsp = sc->sc_gsched; - struct bio *bp; - - KASSERT(mtx_owned(&sc->sc_mtx), - ("sc_mtx not owned during forced dispatch")); - - while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL) - g_io_request(bp, LIST_FIRST(&gp->consumer)); -} - -/* - * The main dispatch loop, called either here after the start - * routine, or by scheduling algorithms when they receive a timeout - * or a 'done' notification. Does not share code with the forced - * dispatch path, since the gs_done() callback can call us. 
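The in-flight counters above are maintained symmetrically: incremented when g_sched_start() issues a request, decremented by g_sched_update_stats() when it completes, with writes and write bytes tracked separately. A minimal sketch of that accounting, using hypothetical io_stats names rather than the module's globals:

#include <stdint.h>

struct io_stats {
	unsigned long requests;		/* total issued */
	unsigned long done;		/* total completed */
	unsigned in_flight;		/* requests outstanding */
	unsigned writes_in_flight;
	uint64_t bytes_in_flight;
	uint64_t write_bytes_in_flight;
};

static void
stats_issue(struct io_stats *st, uint64_t len, int is_write)
{
	st->requests++;
	st->in_flight++;
	st->bytes_in_flight += len;
	if (is_write) {
		st->writes_in_flight++;
		st->write_bytes_in_flight += len;
	}
}

static void
stats_done(struct io_stats *st, uint64_t len, int is_write)
{
	st->done++;
	st->in_flight--;
	st->bytes_in_flight -= len;
	if (is_write) {
		st->writes_in_flight--;
		st->write_bytes_in_flight -= len;
	}
}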
- */ -void -g_sched_dispatch(struct g_geom *gp) -{ - struct g_sched_softc *sc = gp->softc; - struct g_gsched *gsp = sc->sc_gsched; - struct bio *bp; - - KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch")); - - if ((sc->sc_flags & G_SCHED_FLUSHING)) - return; - - while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL) - g_io_request(bp, LIST_FIRST(&gp->consumer)); -} - -/* - * Recent (8.0 and above) versions of FreeBSD have support to - * register classifiers of disk requests. The classifier is - * invoked by g_io_request(), and stores the information into - * bp->bio_classifier1. - * - * Support for older versions, which is left here only for - * documentation purposes, relies on two hacks: - * 1. classification info is written into the bio_caller1 - * field of the topmost node in the bio chain. This field - * is rarely used, but this module is incompatible with - * those that use bio_caller1 for other purposes, - * such as ZFS and gjournal; - * 2. g_io_request() is patched in-memory when the module is - * loaded, so that the function calls a classifier as its - * first thing. g_io_request() is restored when the module - * is unloaded. This functionality is only supported for - * x86 and amd64, other architectures need source code changes. - */ - -/* - * Lookup the identity of the issuer of the original request. - * In the current implementation we use the curthread of the - * issuer, but different mechanisms may be implemented later - * so we do not make assumptions on the return value which for - * us is just an opaque identifier. - */ - -static inline u_long -g_sched_classify(struct bio *bp) -{ - - /* we have classifier fields in the struct bio */ - return ((u_long)bp->bio_classifier1); -} - -/* Return the hash chain for the given key. */ -static inline struct g_hash * -g_sched_hash(struct g_sched_softc *sc, u_long key) -{ - - return (&sc->sc_hash[key & sc->sc_mask]); -} - -/* - * Helper function for the children classes, which takes - * a geom and a bio and returns the private descriptor - * associated to the request. This involves fetching - * the classification field and [al]locating the - * corresponding entry in the hash table. - */ -void * -g_sched_get_class(struct g_geom *gp, struct bio *bp) -{ - struct g_sched_softc *sc; - struct g_sched_class *gsc; - struct g_gsched *gsp; - struct g_hash *bucket; - u_long key; - - sc = gp->softc; - key = g_sched_classify(bp); - bucket = g_sched_hash(sc, key); - LIST_FOREACH(gsc, bucket, gsc_clist) { - if (key == gsc->gsc_key) { - gsc->gsc_refs++; - return (gsc->gsc_priv); - } - } - - gsp = sc->sc_gsched; - gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size, - M_GEOM_SCHED, M_NOWAIT | M_ZERO); - if (!gsc) - return (NULL); - - if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) { - free(gsc, M_GEOM_SCHED); - return (NULL); - } - - gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. 
*/ - gsc->gsc_key = key; - LIST_INSERT_HEAD(bucket, gsc, gsc_clist); - - gsc->gsc_expire = ticks + me.gs_expire_secs * hz; - - return (gsc->gsc_priv); -} - -/* - * Release a reference to the per-client descriptor. - */ -void -g_sched_put_class(struct g_geom *gp, void *priv) -{ - struct g_sched_class *gsc; - struct g_sched_softc *sc; - - gsc = g_sched_priv2class(priv); - gsc->gsc_expire = ticks + me.gs_expire_secs * hz; - - if (--gsc->gsc_refs > 0) - return; - - sc = gp->softc; - sc->sc_gsched->gs_fini_class(sc->sc_data, priv); - - LIST_REMOVE(gsc, gsc_clist); - free(gsc, M_GEOM_SCHED); -} - -static void -g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, - struct g_gsched *gsp, void *data) -{ - struct g_sched_class *cp, *cp2; - int i; - - if (!hp) - return; - - if (data && gsp->gs_hash_unref) - gsp->gs_hash_unref(data); - - for (i = 0; i < G_SCHED_HASH_SIZE; i++) { - LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) - g_sched_put_class(gp, cp->gsc_priv); - } - - hashdestroy(hp, M_GEOM_SCHED, mask); -} - -static struct g_hash * -g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) -{ - struct g_hash *hash; - - if (gsp->gs_priv_size == 0) - return (NULL); - - hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); - - return (hash); -} - -static void -g_sched_flush_classes(struct g_geom *gp) -{ - struct g_sched_softc *sc; - struct g_sched_class *cp, *cp2; - int i; - - sc = gp->softc; - - if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) - return; - - for (i = 0; i < G_SCHED_HASH_SIZE; i++) { - LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { - if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) - g_sched_put_class(gp, cp->gsc_priv); - } - } - - sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; -} - -/* - * Wait for the completion of any outstanding request. To ensure - * that this does not take forever, the caller has to make sure that - * no new requests enter the scheduler before calling us. - * - * Must be called with the gp mutex held and topology locked. - */ -static int -g_sched_wait_pending(struct g_geom *gp) -{ - struct g_sched_softc *sc = gp->softc; - int endticks = ticks + hz; - - g_topology_assert(); - - while (sc->sc_pending && endticks - ticks >= 0) - msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); - - return (sc->sc_pending ? ETIMEDOUT : 0); -} - -static int -g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp) -{ - struct g_sched_softc *sc = gp->softc; - int error; - - /* Set the flushing flag: new bios will not enter the scheduler. */ - sc->sc_flags |= G_SCHED_FLUSHING; - - g_sched_forced_dispatch(gp); - error = g_sched_wait_pending(gp); - if (error) - goto failed; - - /* No more requests pending or in flight from the old gsp. */ - - g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data); - sc->sc_hash = NULL; - - /* - * Avoid deadlock here by releasing the gp mutex and reacquiring - * it once done. It should be safe, since no reconfiguration or - * destruction can take place due to the geom topology lock; no - * new request can use the current sc_data since we flagged the - * geom as being flushed.
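The per-client lookup that feeds this machinery, g_sched_get_class() above, is a plain find-or-create on a hash bucket, with a new entry born holding two references (one for the table, one for the caller). An illustrative userland reduction, all names hypothetical:

#include <stdlib.h>

#define HASH_SIZE	32		/* like G_SCHED_HASH_SIZE, power of two */

struct klass {
	struct klass *next;
	unsigned long key;		/* opaque classifier value */
	int refs;
};

static struct klass *table[HASH_SIZE];

static struct klass *
class_get(unsigned long key)
{
	struct klass **bucket = &table[key & (HASH_SIZE - 1)];
	struct klass *kp;

	/* Fast path: the class already exists, take a reference. */
	for (kp = *bucket; kp != NULL; kp = kp->next) {
		if (kp->key == key) {
			kp->refs++;
			return (kp);
		}
	}
	/* Slow path: create it and link it into the bucket. */
	kp = calloc(1, sizeof(*kp));
	if (kp == NULL)
		return (NULL);
	kp->key = key;
	kp->refs = 2;		/* one for the table, one for the caller */
	kp->next = *bucket;
	*bucket = kp;
	return (kp);
}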
- */ - g_sched_unlock(gp); - gsp->gs_fini(sc->sc_data); - g_sched_lock(gp); - - sc->sc_gsched = NULL; - sc->sc_data = NULL; - g_gsched_unref(gsp); - -failed: - sc->sc_flags &= ~G_SCHED_FLUSHING; - - return (error); -} - -static int -g_sched_remove(struct g_geom *gp, struct g_gsched *gsp) -{ - int error; - - g_sched_lock(gp); - error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */ - g_sched_unlock(gp); - - return (error); -} - -/* - * Support function for create/taste -- locate the desired - * algorithm and grab a reference to it. - */ -static struct g_gsched * -g_gsched_find(const char *name) -{ - struct g_gsched *gsp = NULL; - - mtx_lock(&me.gs_mtx); - LIST_FOREACH(gsp, &me.gs_scheds, glist) { - if (strcmp(name, gsp->gs_name) == 0) { - g_gsched_ref(gsp); - break; - } - } - mtx_unlock(&me.gs_mtx); - - return (gsp); -} - -/* - * Rebuild the list of scheduler names. - * To be called with me.gs_mtx lock held. - */ -static void -g_gsched_build_names(struct g_gsched *gsp) -{ - int pos, l; - struct g_gsched *cur; - - pos = 0; - LIST_FOREACH(cur, &me.gs_scheds, glist) { - l = strlen(cur->gs_name); - if (l + pos + 1 + 1 < sizeof(me.gs_names)) { - if (pos != 0) - me.gs_names[pos++] = ' '; - strcpy(me.gs_names + pos, cur->gs_name); - pos += l; - } - } - me.gs_names[pos] = '\0'; -} - -/* - * Register or unregister individual scheduling algorithms. - */ -static int -g_gsched_register(struct g_gsched *gsp) -{ - struct g_gsched *cur; - int error = 0; - - mtx_lock(&me.gs_mtx); - LIST_FOREACH(cur, &me.gs_scheds, glist) { - if (strcmp(gsp->gs_name, cur->gs_name) == 0) - break; - } - if (cur != NULL) { - G_SCHED_DEBUG(0, "A scheduler named %s already " - "exists.", gsp->gs_name); - error = EEXIST; - } else { - LIST_INSERT_HEAD(&me.gs_scheds, gsp, glist); - gsp->gs_refs = 1; - me.gs_sched_count++; - g_gsched_build_names(gsp); - } - mtx_unlock(&me.gs_mtx); - - return (error); -} - -struct g_gsched_unregparm { - struct g_gsched *gup_gsp; - int gup_error; -}; - -static void -g_gsched_unregister(void *arg, int flag) -{ - struct g_gsched_unregparm *parm = arg; - struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp; - struct g_sched_softc *sc; - struct g_geom *gp, *gp_tmp; - int error; - - parm->gup_error = 0; - - g_topology_assert(); - - if (flag == EV_CANCEL) - return; - - mtx_lock(&me.gs_mtx); - - LIST_FOREACH_SAFE(gp, &g_sched_class.geom, geom, gp_tmp) { - if (gp->class != &g_sched_class) - continue; /* Should not happen. */ - - sc = gp->softc; - if (sc->sc_gsched == gsp) { - error = g_sched_remove(gp, gsp); - if (error) - goto failed; - } - } - - LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) { - if (cur != gsp) - continue; - - if (gsp->gs_refs != 1) { - G_SCHED_DEBUG(0, "%s still in use.", - gsp->gs_name); - parm->gup_error = EBUSY; - } else { - LIST_REMOVE(gsp, glist); - me.gs_sched_count--; - g_gsched_build_names(gsp); - } - break; - } - - if (cur == NULL) { - G_SCHED_DEBUG(0, "%s not registered.", gsp->gs_name); - parm->gup_error = ENOENT; - } - -failed: - mtx_unlock(&me.gs_mtx); -} - -static inline void -g_gsched_global_init(void) -{ - - if (!me.gs_initialized) { - G_SCHED_DEBUG(0, "Initializing global data."); - mtx_init(&me.gs_mtx, "gsched", NULL, MTX_DEF); - LIST_INIT(&me.gs_scheds); - bioq_init(&me.gs_pending); - me.gs_initialized = 1; - } -} - -/* - * Module event called when a scheduling algorithm module is loaded or - * unloaded.
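g_gsched_build_names() above rebuilds the space-separated catalog of algorithm names in a fixed buffer (exported through the kern.geom.sched.algorithms sysctl), silently skipping names that would overflow it. The same logic in isolation, with an illustrative signature rather than the module's globals:

#include <string.h>

static void
build_names(char *buf, size_t bufsz, const char *const names[], int n)
{
	size_t pos = 0, l;
	int i;

	for (i = 0; i < n; i++) {
		l = strlen(names[i]);
		/* need room for an optional ' ', the name and the NUL */
		if (pos + (pos != 0) + l + 1 > bufsz)
			continue;
		if (pos != 0)
			buf[pos++] = ' ';
		memcpy(buf + pos, names[i], l);
		pos += l;
	}
	buf[pos] = '\0';
}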
- */ -int -g_gsched_modevent(module_t mod, int cmd, void *arg) -{ - struct g_gsched *gsp = arg; - struct g_gsched_unregparm parm; - int error; - - G_SCHED_DEBUG(0, "Modevent %d.", cmd); - - /* - * If the module is loaded at boot, the geom thread that calls - * g_sched_init() might actually run after g_gsched_modevent(), - * so make sure that the module is properly initialized. - */ - g_gsched_global_init(); - - error = EOPNOTSUPP; - switch (cmd) { - case MOD_LOAD: - error = g_gsched_register(gsp); - G_SCHED_DEBUG(0, "Loaded module %s error %d.", - gsp->gs_name, error); - if (error == 0) - g_retaste(&g_sched_class); - break; - - case MOD_UNLOAD: - parm.gup_gsp = gsp; - parm.gup_error = 0; - - error = g_waitfor_event(g_gsched_unregister, - &parm, M_WAITOK, NULL); - if (error == 0) - error = parm.gup_error; - G_SCHED_DEBUG(0, "Unloaded module %s error %d.", - gsp->gs_name, error); - break; - } - - return (error); -} - -#ifdef KTR -#define TRC_BIO_EVENT(e, bp) g_sched_trace_bio_ ## e (bp) - -static inline char -g_sched_type(struct bio *bp) -{ - - if (bp->bio_cmd == BIO_READ) - return ('R'); - else if (bp->bio_cmd == BIO_WRITE) - return ('W'); - return ('U'); -} - -static inline void -g_sched_trace_bio_START(struct bio *bp) -{ - - CTR5(KTR_GSCHED, "S %lu %c %lu/%lu %lu", g_sched_classify(bp), - g_sched_type(bp), bp->bio_offset / ULONG_MAX, - bp->bio_offset, bp->bio_length); -} - -static inline void -g_sched_trace_bio_DONE(struct bio *bp) -{ - - CTR5(KTR_GSCHED, "D %lu %c %lu/%lu %lu", g_sched_classify(bp), - g_sched_type(bp), bp->bio_offset / ULONG_MAX, - bp->bio_offset, bp->bio_length); -} -#else /* !KTR */ -#define TRC_BIO_EVENT(e, bp) -#endif /* !KTR */ - -/* - * g_sched_done() and g_sched_start() dispatch the geom requests to - * the scheduling algorithm in use. - */ -static void -g_sched_done(struct bio *bio) -{ - struct g_geom *gp = bio->bio_caller2; - struct g_sched_softc *sc = gp->softc; - - TRC_BIO_EVENT(DONE, bio); - - KASSERT(bio->bio_caller1, ("null bio_caller1 in g_sched_done")); - - g_sched_lock(gp); - - g_sched_update_stats(bio); - sc->sc_gsched->gs_done(sc->sc_data, bio); - if (!--sc->sc_pending) - wakeup(gp); - - g_sched_flush_classes(gp); - g_sched_unlock(gp); - - g_std_done(bio); -} - -static void -g_sched_start(struct bio *bp) -{ - struct g_geom *gp = bp->bio_to->geom; - struct g_sched_softc *sc = gp->softc; - struct bio *cbp; - - TRC_BIO_EVENT(START, bp); - G_SCHED_LOGREQ(bp, "Request received."); - - cbp = g_clone_bio(bp); - if (cbp == NULL) { - g_io_deliver(bp, ENOMEM); - return; - } - cbp->bio_done = g_sched_done; - cbp->bio_to = LIST_FIRST(&gp->provider); - KASSERT(cbp->bio_to != NULL, ("NULL provider")); - - /* We only schedule reads and writes. */ - if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE) - goto bypass; - - G_SCHED_LOGREQ(cbp, "Sending request."); - - g_sched_lock(gp); - /* - * Call the algorithm's gs_start to queue the request in the - * scheduler. If gs_start fails then pass the request down, - * otherwise call g_sched_dispatch() which tries to push - * one or more requests down. - */ - if (!sc->sc_gsched || (sc->sc_flags & G_SCHED_FLUSHING) || - sc->sc_gsched->gs_start(sc->sc_data, cbp)) { - g_sched_unlock(gp); - goto bypass; - } - /* - * We use bio_caller1 to mark requests that are scheduled - * so make sure it is not NULL. - */ - if (cbp->bio_caller1 == NULL) - cbp->bio_caller1 = &me; /* anything not NULL */ - - cbp->bio_caller2 = gp; - sc->sc_pending++; - - /* Update general stats. 
*/ - me.gs_in_flight++; - me.gs_requests++; - me.gs_bytes_in_flight += bp->bio_length; - if (bp->bio_cmd == BIO_WRITE) { - me.gs_writes_in_flight++; - me.gs_write_bytes_in_flight += bp->bio_length; - } - g_sched_dispatch(gp); - g_sched_unlock(gp); - return; - -bypass: - cbp->bio_done = g_std_done; - cbp->bio_caller1 = NULL; /* not scheduled */ - g_io_request(cbp, LIST_FIRST(&gp->consumer)); -} - -/* - * The next few functions are the geom glue. - */ -static void -g_sched_orphan(struct g_consumer *cp) -{ - - g_topology_assert(); - g_sched_destroy(cp->geom, 1); -} - -static int -g_sched_access(struct g_provider *pp, int dr, int dw, int de) -{ - struct g_geom *gp; - struct g_consumer *cp; - int error; - - gp = pp->geom; - cp = LIST_FIRST(&gp->consumer); - error = g_access(cp, dr, dw, de); - - return (error); -} - -static void -g_sched_temporary_start(struct bio *bio) -{ - - mtx_lock(&me.gs_mtx); - me.gs_npending++; - bioq_disksort(&me.gs_pending, bio); - mtx_unlock(&me.gs_mtx); -} - -static void -g_sched_flush_pending(g_start_t *start) -{ - struct bio *bp; - - while ((bp = bioq_takefirst(&me.gs_pending))) - start(bp); -} - -static int -g_insert_proxy(struct g_geom *gp, struct g_provider *newpp, - struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp) -{ - struct g_sched_softc *sc = gp->softc; - g_start_t *saved_start, *flush = g_sched_start; - int error = 0, endticks = ticks + hz; - - g_cancel_event(newpp); /* prevent taste() */ - /* copy private fields */ - newpp->private = pp->private; - newpp->index = pp->index; - - /* Queue all the early requests coming for us. */ - me.gs_npending = 0; - saved_start = pp->geom->start; - dstgp->start = g_sched_temporary_start; - - while (pp->nstart - pp->nend != me.gs_npending && - endticks - ticks >= 0) - tsleep(pp, PRIBIO, "-", hz/10); - - if (pp->nstart - pp->nend != me.gs_npending) { - flush = saved_start; - error = ETIMEDOUT; - goto fail; - } - - /* link pp to this geom */ - LIST_REMOVE(pp, provider); - pp->geom = gp; - LIST_INSERT_HEAD(&gp->provider, pp, provider); - - /* - * replicate the counts from the parent in the - * new provider and consumer nodes - */ - cp->acr = newpp->acr = pp->acr; - cp->acw = newpp->acw = pp->acw; - cp->ace = newpp->ace = pp->ace; - sc->sc_flags |= G_SCHED_PROXYING; - -fail: - dstgp->start = saved_start; - - g_sched_flush_pending(flush); - - return (error); -} - -/* - * Create a geom node for the device passed as *pp. - * If successful, add a reference to this gsp. - */ -static int -g_sched_create(struct gctl_req *req, struct g_class *mp, - struct g_provider *pp, struct g_gsched *gsp, int proxy) -{ - struct g_sched_softc *sc = NULL; - struct g_geom *gp, *dstgp; - struct g_provider *newpp = NULL; - struct g_consumer *cp = NULL; - char name[64]; - int error; - - g_topology_assert(); - - snprintf(name, sizeof(name), "%s%s", pp->name, G_SCHED_SUFFIX); - LIST_FOREACH(gp, &mp->geom, geom) { - if (strcmp(gp->name, name) == 0) { - gctl_error(req, "Geom %s already exists.", - name); - return (EEXIST); - } - } - - gp = g_new_geomf(mp, "%s", name); - dstgp = proxy ? pp->geom : gp; /* where do we link the provider */ - - sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); - sc->sc_gsched = gsp; - sc->sc_data = gsp->gs_init(gp); - if (sc->sc_data == NULL) { - error = ENOMEM; - goto fail; - } - - sc->sc_hash = g_sched_hash_init(gsp, &sc->sc_mask, HASH_WAITOK); - - /* - * Do not initialize the flush mechanism, will be initialized - * on the first insertion on the hash table. 
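g_sched_create() above and below unwinds partially constructed state through a single fail: label, releasing resources in an order that is safe whatever point the construction reached. The pattern, reduced to a minimal userland sketch with hypothetical names:

#include <stdlib.h>

struct thing { void *a, *b; };

static struct thing *
thing_create(void)
{
	struct thing *t;

	t = calloc(1, sizeof(*t));
	if (t == NULL)
		return (NULL);
	t->a = malloc(64);
	if (t->a == NULL)
		goto fail;
	t->b = malloc(64);
	if (t->b == NULL)
		goto fail;
	return (t);

fail:
	/* free(NULL) is a no-op, so partially built state is safe. */
	free(t->a);
	free(t->b);
	free(t);
	return (NULL);
}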
- */ - - mtx_init(&sc->sc_mtx, "g_sched_mtx", NULL, MTX_DEF); - - gp->softc = sc; - gp->start = g_sched_start; - gp->orphan = g_sched_orphan; - gp->access = g_sched_access; - gp->dumpconf = g_sched_dumpconf; - - newpp = g_new_providerf(dstgp, "%s", gp->name); - newpp->mediasize = pp->mediasize; - newpp->sectorsize = pp->sectorsize; - - cp = g_new_consumer(gp); - error = g_attach(cp, proxy ? newpp : pp); - if (error != 0) { - gctl_error(req, "Cannot attach to provider %s.", - pp->name); - goto fail; - } - - g_error_provider(newpp, 0); - if (proxy) { - error = g_insert_proxy(gp, newpp, dstgp, pp, cp); - if (error) - goto fail; - } - G_SCHED_DEBUG(0, "Device %s created.", gp->name); - - g_gsched_ref(gsp); - - return (0); - -fail: - if (cp != NULL) { - if (cp->provider != NULL) - g_detach(cp); - g_destroy_consumer(cp); - } - if (newpp != NULL) - g_destroy_provider(newpp); - if (sc->sc_hash) - g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, - gsp, sc->sc_data); - if (sc->sc_data) - gsp->gs_fini(sc->sc_data); - g_free(gp->softc); - g_destroy_geom(gp); - - return (error); -} - -/* - * Support for dynamic switching of scheduling algorithms. - * First initialize the data structures for the new algorithm, - * then call g_sched_remove_locked() to flush all references - * to the old one, finally link the new algorithm. - */ -static int -g_sched_change_algo(struct gctl_req *req, struct g_class *mp, - struct g_provider *pp, struct g_gsched *gsp) -{ - struct g_sched_softc *sc; - struct g_geom *gp; - struct g_hash *newh; - void *data; - u_long mask; - int error = 0; - - gp = pp->geom; - sc = gp->softc; - - data = gsp->gs_init(gp); - if (data == NULL) - return (ENOMEM); - - newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK); - if (gsp->gs_priv_size && !newh) { - error = ENOMEM; - goto fail; - } - - g_sched_lock(gp); - if (sc->sc_gsched) { /* can be NULL in some cases */ - error = g_sched_remove_locked(gp, sc->sc_gsched); - if (error) - goto fail; - } - - g_gsched_ref(gsp); - sc->sc_gsched = gsp; - sc->sc_data = data; - sc->sc_hash = newh; - sc->sc_mask = mask; - - g_sched_unlock(gp); - - return (0); - -fail: - if (newh) - g_sched_hash_fini(gp, newh, mask, gsp, data); - - if (data) - gsp->gs_fini(data); - - g_sched_unlock(gp); - - return (error); -} - -/* - * Stop the request flow directed to the proxy, redirecting the new - * requests to the me.gs_pending queue. 
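Transparent insertion (g_insert_proxy() above) and removal (g_detach_proxy()/g_destroy_proxy() below) both rely on the same trick: temporarily point the provider's start routine at a queue-only stub, then flush the parked requests through whichever start routine ends up in place. A single-threaded sketch of that trick, with no locking and a plain list instead of bioq_disksort(); all names are illustrative:

#include <stddef.h>

struct xbio { struct xbio *next; };

typedef void start_fn(struct xbio *);

static struct xbio *pending;

/* Stand-in start routine: park the request on the pending list. */
static void
temporary_start(struct xbio *bp)
{
	bp->next = pending;
	pending = bp;
}

/* Re-issue everything we parked through the final start routine. */
static void
flush_pending(start_fn *start)
{
	struct xbio *bp;

	while ((bp = pending) != NULL) {
		pending = bp->next;
		start(bp);
	}
}

In the real code the flush target depends on the outcome: the new scheduler's start routine on success, the original geom's on a failed insertion, or a black hole that fails the bios with ENXIO when there is nowhere left to send them.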
- */ -static struct g_provider * -g_detach_proxy(struct g_geom *gp) -{ - struct g_consumer *cp; - struct g_provider *pp, *newpp; - - do { - pp = LIST_FIRST(&gp->provider); - if (pp == NULL) - break; - cp = LIST_FIRST(&gp->consumer); - if (cp == NULL) - break; - newpp = cp->provider; - if (newpp == NULL) - break; - - me.gs_npending = 0; - pp->geom->start = g_sched_temporary_start; - - return (pp); - } while (0); - printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name); - - return (NULL); -} - -static void -g_sched_blackhole(struct bio *bp) -{ - - g_io_deliver(bp, ENXIO); -} - -static inline void -g_reparent_provider(struct g_provider *pp, struct g_geom *gp, - struct g_provider *newpp) -{ - - LIST_REMOVE(pp, provider); - if (newpp) { - pp->private = newpp->private; - pp->index = newpp->index; - } - pp->geom = gp; - LIST_INSERT_HEAD(&gp->provider, pp, provider); -} - -static inline void -g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp) -{ - struct g_geom *gp = oldpp->geom; - - g_reparent_provider(oldpp, newpp->geom, newpp); - - /* - * Hackish: let the system destroy the old provider for us, just - * in case someone attached a consumer to it, in which case a - * direct call to g_destroy_provider() would not work. - */ - g_reparent_provider(newpp, gp, NULL); -} - -/* - * Complete the proxy destruction, linking the old provider to its - * original geom, and destroying the proxy provider. Also take care - * of issuing the pending requests collected in me.gs_pending (if any). - */ -static int -g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp) -{ - struct g_consumer *cp; - struct g_provider *newpp; - - do { - cp = LIST_FIRST(&gp->consumer); - if (cp == NULL) - break; - newpp = cp->provider; - if (newpp == NULL) - break; - - /* Relink the provider to its original geom. */ - g_unproxy_provider(oldpp, newpp); - - /* Detach consumer from provider, and destroy provider. */ - cp->acr = newpp->acr = 0; - cp->acw = newpp->acw = 0; - cp->ace = newpp->ace = 0; - g_detach(cp); - - /* Send the pending bios through the right start function. */ - g_sched_flush_pending(oldpp->geom->start); - - return (0); - } while (0); - printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name); - - /* We cannot send the pending bios anywhere... */ - g_sched_flush_pending(g_sched_blackhole); - - return (EINVAL); -} - -static int -g_sched_destroy(struct g_geom *gp, boolean_t force) -{ - struct g_provider *pp, *oldpp = NULL; - struct g_sched_softc *sc; - struct g_gsched *gsp; - int error; - - g_topology_assert(); - sc = gp->softc; - if (sc == NULL) - return (ENXIO); - if (!(sc->sc_flags & G_SCHED_PROXYING)) { - pp = LIST_FIRST(&gp->provider); - if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { - const char *msg = force ? - "but we force removal" : "cannot remove"; - - G_SCHED_DEBUG(!force, - "Device %s is still open (r%dw%de%d), %s.", - pp->name, pp->acr, pp->acw, pp->ace, msg); - if (!force) - return (EBUSY); - } else { - G_SCHED_DEBUG(0, "Device %s removed.", gp->name); - } - } else - oldpp = g_detach_proxy(gp); - - gsp = sc->sc_gsched; - if (gsp) { - /* - * XXX bad hack here: force a dispatch to release - * any reference to the hash table still held by - * the scheduler. - */ - g_sched_lock(gp); - /* - * We are dying here, no new requests should enter - * the scheduler. This is granted by the topology, - * either in case we were proxying (new bios are - * being redirected) or not (see the access check - * above).
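The bounded waits used on this path (g_sched_wait_pending(), and the similar loop in g_insert_proxy()) compare tick counts by signed difference, the usual wrap-tolerant idiom, rather than by absolute value. A sketch of the test, with 'now' standing in for the kernel's ticks counter:

static int
tick_deadline_passed(int now, int deadline)
{
	/*
	 * Subtract first, then test the sign: the difference stays
	 * meaningful across counter wraparound as long as the two
	 * stamps are less than 2^31 ticks apart. This mirrors the
	 * "endticks - ticks >= 0" tests in the code above.
	 */
	return (deadline - now < 0);
}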
- */ - g_sched_forced_dispatch(gp); - error = g_sched_wait_pending(gp); - - if (error) { - /* - * Not all the requests came home: this might happen - * under heavy load, or if we were waiting for any - * bio which is served in the event path (see - * geom_slice.c for an example of how this can - * happen). Try to restore a working configuration - * if we can. - */ - if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { - g_sched_flush_pending(force ? - g_sched_blackhole : g_sched_start); - } - - /* - * In the forced destroy case there is not much - * we can do: we have pending bios that will call - * g_sched_done() somehow, and we don't want them - * to crash the system by using freed memory. We tell - * the user that something went wrong, and leak some - * memory here. - * Note: the callers using force = 1 ignore the - * return value. - */ - if (force) { - G_SCHED_DEBUG(0, "Pending requests while " - "destroying geom, some memory leaked."); - } - - return (error); - } - - g_sched_unlock(gp); - g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, - gsp, sc->sc_data); - sc->sc_hash = NULL; - gsp->gs_fini(sc->sc_data); - g_gsched_unref(gsp); - sc->sc_gsched = NULL; - } else - error = 0; - - if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { - error = g_destroy_proxy(gp, oldpp); - - if (error) { - if (force) { - G_SCHED_DEBUG(0, "Unrecoverable error while " - "destroying a proxy geom, leaking some " - "memory."); - } - - return (error); - } - } - - mtx_destroy(&sc->sc_mtx); - - g_free(gp->softc); - gp->softc = NULL; - g_wither_geom(gp, ENXIO); - - return (error); -} - -static int -g_sched_destroy_geom(struct gctl_req *req, struct g_class *mp, - struct g_geom *gp) -{ - - return (g_sched_destroy(gp, 0)); -} - -/* - * Functions related to the classification of requests. - * - * On recent FreeBSD versions (8.0 and above), we store a reference - * to the issuer of a request in bp->bio_classifier1 as soon - * as the bio is posted to the geom queue (and not later, because - * requests are managed by the g_down thread afterwards). - */ - -/* - * Classifier support for recent FreeBSD versions: we use - * a very simple classifier, using only curthread to tag a request. - * The classifier is registered at module load, and unregistered - * at module unload. - */ -static int -g_sched_tag(void *arg, struct bio *bp) -{ - - bp->bio_classifier1 = curthread; - return (1); -} - -static struct g_classifier_hook g_sched_classifier = { - .func = g_sched_tag, -}; - -static inline void -g_classifier_ini(void) -{ - - g_register_classifier(&g_sched_classifier); -} - -static inline void -g_classifier_fini(void) -{ - - g_unregister_classifier(&g_sched_classifier); -} - -static void -g_sched_init(struct g_class *mp) -{ - - g_gsched_global_init(); - - G_SCHED_DEBUG(0, "Loading: mp = %p, g_sched_class = %p.", - mp, &g_sched_class); - - /* Patch g_io_request to store classification info in the bio.
*/ - g_classifier_ini(); -} - -static void -g_sched_fini(struct g_class *mp) -{ - - g_classifier_fini(); - - G_SCHED_DEBUG(0, "Unloading..."); - - KASSERT(LIST_EMPTY(&me.gs_scheds), ("still registered schedulers")); - mtx_destroy(&me.gs_mtx); -} - -static int -g_sched_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag, - struct thread *td) -{ - struct g_consumer *cp; - struct g_geom *gp; - - cp = LIST_FIRST(&pp->geom->consumer); - if (cp == NULL) - return (ENOIOCTL); - gp = cp->provider->geom; - if (gp->ioctl == NULL) - return (ENOIOCTL); - return (gp->ioctl(cp->provider, cmd, data, fflag, td)); -} - -/* - * Read the i-th argument for a request, skipping the /dev/ - * prefix if present. - */ -static const char * -g_sched_argi(struct gctl_req *req, int i) -{ - static const char *dev_prefix = "/dev/"; - const char *name; - char param[16]; - int l = strlen(dev_prefix); - - snprintf(param, sizeof(param), "arg%d", i); - name = gctl_get_asciiparam(req, param); - if (name == NULL) - gctl_error(req, "No 'arg%d' argument", i); - else if (strncmp(name, dev_prefix, l) == 0) - name += l; - return (name); -} - -/* - * Fetch nargs and do appropriate checks. - */ -static int -g_sched_get_nargs(struct gctl_req *req) -{ - int *nargs; - - nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); - if (nargs == NULL) { - gctl_error(req, "No 'nargs' argument"); - return (0); - } - if (*nargs <= 0) - gctl_error(req, "Missing device(s)."); - return (*nargs); -} - -/* - * Check whether we should add the class on certain volumes when - * this geom is created. Right now this is under control of a kenv - * variable containing the names of all devices that we care about. - * Probably we should only support transparent insertion as the - * preferred mode of operation. - */ -static struct g_geom * -g_sched_taste(struct g_class *mp, struct g_provider *pp, - int flags __unused) -{ - struct g_gsched *gsp = NULL; /* the . algorithm we want */ - const char *s; /* generic string pointer */ - const char *taste_names; /* devices we like */ - int l; - - g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, - mp->name, pp->name); - g_topology_assert(); - - G_SCHED_DEBUG(2, "Tasting %s.", pp->name); - - do { - /* do not taste on ourselves */ - if (pp->geom->class == mp) - break; - - taste_names = kern_getenv("geom.sched.taste"); - if (taste_names == NULL) - break; - - l = strlen(pp->name); - for (s = taste_names; *s && - (s = strstr(s, pp->name)); s++) { - /* further checks for an exact match */ - if ( (s == taste_names || s[-1] == ' ') && - (s[l] == '\0' || s[l] == ' ') ) - break; - } - if (s == NULL) - break; - G_SCHED_DEBUG(0, "Attach device %s match [%s]\n", - pp->name, s); - - /* look up the provider name in the list */ - s = kern_getenv("geom.sched.algo"); - if (s == NULL) - s = "rr"; - - gsp = g_gsched_find(s); /* also get a reference */ - if (gsp == NULL) { - G_SCHED_DEBUG(0, "Bad '%s' algorithm.", s); - break; - } - - /* XXX create with 1 as last argument ? 
*/ - g_sched_create(NULL, mp, pp, gsp, 0); - g_gsched_unref(gsp); - } while (0); - return NULL; -} - -static void -g_sched_ctl_create(struct gctl_req *req, struct g_class *mp, int proxy) -{ - struct g_provider *pp; - struct g_gsched *gsp; - const char *name; - int i, nargs; - - g_topology_assert(); - - name = gctl_get_asciiparam(req, "algo"); - if (name == NULL) { - gctl_error(req, "No '%s' argument", "algo"); - return; - } - - gsp = g_gsched_find(name); /* also get a reference */ - if (gsp == NULL) { - gctl_error(req, "Bad algorithm '%s'", name); - return; - } - - nargs = g_sched_get_nargs(req); - - /* - * Run on the arguments, and break on any error. - * We look for a device name, but skip the /dev/ prefix if any. - */ - for (i = 0; i < nargs; i++) { - name = g_sched_argi(req, i); - if (name == NULL) - break; - pp = g_provider_by_name(name); - if (pp == NULL) { - G_SCHED_DEBUG(1, "Provider %s is invalid.", name); - gctl_error(req, "Provider %s is invalid.", name); - break; - } - if (g_sched_create(req, mp, pp, gsp, proxy) != 0) - break; - } - - g_gsched_unref(gsp); -} - -static void -g_sched_ctl_configure(struct gctl_req *req, struct g_class *mp) -{ - struct g_provider *pp; - struct g_gsched *gsp; - const char *name; - int i, nargs; - - g_topology_assert(); - - name = gctl_get_asciiparam(req, "algo"); - if (name == NULL) { - gctl_error(req, "No '%s' argument", "algo"); - return; - } - - gsp = g_gsched_find(name); /* also get a reference */ - if (gsp == NULL) { - gctl_error(req, "Bad algorithm '%s'", name); - return; - } - - nargs = g_sched_get_nargs(req); - - /* - * Run on the arguments, and break on any error. - * We look for a device name, but skip the /dev/ prefix if any. - */ - for (i = 0; i < nargs; i++) { - name = g_sched_argi(req, i); - if (name == NULL) - break; - pp = g_provider_by_name(name); - if (pp == NULL || pp->geom->class != mp) { - G_SCHED_DEBUG(1, "Provider %s is invalid.", name); - gctl_error(req, "Provider %s is invalid.", name); - break; - } - if (g_sched_change_algo(req, mp, pp, gsp) != 0) - break; - } - - g_gsched_unref(gsp); -} - -static struct g_geom * -g_sched_find_geom(struct g_class *mp, const char *name) -{ - struct g_geom *gp; - - LIST_FOREACH(gp, &mp->geom, geom) { - if (strcmp(gp->name, name) == 0) - return (gp); - } - return (NULL); -} - -static void -g_sched_ctl_destroy(struct gctl_req *req, struct g_class *mp) -{ - int nargs, *force, error, i; - struct g_geom *gp; - const char *name; - - g_topology_assert(); - - nargs = g_sched_get_nargs(req); - - force = gctl_get_paraml(req, "force", sizeof(*force)); - if (force == NULL) { - gctl_error(req, "No 'force' argument"); - return; - } - - for (i = 0; i < nargs; i++) { - name = g_sched_argi(req, i); - if (name == NULL) - break; - - gp = g_sched_find_geom(mp, name); - if (gp == NULL) { - G_SCHED_DEBUG(1, "Device %s is invalid.", name); - gctl_error(req, "Device %s is invalid.", name); - break; - } - - error = g_sched_destroy(gp, *force); - if (error != 0) { - gctl_error(req, "Cannot destroy device %s (error=%d).", - gp->name, error); - break; - } - } -} - -static void -g_sched_config(struct gctl_req *req, struct g_class *mp, const char *verb) -{ - uint32_t *version; - - g_topology_assert(); - - version = gctl_get_paraml(req, "version", sizeof(*version)); - if (version == NULL) { - gctl_error(req, "No '%s' argument.", "version"); - return; - } - - if (*version != G_SCHED_VERSION) { - gctl_error(req, "Userland and kernel parts are " - "out of sync."); - return; - } - - if (strcmp(verb, "create") == 0) { - 
g_sched_ctl_create(req, mp, 0); - return; - } else if (strcmp(verb, "insert") == 0) { - g_sched_ctl_create(req, mp, 1); - return; - } else if (strcmp(verb, "configure") == 0) { - g_sched_ctl_configure(req, mp); - return; - } else if (strcmp(verb, "destroy") == 0) { - g_sched_ctl_destroy(req, mp); - return; - } - - gctl_error(req, "Unknown verb."); -} - -static void -g_sched_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, - struct g_consumer *cp, struct g_provider *pp) -{ - struct g_sched_softc *sc = gp->softc; - struct g_gsched *gsp = sc->sc_gsched; - if (indent == NULL) { /* plaintext */ - sbuf_printf(sb, " algo %s", gsp ? gsp->gs_name : "--"); - } - if (gsp != NULL && gsp->gs_dumpconf) - gsp->gs_dumpconf(sb, indent, gp, cp, pp); -} - -DECLARE_GEOM_CLASS(g_sched_class, g_sched); -MODULE_VERSION(geom_sched, 0); Property changes on: head/sys/geom/sched/g_sched.c ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: head/sys/geom/geom.h =================================================================== --- head/sys/geom/geom.h (revision 356184) +++ head/sys/geom/geom.h (revision 356185) @@ -1,445 +1,432 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _GEOM_GEOM_H_ #define _GEOM_GEOM_H_ #include #include #include #include #include #include #include struct g_class; struct g_geom; struct g_consumer; struct g_provider; struct g_stat; struct thread; struct bio; struct sbuf; struct gctl_req; struct g_configargs; struct disk_zone_args; typedef int g_config_t (struct g_configargs *ca); typedef void g_ctl_req_t (struct gctl_req *, struct g_class *cp, char const *verb); typedef int g_ctl_create_geom_t (struct gctl_req *, struct g_class *cp, struct g_provider *pp); typedef int g_ctl_destroy_geom_t (struct gctl_req *, struct g_class *cp, struct g_geom *gp); typedef int g_ctl_config_geom_t (struct gctl_req *, struct g_geom *gp, const char *verb); typedef void g_init_t (struct g_class *mp); typedef void g_fini_t (struct g_class *mp); typedef struct g_geom * g_taste_t (struct g_class *, struct g_provider *, int flags); typedef int g_ioctl_t(struct g_provider *pp, u_long cmd, void *data, int fflag, struct thread *td); #define G_TF_NORMAL 0 #define G_TF_INSIST 1 #define G_TF_TRANSPARENT 2 typedef int g_access_t (struct g_provider *, int, int, int); /* XXX: not sure about the thread arg */ typedef void g_orphan_t (struct g_consumer *); typedef void g_start_t (struct bio *); typedef void g_spoiled_t (struct g_consumer *); typedef void g_attrchanged_t (struct g_consumer *, const char *attr); typedef void g_provgone_t (struct g_provider *); typedef void g_dumpconf_t (struct sbuf *, const char *indent, struct g_geom *, struct g_consumer *, struct g_provider *); typedef void g_resize_t(struct g_consumer *cp); /* * The g_class structure describes a transformation class. In other words * all BSD disklabel handlers share one g_class, all MBR handlers share * one common g_class and so on. * Certain operations are instantiated on the class, most notably the * taste and config_geom functions. */ struct g_class { const char *name; u_int version; u_int spare0; g_taste_t *taste; g_config_t *config; g_ctl_req_t *ctlreq; g_init_t *init; g_fini_t *fini; g_ctl_destroy_geom_t *destroy_geom; /* * Default values for geom methods */ g_start_t *start; g_spoiled_t *spoiled; g_attrchanged_t *attrchanged; g_dumpconf_t *dumpconf; g_access_t *access; g_orphan_t *orphan; g_ioctl_t *ioctl; g_provgone_t *providergone; g_resize_t *resize; void *spare1; void *spare2; /* * The remaining elements are private */ LIST_ENTRY(g_class) class; LIST_HEAD(,g_geom) geom; }; /* * The g_geom_alias is a list node for aliases for the geom name * for device node creation. */ struct g_geom_alias { LIST_ENTRY(g_geom_alias) ga_next; const char *ga_alias; }; #define G_VERSION_00 0x19950323 #define G_VERSION_01 0x20041207 /* add fflag to g_ioctl_t */ #define G_VERSION G_VERSION_01 /* * The g_geom is an instance of a g_class. */ struct g_geom { char *name; struct g_class *class; LIST_ENTRY(g_geom) geom; LIST_HEAD(,g_consumer) consumer; LIST_HEAD(,g_provider) provider; TAILQ_ENTRY(g_geom) geoms; /* XXX: better name */ int rank; g_start_t *start; g_spoiled_t *spoiled; g_attrchanged_t *attrchanged; g_dumpconf_t *dumpconf; g_access_t *access; g_orphan_t *orphan; g_ioctl_t *ioctl; g_provgone_t *providergone; g_resize_t *resize; void *spare0; void *spare1; void *softc; unsigned flags; #define G_GEOM_WITHER 0x01 #define G_GEOM_VOLATILE_BIO 0x02 #define G_GEOM_IN_ACCESS 0x04 #define G_GEOM_ACCESS_WAIT 0x08 LIST_HEAD(,g_geom_alias) aliases; }; /* * The g_bioq is a queue of struct bio's. * XXX: possibly collection point for statistics. 
* XXX: should (possibly) be collapsed with sys/bio.h::bio_queue_head. */ struct g_bioq { TAILQ_HEAD(, bio) bio_queue; struct mtx bio_queue_lock; int bio_queue_length; }; /* * A g_consumer is an attachment point for a g_provider. One g_consumer * can only be attached to one g_provider, but multiple g_consumers * can be attached to one g_provider. */ struct g_consumer { struct g_geom *geom; LIST_ENTRY(g_consumer) consumer; struct g_provider *provider; LIST_ENTRY(g_consumer) consumers; /* XXX: better name */ int acr, acw, ace; int flags; #define G_CF_SPOILED 0x1 #define G_CF_ORPHAN 0x4 #define G_CF_DIRECT_SEND 0x10 #define G_CF_DIRECT_RECEIVE 0x20 struct devstat *stat; u_int nstart, nend; /* Two fields for the implementing class to use */ void *private; u_int index; }; /* * A g_provider is a "logical disk". */ struct g_provider { char *name; LIST_ENTRY(g_provider) provider; struct g_geom *geom; LIST_HEAD(,g_consumer) consumers; int acr, acw, ace; int error; TAILQ_ENTRY(g_provider) orphan; off_t mediasize; u_int sectorsize; off_t stripesize; off_t stripeoffset; struct devstat *stat; u_int nstart, nend; u_int flags; #define G_PF_WITHER 0x2 #define G_PF_ORPHAN 0x4 #define G_PF_ACCEPT_UNMAPPED 0x8 #define G_PF_DIRECT_SEND 0x10 #define G_PF_DIRECT_RECEIVE 0x20 /* Two fields for the implementing class to use */ void *private; u_int index; }; -/* - * Descriptor of a classifier. We can register a function and - * an argument, which is called by g_io_request() on bio's - * that are not previously classified. - */ -struct g_classifier_hook { - TAILQ_ENTRY(g_classifier_hook) link; - int (*func)(void *arg, struct bio *bp); - void *arg; -}; - /* BIO_GETATTR("GEOM::setstate") argument values. */ #define G_STATE_FAILED 0 #define G_STATE_REBUILD 1 #define G_STATE_RESYNC 2 #define G_STATE_ACTIVE 3 /* geom_dev.c */ struct cdev; void g_dev_print(void); void g_dev_physpath_changed(void); struct g_provider *g_dev_getprovider(struct cdev *dev); /* geom_dump.c */ void (g_trace)(int level, const char *, ...) __printflike(2, 3); #define G_T_TOPOLOGY 0x01 #define G_T_BIO 0x02 #define G_T_ACCESS 0x04 extern int g_debugflags; #define G_F_FOOTSHOOTING 0x10 #define G_F_DISKIOCTL 0x40 #define G_F_CTLDUMP 0x80 #define g_trace(level, fmt, ...) 
do { \ if (__predict_false(g_debugflags & (level))) \ (g_trace)(level, fmt, ## __VA_ARGS__); \ } while (0) /* geom_event.c */ typedef void g_event_t(void *, int flag); #define EV_CANCEL 1 int g_post_event(g_event_t *func, void *arg, int flag, ...); int g_waitfor_event(g_event_t *func, void *arg, int flag, ...); void g_cancel_event(void *ref); int g_attr_changed(struct g_provider *pp, const char *attr, int flag); int g_media_changed(struct g_provider *pp, int flag); int g_media_gone(struct g_provider *pp, int flag); void g_orphan_provider(struct g_provider *pp, int error); void g_waitidlelock(void); /* geom_subr.c */ int g_access(struct g_consumer *cp, int nread, int nwrite, int nexcl); int g_attach(struct g_consumer *cp, struct g_provider *pp); int g_compare_names(const char *namea, const char *nameb); void g_destroy_consumer(struct g_consumer *cp); void g_destroy_geom(struct g_geom *pp); void g_destroy_provider(struct g_provider *pp); void g_detach(struct g_consumer *cp); void g_error_provider(struct g_provider *pp, int error); struct g_provider *g_provider_by_name(char const *arg); void g_geom_add_alias(struct g_geom *gp, const char *alias); int g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len); #define g_getattr(a, c, v) g_getattr__((a), (c), (v), sizeof *(v)) int g_handleattr(struct bio *bp, const char *attribute, const void *val, int len); int g_handleattr_int(struct bio *bp, const char *attribute, int val); int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val); int g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val); int g_handleattr_str(struct bio *bp, const char *attribute, const char *str); struct g_consumer * g_new_consumer(struct g_geom *gp); struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...) __printflike(2, 3); struct g_provider * g_new_providerf(struct g_geom *gp, const char *fmt, ...) 
__printflike(2, 3); void g_resize_provider(struct g_provider *pp, off_t size); int g_retaste(struct g_class *mp); void g_spoil(struct g_provider *pp, struct g_consumer *cp); int g_std_access(struct g_provider *pp, int dr, int dw, int de); void g_std_done(struct bio *bp); void g_std_spoiled(struct g_consumer *cp); void g_wither_geom(struct g_geom *gp, int error); void g_wither_geom_close(struct g_geom *gp, int error); void g_wither_provider(struct g_provider *pp, int error); #if defined(DIAGNOSTIC) || defined(DDB) int g_valid_obj(void const *ptr); #endif #ifdef DIAGNOSTIC #define G_VALID_CLASS(foo) \ KASSERT(g_valid_obj(foo) == 1, ("%p is not a g_class", foo)) #define G_VALID_GEOM(foo) \ KASSERT(g_valid_obj(foo) == 2, ("%p is not a g_geom", foo)) #define G_VALID_CONSUMER(foo) \ KASSERT(g_valid_obj(foo) == 3, ("%p is not a g_consumer", foo)) #define G_VALID_PROVIDER(foo) \ KASSERT(g_valid_obj(foo) == 4, ("%p is not a g_provider", foo)) #else #define G_VALID_CLASS(foo) do { } while (0) #define G_VALID_GEOM(foo) do { } while (0) #define G_VALID_CONSUMER(foo) do { } while (0) #define G_VALID_PROVIDER(foo) do { } while (0) #endif int g_modevent(module_t, int, void *); /* geom_io.c */ struct bio * g_clone_bio(struct bio *); struct bio * g_duplicate_bio(struct bio *); void g_destroy_bio(struct bio *); void g_io_deliver(struct bio *bp, int error); int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr); int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp); int g_io_flush(struct g_consumer *cp); int g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp); -int g_register_classifier(struct g_classifier_hook *hook); -void g_unregister_classifier(struct g_classifier_hook *hook); void g_io_request(struct bio *bp, struct g_consumer *cp); struct bio *g_new_bio(void); struct bio *g_alloc_bio(void); void g_reset_bio(struct bio *); void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error); int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length); int g_delete_data(struct g_consumer *cp, off_t offset, off_t length); void g_format_bio(struct sbuf *, const struct bio *bp); void g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix, ...) 
__printflike(3, 4); int g_use_g_read_data(void *, off_t, void **, int); int g_use_g_write_data(void *, off_t, void *, int); /* geom_kern.c / geom_kernsim.c */ #ifdef _KERNEL extern struct sx topology_lock; struct g_kerneldump { off_t offset; off_t length; struct dumperinfo di; }; MALLOC_DECLARE(M_GEOM); static __inline void * g_malloc(int size, int flags) { void *p; p = malloc(size, M_GEOM, flags); return (p); } static __inline void g_free(void *ptr) { #ifdef DIAGNOSTIC if (sx_xlocked(&topology_lock)) { KASSERT(g_valid_obj(ptr) == 0, ("g_free(%p) of live object, type %d", ptr, g_valid_obj(ptr))); } #endif free(ptr, M_GEOM); } #define g_topology_lock() \ do { \ sx_xlock(&topology_lock); \ } while (0) #define g_topology_try_lock() sx_try_xlock(&topology_lock) #define g_topology_unlock() \ do { \ sx_xunlock(&topology_lock); \ } while (0) #define g_topology_assert() \ do { \ sx_assert(&topology_lock, SX_XLOCKED); \ } while (0) #define g_topology_assert_not() \ do { \ sx_assert(&topology_lock, SX_UNLOCKED); \ } while (0) #define g_topology_sleep(chan, timo) \ sx_sleep(chan, &topology_lock, 0, "gtopol", timo) #define DECLARE_GEOM_CLASS(class, name) \ static moduledata_t name##_mod = { \ #name, g_modevent, &class \ }; \ DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); int g_is_geom_thread(struct thread *td); #endif /* _KERNEL */ /* geom_ctl.c */ int gctl_set_param(struct gctl_req *req, const char *param, void const *ptr, int len); void gctl_set_param_err(struct gctl_req *req, const char *param, void const *ptr, int len); void *gctl_get_param(struct gctl_req *req, const char *param, int *len); char const *gctl_get_asciiparam(struct gctl_req *req, const char *param); void *gctl_get_paraml(struct gctl_req *req, const char *param, int len); void *gctl_get_paraml_opt(struct gctl_req *req, const char *param, int len); int gctl_error(struct gctl_req *req, const char *fmt, ...) __printflike(2, 3); struct g_class *gctl_get_class(struct gctl_req *req, char const *arg); struct g_geom *gctl_get_geom(struct gctl_req *req, struct g_class *mpr, char const *arg); struct g_provider *gctl_get_provider(struct gctl_req *req, char const *arg); #endif /* _GEOM_GEOM_H_ */ Index: head/sys/geom/geom_io.c =================================================================== --- head/sys/geom/geom_io.c (revision 356184) +++ head/sys/geom/geom_io.c (revision 356185) @@ -1,1163 +1,1086 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int g_io_transient_map_bio(struct bio *bp); static struct g_bioq g_bio_run_down; static struct g_bioq g_bio_run_up; /* * Pace is a hint that we've had some trouble recently allocating * bios, so we should back off trying to send I/O down the stack * a bit to let the problem resolve. When pacing, we also turn * off direct dispatch to also reduce memory pressure from I/Os * there, at the expense of some added latency while the memory * pressures exist. See g_io_schedule_down() for more details * and limitations. */ static volatile u_int __read_mostly pace; static uma_zone_t __read_mostly biozone; -/* - * The head of the list of classifiers used in g_io_request. - * Use g_register_classifier() and g_unregister_classifier() - * to add/remove entries to the list. - * Classifiers are invoked in registration order.
- */ -static TAILQ_HEAD(, g_classifier_hook) g_classifier_tailq __read_mostly = - TAILQ_HEAD_INITIALIZER(g_classifier_tailq); - #include static void g_bioq_lock(struct g_bioq *bq) { mtx_lock(&bq->bio_queue_lock); } static void g_bioq_unlock(struct g_bioq *bq) { mtx_unlock(&bq->bio_queue_lock); } #if 0 static void g_bioq_destroy(struct g_bioq *bq) { mtx_destroy(&bq->bio_queue_lock); } #endif static void g_bioq_init(struct g_bioq *bq) { TAILQ_INIT(&bq->bio_queue); mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF); } static struct bio * g_bioq_first(struct g_bioq *bq) { struct bio *bp; bp = TAILQ_FIRST(&bq->bio_queue); if (bp != NULL) { KASSERT((bp->bio_flags & BIO_ONQUEUE), ("Bio not on queue bp=%p target %p", bp, bq)); bp->bio_flags &= ~BIO_ONQUEUE; TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue); bq->bio_queue_length--; } return (bp); } struct bio * g_new_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_new_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return (bp); } struct bio * g_alloc_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_WAITOK | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return (bp); } void g_destroy_bio(struct bio *bp) { #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif uma_zfree(biozone, bp); } struct bio * g_clone_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO); if (bp2 != NULL) { bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. * BIO_UNMAPPED and BIO_VLIST should be inherited, to properly * indicate which way the buffer is passed. * Other bio flags are not suitable for cloning. 
*/ bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; if (bp->bio_cmd == BIO_ZONE) bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); - /* Inherit classification info from the parent */ - bp2->bio_classifier1 = bp->bio_classifier1; - bp2->bio_classifier2 = bp->bio_classifier2; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) bp2->bio_track_bp = bp->bio_track_bp; #endif bp->bio_children++; } #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return(bp2); } struct bio * g_duplicate_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST); bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3); } #endif return(bp2); } void g_reset_bio(struct bio *bp) { bzero(bp, sizeof(*bp)); } void g_io_init() { g_bioq_init(&g_bio_run_down); g_bioq_init(&g_bio_run_up); biozone = uma_zcreate("g_bio", sizeof (struct bio), NULL, NULL, NULL, NULL, 0, 0); } int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_getattr(%s)", attr); bp = g_alloc_bio(); bp->bio_cmd = BIO_GETATTR; bp->bio_done = NULL; bp->bio_attribute = attr; bp->bio_length = *len; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "ggetattr"); *len = bp->bio_completed; g_destroy_bio(bp); return (error); } int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd); bp = g_alloc_bio(); bp->bio_cmd = BIO_ZONE; bp->bio_done = NULL; /* * XXX KDM need to handle report zone data. */ bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args)); if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) bp->bio_length = zone_args->zone_params.report.entries_allocated * sizeof(struct disk_zone_rep_entry); else bp->bio_length = 0; g_io_request(bp, cp); error = biowait(bp, "gzone"); bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args)); g_destroy_bio(bp); return (error); } /* * Send a BIO_SPEEDUP down the stack. This is used to tell the lower layers that * the upper layers have detected a resource shortage. The lower layers are * advised to stop delaying I/O that they might be holding for performance * reasons and to schedule it (non-trims) or complete it successfully (trims) as * quickly as it can. bio_length is the amount of the shortage. This call * should be non-blocking. bio_resid is used to communicate back if the lower * layers couldn't find bio_length worth of I/O to schedule or discard. A length * of 0 means to do as much as you can (schedule the h/w queues full, discard * all trims). flags are a hint from the upper layers to the lower layers what * operation should be done. 
*/ int g_io_speedup(size_t shortage, u_int flags, size_t *resid, struct g_consumer *cp) { struct bio *bp; int error; KASSERT((flags & (BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE)) != 0, ("Invalid flags passed to g_io_speedup: %#x", flags)); g_trace(G_T_BIO, "bio_speedup(%s, %zu, %#x)", cp->provider->name, shortage, flags); bp = g_new_bio(); if (bp == NULL) return (ENOMEM); bp->bio_cmd = BIO_SPEEDUP; bp->bio_length = shortage; bp->bio_done = NULL; bp->bio_flags |= flags; g_io_request(bp, cp); error = biowait(bp, "gflush"); *resid = bp->bio_resid; g_destroy_bio(bp); return (error); } int g_io_flush(struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name); bp = g_alloc_bio(); bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_done = NULL; bp->bio_attribute = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gflush"); g_destroy_bio(bp); return (error); } static int g_io_check(struct bio *bp) { struct g_consumer *cp; struct g_provider *pp; off_t excess; int error; biotrack(bp, __func__); cp = bp->bio_from; pp = bp->bio_to; /* Fail if access counters don't allow the operation */ switch(bp->bio_cmd) { case BIO_READ: case BIO_GETATTR: if (cp->acr == 0) return (EPERM); break; case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: if (cp->acw == 0) return (EPERM); break; case BIO_ZONE: if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) || (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) { if (cp->acr == 0) return (EPERM); } else if (cp->acw == 0) return (EPERM); break; default: return (EPERM); } /* if provider is marked for error, don't disturb. */ if (pp->error) return (pp->error); if (cp->flags & G_CF_ORPHAN) return (ENXIO); switch(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: /* Zero sectorsize or mediasize is probably a lack of media. */ if (pp->sectorsize == 0 || pp->mediasize == 0) return (ENXIO); /* Reject I/O not on sector boundary */ if (bp->bio_offset % pp->sectorsize) return (EINVAL); /* Reject I/O not integral sector long */ if (bp->bio_length % pp->sectorsize) return (EINVAL); /* Reject requests before or past the end of media. */ if (bp->bio_offset < 0) return (EIO); if (bp->bio_offset > pp->mediasize) return (EIO); /* Truncate requests to the end of the provider's media. */ excess = bp->bio_offset + bp->bio_length; if (excess > bp->bio_to->mediasize) { KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE == bp->bio_ma_n, ("excess bio %p too short", bp)); excess -= bp->bio_to->mediasize; bp->bio_length -= excess; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE; } if (excess > 0) CTR3(KTR_GEOM, "g_down truncated bio " "%p provider %s by %d", bp, bp->bio_to->name, excess); } /* Deliver zero length transfers right here. */ if (bp->bio_length == 0) { CTR2(KTR_GEOM, "g_down terminated 0-length " "bp %p provider %s", bp, bp->bio_to->name); return (0); } if ((bp->bio_flags & BIO_UNMAPPED) != 0 && (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { if ((error = g_io_transient_map_bio(bp)) >= 0) return (error); } break; default: break; } return (EJUSTRETURN); } -/* - * bio classification support. - * - * g_register_classifier() and g_unregister_classifier() - * are used to add/remove a classifier from the list.
- * The list is protected using the g_bio_run_down lock, - * because the classifiers are called in this path. - * - * g_io_request() passes bio's that are not already classified - * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers(). - * Classifiers can store their result in the two fields - * bio_classifier1 and bio_classifier2. - * A classifier that updates one of the fields should - * return a non-zero value. - * If no classifier updates the field, g_run_classifiers() sets - * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls. - */ - -int -g_register_classifier(struct g_classifier_hook *hook) -{ - - g_bioq_lock(&g_bio_run_down); - TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link); - g_bioq_unlock(&g_bio_run_down); - - return (0); -} - void -g_unregister_classifier(struct g_classifier_hook *hook) -{ - struct g_classifier_hook *entry; - - g_bioq_lock(&g_bio_run_down); - TAILQ_FOREACH(entry, &g_classifier_tailq, link) { - if (entry == hook) { - TAILQ_REMOVE(&g_classifier_tailq, hook, link); - break; - } - } - g_bioq_unlock(&g_bio_run_down); -} - -static void -g_run_classifiers(struct bio *bp) -{ - struct g_classifier_hook *hook; - int classified = 0; - - biotrack(bp, __func__); - - TAILQ_FOREACH(hook, &g_classifier_tailq, link) - classified |= hook->func(hook->arg, bp); - - if (!classified) - bp->bio_classifier1 = BIO_NOTCLASSIFIED; -} - -void g_io_request(struct bio *bp, struct g_consumer *cp) { struct g_provider *pp; struct mtx *mtxp; int direct, error, first; uint8_t cmd; biotrack(bp, __func__); KASSERT(cp != NULL, ("NULL cp in g_io_request")); KASSERT(bp != NULL, ("NULL bp in g_io_request")); pp = cp->provider; KASSERT(pp != NULL, ("consumer not attached in g_io_request")); #ifdef DIAGNOSTIC KASSERT(bp->bio_driver1 == NULL, ("bio_driver1 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_driver2 == NULL, ("bio_driver2 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_pflags == 0, ("bio_pflags used by the consumer (geom %s)", cp->geom->name)); /* * Remember consumer's private fields, so we can detect if they were * modified by the provider. 
*/ bp->_bio_caller1 = bp->bio_caller1; bp->_bio_caller2 = bp->bio_caller2; bp->_bio_cflags = bp->bio_cflags; #endif cmd = bp->bio_cmd; if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) { KASSERT(bp->bio_data != NULL, ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_DELETE || cmd == BIO_FLUSH) { KASSERT(bp->bio_data == NULL, ("non-NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) { KASSERT(bp->bio_offset % cp->provider->sectorsize == 0, ("wrong offset %jd for sectorsize %u", bp->bio_offset, cp->provider->sectorsize)); KASSERT(bp->bio_length % cp->provider->sectorsize == 0, ("wrong length %jd for sectorsize %u", bp->bio_length, cp->provider->sectorsize)); } g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd); bp->bio_from = cp; bp->bio_to = pp; bp->bio_error = 0; bp->bio_completed = 0; KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&bp->bio_t0); else getbinuptime(&bp->bio_t0); #ifdef GET_STACK_USAGE direct = (cp->flags & G_CF_DIRECT_SEND) != 0 && (pp->flags & G_PF_DIRECT_RECEIVE) != 0 && !g_is_geom_thread(curthread) && ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) && pace == 0; if (direct) { /* Block direct execution if less than half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif - - if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) { - g_bioq_lock(&g_bio_run_down); - g_run_classifiers(bp); - g_bioq_unlock(&g_bio_run_down); - } /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. */ mtxp = mtx_pool_find(mtxpool_sleep, pp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_start_transaction(pp->stat, &bp->bio_t0); if (g_collectstats & G_STATS_CONSUMERS) devstat_start_transaction(cp->stat, &bp->bio_t0); pp->nstart++; cp->nstart++; mtx_unlock(mtxp); if (direct) { error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p " "provider %s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); return; } bp->bio_to->geom->start(bp); } else { g_bioq_lock(&g_bio_run_down); first = TAILQ_EMPTY(&g_bio_run_down.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_down.bio_queue_length++; g_bioq_unlock(&g_bio_run_down); /* Pass it on down. */ if (first) wakeup(&g_wait_down); } } void g_io_deliver(struct bio *bp, int error) { struct bintime now; struct g_consumer *cp; struct g_provider *pp; struct mtx *mtxp; int direct, first; biotrack(bp, __func__); KASSERT(bp != NULL, ("NULL bp in g_io_deliver")); pp = bp->bio_to; KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver")); cp = bp->bio_from; if (cp == NULL) { bp->bio_error = error; bp->bio_done(bp); return; } KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver")); KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver")); #ifdef DIAGNOSTIC /* * Some classes - GJournal in particular - can modify bio's * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO * flag means it's an expected behaviour for that particular geom.
*/ if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) { KASSERT(bp->bio_caller1 == bp->_bio_caller1, ("bio_caller1 used by the provider %s", pp->name)); KASSERT(bp->bio_caller2 == bp->_bio_caller2, ("bio_caller2 used by the provider %s", pp->name)); KASSERT(bp->bio_cflags == bp->_bio_cflags, ("bio_cflags used by the provider %s", pp->name)); } #endif KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0")); KASSERT(bp->bio_completed <= bp->bio_length, ("bio_completed can't be greater than bio_length")); g_trace(G_T_BIO, "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); /* * XXX: next two don't belong here */ bp->bio_bcount = bp->bio_length; bp->bio_resid = bp->bio_bcount - bp->bio_completed; #ifdef GET_STACK_USAGE direct = (pp->flags & G_PF_DIRECT_SEND) && (cp->flags & G_CF_DIRECT_RECEIVE) && !g_is_geom_thread(curthread); if (direct) { /* Block direct execution if less than half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. */ if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&now); mtxp = mtx_pool_find(mtxpool_sleep, cp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_end_transaction_bio_bt(pp->stat, bp, &now); if (g_collectstats & G_STATS_CONSUMERS) devstat_end_transaction_bio_bt(cp->stat, bp, &now); cp->nend++; pp->nend++; mtx_unlock(mtxp); if (error != ENOMEM) { bp->bio_error = error; if (direct) { biodone(bp); } else { g_bioq_lock(&g_bio_run_up); first = TAILQ_EMPTY(&g_bio_run_up.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_up.bio_queue_length++; g_bioq_unlock(&g_bio_run_up); if (first) wakeup(&g_wait_up); } return; } if (bootverbose) printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name); bp->bio_children = 0; bp->bio_inbed = 0; bp->bio_driver1 = NULL; bp->bio_driver2 = NULL; bp->bio_pflags = 0; g_io_request(bp, cp); pace = 1; return; } SYSCTL_DECL(_kern_geom); static long transient_maps; SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, &transient_maps, 0, "Total count of the transient mapping requests"); u_int transient_map_retries = 10; SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW, &transient_map_retries, 0, "Max count of retries used before giving up on creating transient map"); int transient_map_hard_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD, &transient_map_hard_failures, 0, "Failures to establish the transient mapping due to retry attempts " "exhausted"); int transient_map_soft_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD, &transient_map_soft_failures, 0, "Count of retried failures to establish the transient mapping"); int inflight_transient_maps; SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, &inflight_transient_maps, 0, "Current count of the active transient maps"); static int g_io_transient_map_bio(struct bio *bp) { vm_offset_t addr; long size; u_int retried; KASSERT(unmapped_buf_allowed, ("unmapped disabled")); size =
round_page(bp->bio_ma_offset + bp->bio_length); KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp)); addr = 0; retried = 0; atomic_add_long(&transient_maps, 1); retry: if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) { if (transient_map_retries != 0 && retried >= transient_map_retries) { CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s", bp, bp->bio_to->name); atomic_add_int(&transient_map_hard_failures, 1); return (EDEADLK/* XXXKIB */); } else { /* * Naive attempt to quiesce the I/O to get more * in-flight requests completed and defragment * the transient_arena. */ CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", bp, bp->bio_to->name, retried); pause("g_d_tra", hz / 10); retried++; atomic_add_int(&transient_map_soft_failures, 1); goto retry; } } atomic_add_int(&inflight_transient_maps, 1); pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; bp->bio_flags |= BIO_TRANSIENT_MAPPING; bp->bio_flags &= ~BIO_UNMAPPED; return (EJUSTRETURN); } void g_io_schedule_down(struct thread *tp __unused) { struct bio *bp; int error; for(;;) { g_bioq_lock(&g_bio_run_down); bp = g_bioq_first(&g_bio_run_down); if (bp == NULL) { CTR0(KTR_GEOM, "g_down going to sleep"); msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } CTR0(KTR_GEOM, "g_down has work to do"); g_bioq_unlock(&g_bio_run_down); biotrack(bp, __func__); if (pace != 0) { /* * There has been at least one memory allocation * failure since the last I/O completed. Pause 1ms to * give the system a chance to free up memory. We only * do this once because a large number of allocations * can fail in the direct dispatch case and there's no * relationship between the number of these failures and * the length of the outage. If there's still an outage, * we'll pause again and again until it's * resolved. Older versions paused longer and once per * allocation failure. This was OK for a single threaded * g_down, but with direct dispatch would lead to max of * 10 IOPs for minutes at a time when transient memory * issues prevented allocation for a batch of requests * from the upper layers. * * XXX This pacing is really lame. It needs to be solved * by other methods. This is OK only because the worst * case scenario is so rare. In the worst case scenario * all memory is tied up waiting for I/O to complete * which can never happen since we can't allocate bios * for that I/O.
*/ CTR0(KTR_GEOM, "g_down pacing self"); pause("g_down", min(hz/1000, 1)); pace = 0; } CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp, bp->bio_to->name); error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider " "%s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); continue; } THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); bp->bio_to->geom->start(bp); THREAD_SLEEPING_OK(); } } void g_io_schedule_up(struct thread *tp __unused) { struct bio *bp; for(;;) { g_bioq_lock(&g_bio_run_up); bp = g_bioq_first(&g_bio_run_up); if (bp == NULL) { CTR0(KTR_GEOM, "g_up going to sleep"); msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } g_bioq_unlock(&g_bio_run_up); THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off " "%jd len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); biodone(bp); THREAD_SLEEPING_OK(); } } void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error) { struct bio *bp; void *ptr; int errorc; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_read_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_READ; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; ptr = g_malloc(length, M_WAITOK); bp->bio_data = ptr; g_io_request(bp, cp); errorc = biowait(bp, "gread"); if (error != NULL) *error = errorc; g_destroy_bio(bp); if (errorc) { g_free(ptr); ptr = NULL; } return (ptr); } /* * A read function for use by ffs_sbget when used by GEOM-layer routines. */ int g_use_g_read_data(void *devfd, off_t loc, void **bufp, int size) { struct g_consumer *cp; KASSERT(*bufp == NULL, ("g_use_g_read_data: non-NULL *bufp %p\n", *bufp)); cp = (struct g_consumer *)devfd; /* * Take care not to issue an invalid I/O request. The offset of * the superblock candidate must be a multiple of the provider's * sector size, otherwise an FFS can't exist on the provider * anyway. */ if (loc % cp->provider->sectorsize != 0) return (ENOENT); *bufp = g_read_data(cp, loc, size, NULL); if (*bufp == NULL) return (ENOENT); return (0); } int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_write_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_WRITE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "gwrite"); g_destroy_bio(bp); return (error); } /* * A write function for use by ffs_sbput when used by GEOM-layer routines. */ int g_use_g_write_data(void *devfd, off_t loc, void *buf, int size) { return (g_write_data((struct g_consumer *)devfd, loc, buf, size)); } int g_delete_data(struct g_consumer *cp, off_t offset, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize, ("g_delete_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_DELETE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gdelete"); g_destroy_bio(bp); return (error); } void g_print_bio(const char *prefix, const struct bio *bp, const char *fmtsuffix, ...)
{ #ifndef PRINTF_BUFR_SIZE #define PRINTF_BUFR_SIZE 64 #endif char bufr[PRINTF_BUFR_SIZE]; struct sbuf sb, *sbp __unused; va_list ap; sbp = sbuf_new(&sb, bufr, sizeof(bufr), SBUF_FIXEDLEN); KASSERT(sbp != NULL, ("sbuf_new misused?")); sbuf_set_drain(&sb, sbuf_printf_drain, NULL); sbuf_cat(&sb, prefix); g_format_bio(&sb, bp); va_start(ap, fmtsuffix); sbuf_vprintf(&sb, fmtsuffix, ap); va_end(ap); sbuf_nl_terminate(&sb); sbuf_finish(&sb); sbuf_delete(&sb); } void g_format_bio(struct sbuf *sb, const struct bio *bp) { const char *pname, *cmd = NULL; if (bp->bio_to != NULL) pname = bp->bio_to->name; else pname = "[unknown]"; switch (bp->bio_cmd) { case BIO_GETATTR: cmd = "GETATTR"; sbuf_printf(sb, "%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute); return; case BIO_FLUSH: cmd = "FLUSH"; sbuf_printf(sb, "%s[%s]", pname, cmd); return; case BIO_ZONE: { char *subcmd = NULL; cmd = "ZONE"; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: subcmd = "OPEN"; break; case DISK_ZONE_CLOSE: subcmd = "CLOSE"; break; case DISK_ZONE_FINISH: subcmd = "FINISH"; break; case DISK_ZONE_RWP: subcmd = "RWP"; break; case DISK_ZONE_REPORT_ZONES: subcmd = "REPORT ZONES"; break; case DISK_ZONE_GET_PARAMS: subcmd = "GET PARAMS"; break; default: subcmd = "UNKNOWN"; break; } sbuf_printf(sb, "%s[%s,%s]", pname, cmd, subcmd); return; } case BIO_READ: cmd = "READ"; break; case BIO_WRITE: cmd = "WRITE"; break; case BIO_DELETE: cmd = "DELETE"; break; default: cmd = "UNKNOWN"; sbuf_printf(sb, "%s[%s()]", pname, cmd); return; } sbuf_printf(sb, "%s[%s(offset=%jd, length=%jd)]", pname, cmd, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); } Index: head/sys/modules/geom/geom_sched/gsched_delay/Makefile =================================================================== --- head/sys/modules/geom/geom_sched/gsched_delay/Makefile (revision 356184) +++ head/sys/modules/geom/geom_sched/gsched_delay/Makefile (nonexistent) @@ -1,7 +0,0 @@ -# $FreeBSD$ - -KMOD= gsched_delay -SRCS= gs_delay.c - -# ../Makefile.inc automatically included -.include Property changes on: head/sys/modules/geom/geom_sched/gsched_delay/Makefile ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: head/sys/modules/geom/geom_sched/Makefile.inc =================================================================== --- head/sys/modules/geom/geom_sched/Makefile.inc (revision 356184) +++ head/sys/modules/geom/geom_sched/Makefile.inc (nonexistent) @@ -1,9 +0,0 @@ -# $FreeBSD$ -# included by geom_sched children - -.PATH: ${SRCTOP}/sys/geom/sched - -# 6.x needs this path -#CFLAGS += -I${SRCTOP}/sys/geom/sched - -# .include Property changes on: head/sys/modules/geom/geom_sched/Makefile.inc ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: head/sys/modules/geom/geom_sched/gs_sched/Makefile =================================================================== --- head/sys/modules/geom/geom_sched/gs_sched/Makefile (revision 356184) +++ head/sys/modules/geom/geom_sched/gs_sched/Makefile (nonexistent) @@ -1,6 +0,0 @@ -# $FreeBSD$ -KMOD= geom_sched -SRCS= g_sched.c - -# ../Makefile.inc automatically included -.include Property changes on: head/sys/modules/geom/geom_sched/gs_sched/Makefile 
___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: head/sys/modules/geom/geom_sched/Makefile =================================================================== --- head/sys/modules/geom/geom_sched/Makefile (revision 356184) +++ head/sys/modules/geom/geom_sched/Makefile (nonexistent) @@ -1,5 +0,0 @@ -# $FreeBSD$ - -SUBDIR= gs_sched gsched_rr gsched_delay - -.include Property changes on: head/sys/modules/geom/geom_sched/Makefile ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: head/sys/modules/geom/geom_sched/gsched_rr/Makefile =================================================================== --- head/sys/modules/geom/geom_sched/gsched_rr/Makefile (revision 356184) +++ head/sys/modules/geom/geom_sched/gsched_rr/Makefile (nonexistent) @@ -1,7 +0,0 @@ -# $FreeBSD$ - -KMOD= gsched_rr -SRCS= gs_rr.c - -# ../Makefile.inc automatically included -.include Property changes on: head/sys/modules/geom/geom_sched/gsched_rr/Makefile ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: head/sys/modules/geom/Makefile =================================================================== --- head/sys/modules/geom/Makefile (revision 356184) +++ head/sys/modules/geom/Makefile (revision 356185) @@ -1,34 +1,33 @@ # $FreeBSD$ SYSDIR?=${SRCTOP}/sys .include "${SYSDIR}/conf/kern.opts.mk" SUBDIR= geom_bde \ geom_cache \ geom_concat \ geom_eli \ geom_flashmap \ geom_gate \ geom_journal \ geom_label \ geom_linux_lvm \ geom_mirror \ geom_mountver \ geom_multipath \ geom_nop \ geom_part \ geom_raid \ geom_raid3 \ - geom_sched \ geom_shsec \ geom_stripe \ geom_uzip \ geom_vinum \ geom_virstor \ geom_zero .if ${MK_CCD} != "no" || defined(ALL_MODULES) SUBDIR+= geom_ccd .endif .include Index: head/sys/sys/bio.h =================================================================== --- head/sys/sys/bio.h (revision 356184) +++ head/sys/sys/bio.h (revision 356185) @@ -1,191 +1,188 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _SYS_BIO_H_ #define _SYS_BIO_H_ #include #include /* bio_cmd */ #define BIO_READ 0x01 /* Read I/O data */ #define BIO_WRITE 0x02 /* Write I/O data */ #define BIO_DELETE 0x03 /* TRIM or free blocks, i.e. mark as unused */ #define BIO_GETATTR 0x04 /* Get GEOM attributes of object */ #define BIO_FLUSH 0x05 /* Commit outstanding I/O now */ #define BIO_CMD0 0x06 /* Available for local hacks */ #define BIO_CMD1 0x07 /* Available for local hacks */ #define BIO_CMD2 0x08 /* Available for local hacks */ #define BIO_ZONE 0x09 /* Zone command */ #define BIO_SPEEDUP 0x0a /* Upper layers face shortage */ /* bio_flags */ #define BIO_ERROR 0x01 /* An error occurred processing this bio. */ #define BIO_DONE 0x02 /* This bio is finished. */ #define BIO_ONQUEUE 0x04 /* This bio is in a queue & not yet taken. */ /* * This bio must be executed after all previous bios in the queue have been * executed, and before any successive bios can be executed. */ #define BIO_ORDERED 0x08 #define BIO_UNMAPPED 0x10 #define BIO_TRANSIENT_MAPPING 0x20 #define BIO_VLIST 0x40 #define PRINT_BIO_FLAGS "\20\7vlist\6transient_mapping\5unmapped" \ "\4ordered\3onqueue\2done\1error" #define BIO_SPEEDUP_WRITE 0x4000 /* Resource shortage at upper layers */ #define BIO_SPEEDUP_TRIM 0x8000 /* Resource shortage at upper layers */ #ifdef _KERNEL struct disk; struct bio; struct vm_map; -/* Empty classifier tag, to prevent further classification. */ -#define BIO_NOTCLASSIFIED (void *)(~0UL) - typedef void bio_task_t(void *); /* * The bio structure describes an I/O operation in the kernel. */ struct bio { uint16_t bio_cmd; /* I/O operation. */ uint16_t bio_flags; /* General flags. */ uint16_t bio_cflags; /* Private use by the consumer. */ uint16_t bio_pflags; /* Private use by the provider. */ struct cdev *bio_dev; /* Device to do I/O on. */ struct disk *bio_disk; /* Valid below geom_disk.c only */ off_t bio_offset; /* Offset into file. */ long bio_bcount; /* Valid bytes in buffer. */ caddr_t bio_data; /* Memory, superblocks, indirect etc. */ struct vm_page **bio_ma; /* Or unmapped. */ int bio_ma_offset; /* Offset in the first page of bio_ma. */ int bio_ma_n; /* Number of pages in bio_ma. */ int bio_error; /* Errno for BIO_ERROR. */ long bio_resid; /* Remaining I/O in bytes. */ void (*bio_done)(struct bio *); void *bio_driver1; /* Private use by the provider. */ void *bio_driver2; /* Private use by the provider. */ void *bio_caller1; /* Private use by the consumer. */ void *bio_caller2; /* Private use by the consumer. */ TAILQ_ENTRY(bio) bio_queue; /* Disksort queue. 
*/ const char *bio_attribute; /* Attribute for BIO_[GS]ETATTR */ struct disk_zone_args bio_zone;/* Used for BIO_ZONE */ struct g_consumer *bio_from; /* GEOM linkage */ struct g_provider *bio_to; /* GEOM linkage */ off_t bio_length; /* Like bio_bcount */ off_t bio_completed; /* Inverse of bio_resid */ u_int bio_children; /* Number of spawned bios */ u_int bio_inbed; /* Children safely home by now */ struct bio *bio_parent; /* Pointer to parent */ struct bintime bio_t0; /* Time request started */ bio_task_t *bio_task; /* Task_queue handler */ void *bio_task_arg; /* Argument to above */ - void *bio_classifier1; /* Classifier tag. */ - void *bio_classifier2; /* Classifier tag. */ + void *bio_spare1; + void *bio_spare2; #ifdef DIAGNOSTIC void *_bio_caller1; void *_bio_caller2; uint8_t _bio_cflags; #endif #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) struct buf *bio_track_bp; /* Parent buf for tracking */ #endif /* XXX: these go away when bio chaining is introduced */ daddr_t bio_pblkno; /* physical block number */ }; struct uio; struct devstat; struct bio_queue_head { TAILQ_HEAD(bio_queue, bio) queue; off_t last_offset; struct bio *insert_point; int total; int batched; }; extern struct vm_map *bio_transient_map; extern int bio_transient_maxcnt; void biodone(struct bio *bp); void biofinish(struct bio *bp, struct devstat *stat, int error); int biowait(struct bio *bp, const char *wchan); #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location); static __inline void biotrack(struct bio *bp, const char *location) { if (bp->bio_track_bp != NULL) biotrack_buf(bp, location); } #else static __inline void biotrack(struct bio *bp __unused, const char *location __unused) { } #endif void bioq_disksort(struct bio_queue_head *ap, struct bio *bp); struct bio *bioq_first(struct bio_queue_head *head); struct bio *bioq_takefirst(struct bio_queue_head *head); void bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error); void bioq_init(struct bio_queue_head *head); void bioq_insert_head(struct bio_queue_head *head, struct bio *bp); void bioq_insert_tail(struct bio_queue_head *head, struct bio *bp); void bioq_remove(struct bio_queue_head *head, struct bio *bp); int physio(struct cdev *dev, struct uio *uio, int ioflag); #define physread physio #define physwrite physio #endif /* _KERNEL */ #endif /* !_SYS_BIO_H_ */ Index: head/sys/sys/ktr_class.h =================================================================== --- head/sys/sys/ktr_class.h (revision 356184) +++ head/sys/sys/ktr_class.h (revision 356185) @@ -1,89 +1,89 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1996 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. 
* * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: ktr.h,v 1.10.2.7 2000/03/16 21:44:42 cp Exp $ * $FreeBSD$ */ #ifndef _SYS_KTR_CLASS_H_ #define _SYS_KTR_CLASS_H_ /* * KTR trace classes * * Two of the trace classes (KTR_DEV and KTR_SUBSYS) are special in that * they are really placeholders so that individual drivers and subsystems * can map their internal tracing to the general class when they wish to * have tracing enabled and map it to 0 when they don't. */ #define KTR_GEN 0x00000001 /* General (TR) */ #define KTR_NET 0x00000002 /* Network */ #define KTR_DEV 0x00000004 /* Device driver */ #define KTR_LOCK 0x00000008 /* MP locking */ #define KTR_SMP 0x00000010 /* MP general */ #define KTR_SUBSYS 0x00000020 /* Subsystem. */ #define KTR_PMAP 0x00000040 /* Pmap tracing */ #define KTR_MALLOC 0x00000080 /* Malloc tracing */ #define KTR_TRAP 0x00000100 /* Trap processing */ #define KTR_INTR 0x00000200 /* Interrupt tracing */ #define KTR_SIG 0x00000400 /* Signal processing */ #define KTR_SPARE2 0x00000800 /* cxgb, amd64, xen, clk, &c */ #define KTR_PROC 0x00001000 /* Process scheduling */ #define KTR_SYSC 0x00002000 /* System call */ #define KTR_INIT 0x00004000 /* System initialization */ #define KTR_SPARE3 0x00008000 /* cxgb, drm2, ioat, ntb */ -#define KTR_SPARE4 0x00010000 /* geom_sched */ +#define KTR_SPARE4 0x00010000 #define KTR_EVH 0x00020000 /* Eventhandler */ #define KTR_VFS 0x00040000 /* VFS events */ #define KTR_VOP 0x00080000 /* Auto-generated vop events */ #define KTR_VM 0x00100000 /* The virtual memory system */ #define KTR_INET 0x00200000 /* IPv4 stack */ #define KTR_RUNQ 0x00400000 /* Run queue */ #define KTR_SPARE5 0x00800000 #define KTR_UMA 0x01000000 /* UMA slab allocator */ #define KTR_CALLOUT 0x02000000 /* Callouts and timeouts */ #define KTR_GEOM 0x04000000 /* GEOM I/O events */ #define KTR_BUSDMA 0x08000000 /* busdma(9) events */ #define KTR_INET6 0x10000000 /* IPv6 stack */ #define KTR_SCHED 0x20000000 /* Machine parsed sched info. */ #define KTR_BUF 0x40000000 /* Buffer cache */ #define KTR_PTRACE 0x80000000 /* Process debugging. */ #define KTR_ALL 0xffffffff /* KTR trace classes to compile in */ #ifdef KTR #ifndef KTR_COMPILE #define KTR_COMPILE (KTR_ALL) #endif #else /* !KTR */ #undef KTR_COMPILE #define KTR_COMPILE 0 #endif /* KTR */ #endif /* !_SYS_KTR_CLASS_H_ */ Index: head/sys/sys/param.h =================================================================== --- head/sys/sys/param.h (revision 356184) +++ head/sys/sys/param.h (revision 356185) @@ -1,368 +1,368 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml * * scheme is: <major><two digit minor>Rxx * 'R' is in the range 0 to 4 if this is a release branch or * X.0-CURRENT before releng/X.0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1300071 /* Master, propagated to newvers */ +#define __FreeBSD_version 1300072 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative.
*/ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #if defined(_KERNEL) || defined(IN_RTLD) #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 #define P_OSREL_WRFSBASE 1200041 #define P_OSREL_CK_CYLGRP 1200046 #define P_OSREL_VMTOTAL64 1200054 #define P_OSREL_CK_SUPERBLOCK 1300000 #define P_OSREL_CK_INODE 1300005 #define P_OSREL_POWERPC_NEW_AUX_ARGS 1300070 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 255 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without affecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make.
* Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is power of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof().
*/ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */
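The interfaces touched by this revision are terse in header form, so a few short, hedged sketches follow; none of this code is part of the change itself, and every identifier prefixed with example_ is hypothetical. First, the DECLARE_GEOM_CLASS() macro shown in the geom.h hunk wraps a g_class in a moduledata_t and registers it through g_modevent(); a minimal class skeleton looks roughly like this (a real class also fills in methods such as .start and .access):

static struct g_class g_example_class = {
	.name =		"EXAMPLE",
	.version =	G_VERSION,	/* GEOM API version check */
};

/* Expands to a moduledata_t named g_example_mod driven by g_modevent(). */
DECLARE_GEOM_CLASS(g_example_class, g_example);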
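The block comment above g_io_speedup() in geom_io.c spells out its contract; a caller that has detected a resource shortage might use it as sketched here, where the open consumer cp and the 1 MB figure are assumptions:

size_t resid;
int error;

/* Ask the lower layers to hurry along roughly 1 MB of held-back writes. */
error = g_io_speedup(1024 * 1024, BIO_SPEEDUP_WRITE, &resid, cp);
if (error == 0 && resid > 0)
	printf("%zu of the requested bytes could not be found\n", resid);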
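g_read_data(), defined in the geom_io.c hunk, allocates its own buffer from M_GEOM and returns NULL on failure, with the biowait() status stored through the error pointer; the usual pattern, assuming an already-open consumer cp, is:

void *buf;
int error;

/* Length must be sector-aligned, non-zero, and at most MAXPHYS. */
buf = g_read_data(cp, 0, cp->provider->sectorsize, &error);
if (buf == NULL)
	return (error);		/* status of the failed read */
/* ... examine the first sector ... */
g_free(buf);			/* buffer came from g_malloc()/M_GEOM */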
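The removed classifier API also deserves a sketch, since this revision deletes its only in-tree consumer (GEOM_SCHED). Under the deleted interface, a module supplied a g_classifier_hook whose func was invoked on each not-yet-classified bio and could stash a non-NULL tag in bio_classifier1, returning non-zero once it had done so; the hook fields (func, arg, link) match the deleted code, while the classify body below is invented for illustration:

static int
example_classify(void *arg, struct bio *bp)
{

	if (bp->bio_cmd != BIO_READ && bp->bio_cmd != BIO_WRITE)
		return (0);		/* leave the bio for later classifiers */
	bp->bio_classifier1 = arg;	/* non-NULL tag read back by the scheduler */
	return (1);
}

static struct g_classifier_hook example_hook = {
	.func =	example_classify,
	.arg =	&example_hook,		/* any non-NULL per-module cookie */
};

/* Typically paired with MOD_LOAD/MOD_UNLOAD module events: */
g_register_classifier(&example_hook);
/* ... */
g_unregister_classifier(&example_hook);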
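Finally, the bio_queue_head helpers declared near the end of the bio.h hunk survive this change unmodified; a provider commonly stages requests with them roughly as follows (bp is assumed to arrive from the geom's start method, and error handling is omitted):

struct bio_queue_head queue;
struct bio *bp;

bioq_init(&queue);			/* once, when the geom is created */
bioq_insert_tail(&queue, bp);		/* stage requests in arrival order */
/* ... later, drain the staged requests ... */
while ((bp = bioq_takefirst(&queue)) != NULL)
	biodone(bp);			/* complete each staged bio */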