diff --git a/include/Makefile b/include/Makefile --- a/include/Makefile +++ b/include/Makefile @@ -50,7 +50,7 @@ fs/procfs fs/smbfs fs/udf fs/unionfs \ geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \ geom/mirror geom/mountver geom/multipath geom/nop \ - geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \ + geom/raid geom/raid3 geom/shsec geom/stripe geom/union geom/virstor \ net/altq \ net/route \ netgraph/atm netgraph/netflow \ diff --git a/lib/geom/Makefile.classes b/lib/geom/Makefile.classes --- a/lib/geom/Makefile.classes +++ b/lib/geom/Makefile.classes @@ -22,4 +22,5 @@ GEOM_CLASSES+= raid3 GEOM_CLASSES+= shsec GEOM_CLASSES+= stripe +GEOM_CLASSES+= union GEOM_CLASSES+= virstor diff --git a/lib/geom/union/Makefile b/lib/geom/union/Makefile new file mode 100644 --- /dev/null +++ b/lib/geom/union/Makefile @@ -0,0 +1,8 @@ +# $FreeBSD$ + +PACKAGE=runtime +.PATH: ${.CURDIR:H:H}/misc + +GEOM_CLASS= union + +.include diff --git a/lib/geom/union/Makefile.depend b/lib/geom/union/Makefile.depend new file mode 100644 --- /dev/null +++ b/lib/geom/union/Makefile.depend @@ -0,0 +1,19 @@ +# $FreeBSD$ +# Autogenerated - do NOT edit! + +DIRDEPS = \ + gnu/lib/csu \ + include \ + include/xlocale \ + lib/${CSU_DIR} \ + lib/libc \ + lib/libcompiler_rt \ + lib/libgeom \ + sbin/geom/core \ + + +.include + +.if ${DEP_RELDIR} == ${_DEP_RELDIR} +# local dependencies - needed for -jN in clean tree +.endif diff --git a/lib/geom/union/geom_union.c b/lib/geom/union/geom_union.c new file mode 100644 --- /dev/null +++ b/lib/geom/union/geom_union.c @@ -0,0 +1,82 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Marshall Kirk McKusick + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include "core/geom.h" + +uint32_t lib_version = G_LIB_VERSION; +uint32_t version = G_UNION_VERSION; + +struct g_command class_commands[] = { + { "create", G_FLAG_LOADKLD, NULL, + { + { 'o', "offset", "0", G_TYPE_NUMBER }, + { 's', "size", "0", G_TYPE_NUMBER }, + { 'S', "secsize", "0", G_TYPE_NUMBER }, + { 'v', "verbose", NULL, G_TYPE_BOOL }, + { 'Z', "gunionname", G_VAL_OPTIONAL, G_TYPE_STRING }, + G_OPT_SENTINEL + }, + "[-v] [-o offset] [-s size] [-S secsize] [-Z gunionname] " + "upperdev lowerdev" + }, + { "destroy", 0, NULL, + { + { 'f', "force", NULL, G_TYPE_BOOL }, + { 'v', "verbose", NULL, G_TYPE_BOOL }, + G_OPT_SENTINEL + }, + "[-fv] prov ..." + }, + { "reset", 0, NULL, + { + { 'v', "verbose", NULL, G_TYPE_BOOL }, + G_OPT_SENTINEL + }, + "[-v] prov ..." + }, + { "commit", 0, NULL, + { + { 'f', "force", NULL, G_TYPE_BOOL }, + { 'v', "verbose", NULL, G_TYPE_BOOL }, + G_OPT_SENTINEL + }, + "[-v] prov ..." 
+ }, + { "revert", 0, NULL, + { + { 'v', "verbose", NULL, G_TYPE_BOOL }, + G_OPT_SENTINEL + }, + "[-v] prov ..." + }, + G_CMD_SENTINEL +}; diff --git a/lib/geom/union/gunion.8 b/lib/geom/union/gunion.8 new file mode 100644 --- /dev/null +++ b/lib/geom/union/gunion.8 @@ -0,0 +1,305 @@ +.\" +.\" Copyright (c) 2021 Marshall Kirk McKusick +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd October 30, 2021 +.Dt GUNION 8 +.Os +.Sh NAME +.Nm gunion +.Nd "control utility for UNION GEOM class" +.Sh SYNOPSIS +.Nm +.Cm create +.Op Fl v +.Op Fl o Ar offset +.Op Fl s Ar size +.Op Fl S Ar secsize +.Op Fl Z Ar gunionname +.Ar upperdev lowerdev +.Nm +.Cm destroy +.Op Fl fv +.Ar prov ... 
+.Nm +.Cm reset +.Op Fl v +.Ar prov ... +.Nm +.Cm revert +.Op Fl v +.Ar prov ... +.Nm +.Cm commit +.Op Fl fv +.Ar prov ... +.Nm +.Cm list +.Nm +.Cm status +.Nm +.Cm load +.Nm +.Cm unload +.Sh DESCRIPTION +The +.Nm +utility is used to track changes to a read-only disk on a writable disk. +Logically, a writable disk is placed over a read-only disk. +Write requests are intercepted and stored on the writable disk. +Read requests are first checked to see if they have been written +on the top (writable disk) and if found are returned. +If they have not been written on the top disk, +then they are read from the lower disk. +.Pp +The +.Nm +utility can be especially useful if you have a large disk with a +corrupted filesystem that you are unsure of how to repair. +You can use +.Nm +to place another disk over the corrupted disk and then attempt +to repair the filesystem. +If the repair fails, you can revert all the changes in the upper disk +and be back to the unchanged state of the lower disk thus allowing you +to try another approach to repairing it. +If the repair is successful you can request that all the writes recorded +on the top disk be written to the lower disk. +.Pp +Another use of the +.Nm +utility is to try out upgrades to your system. +Place the upper disk over the disk holding your filesystem that +is to be upgraded and then run the upgrade on it. +If it works, commit it; +if it fails, revert the upgrade. +An example is given below. +.Pp +The upper disk must be at least the size of the disk that it covers. +The union metadata exists only for the +period of time that the union is instantiated, +so it is important to commit the updates before destroying the union. +If the top disk is about 2.5 percent larger for 512 byte sector disks +(or 0.5 percent larger for 4K sector disks) than the disk that it covers, +it is possible (though not currently implemented) to save the union +metadata between instantiations of the union device.
+.Pp +If you do not have physical media available to use for the upper layer, the +.Xr md 4 +disk can be used instead. +When used in +.Cm swap +mode the changes are all held in buffer memory. +Pages get pushed out to the swap when the system is under memory pressure, +otherwise they stay in the operating memory. +If long-term persistence is desired, +.Cm vnode +mode can be used in which a regular file is used as backing store. +The disk space used by the file is based on the amount of data that +is written to the top device. +.Pp +The first argument to +.Nm +indicates an action to be performed: +.Bl -tag -width "destroy" +.It Cm create +Set up a union provider on the two given devices. +The first device given is used as the top device and must be writable. +The second device given is used as the bottom device and need only be readable. +The second device may be mounted read-only but it is recommended +that it be unmounted and accessed only through a mount of the union device. +If the operation succeeds, the new provider should appear with name +.Pa /dev/ Ns Ao Ar upperdev Ac Ns - Ns Ao Ar lowerdev Ac Ns Pa .union . +An alternate name can be specified with the +.Fl Z +flag. +The kernel module +.Pa geom_union.ko +will be loaded if it is not loaded already. +.Pp +Additional options include: +.Bl -tag -width "-Z gunionname" +.It Fl o Ar offset +Where to begin on the original provider. +The default is to start at the beginning of the disk (i.e., at offset 0). +This option may be used to skip over partitioning information stored +at the beginning of a disk. +The offset must be a multiple of the sector size. +.It Fl s Ar size +Size of the transparent provider. +The default is to be the same size as the lower disk. +Any extra space at the end of the upper disk may be used to store +union metadata. +.It Fl S Ar secsize +Sector size of the transparent provider. +The default is to be the same sector size as the lower disk. +.It Fl v +Be more verbose.
+.It Fl Z Ar gunionname +The name of the new provider. +The suffix +.Dq .union +will be appended to the provider name. +.El +.It Cm destroy +Turn off the given union providers. +.Pp +Additional options include: +.Bl -tag -width "-f" +.It Fl f +Force the removal of the specified provider. +.It Fl v +Be more verbose. +.El +.It Cm revert +Discard all the changes made in the top layer thus reverting to the +original state of the lower device. +.It Cm commit +Write all the changes made in the top device to the lower device +thus committing the lower device to have the same data as the union. +.Pp +Additional options include: +.Bl -tag -width "-f" +.It Fl f +The +.Cm commit +command will not allow the lower device to be mounted while the +.Cm commit +operation is being done. +However, the +.Fl f +flag may be specified to allow the lower device to be mounted read-only. +To prevent a filesystem panic on the mounted lower-device filesystem, +immediately after the +.Cm commit +operation finishes the lower-device filesystem should be unmounted +and then remounted to update its metadata state. +If the lower-device filesystem is UFS/FFS, +it is simply necessary to upgrade from read-only to read-write as the +filesystem will reload its in-kernel state as part of making that change. +.It Fl v +Be more verbose. +.El +.It Cm reset +Reset statistics for the given union providers. +.It Cm list +See +.Xr geom 8 . +.It Cm status +See +.Xr geom 8 . +.It Cm load +See +.Xr geom 8 . +.It Cm unload +See +.Xr geom 8 . +.El +.Sh EXIT STATUS +Exit status is 0 on success, and 1 if the command fails. +.Sh EXAMPLES +The following example shows how to create and destroy a +union provider with disks +.Pa /dev/da0p1 +as the read-only disk on the bottom and +.Pa /dev/md0 +as the writable disk on the top. +.Bd -literal -offset indent +gunion create -v md0 da0p1 +mount /dev/md0-da0p1.union /mnt +.Ed +.Pp +Proceed to make changes in /mnt filesystem. +If they are successful and you want to keep them:
+.Bd -literal -offset indent +gunion commit -v md0-da0p1.union +.Ed +.Pp +If they are unsuccessful and you want to roll back: +.Bd -literal -offset indent +gunion revert -v md0-da0p1.union +.Ed +.Pp +When done, eliminate the union: +.Bd -literal -offset indent +gunion destroy -v md0-da0p1.union +.Ed +.Pp +All uncommitted changes will be discarded when the union is destroyed. +.Pp +If you use the name of the full disk, for example +.Pa da0 +and it is labelled, +then a union name will appear for the disk as +.Pa md0-da0.union +as well as for each partition on the disk as +.Pa md0-da0p1.union , +.Pa md0-da0p2.union , +etc. +A commit operation can be done only on +.Pa md0-da0.union +and will commit changes to all the partitions. +If partition level commits are desired, +then a union must be created for each partition. +.Pp +The traffic statistics for the given +union providers can be obtained with the +.Cm list +command. +The example below shows the number of bytes written with +.Xr newfs 8 : +.Bd -literal -offset indent +gunion create md0 da0p1 +newfs /dev/md0-da0p1.union +gunion list +.Ed +.Sh SYSCTL VARIABLES +The following +.Xr sysctl 8 +variables can be used to control the behavior of the +.Nm UNION +GEOM class. +The default value is shown next to each variable. +.Bl -tag -width indent +.It Va kern.geom.union.debug : No 0 +Debug level of the +.Nm UNION +GEOM class. +This can be set to a number between 0 and 3 inclusive. +If set to 0, no debug information is printed. +If set to 1, all the verbose messages are logged. +If set to 2, additional error-related information is logged. +If set to 3, the maximum amount of debug information is printed. +.El +.Sh SEE ALSO +.Xr geom 4 , +.Xr geom 8 +.Sh HISTORY +The +.Nm +utility appeared in +.Fx 14.0 .
+.Sh AUTHORS +.An Marshall Kirk McKusick Aq Mt mckusick@mckusick.com diff --git a/sbin/geom/core/geom.8 b/sbin/geom/core/geom.8 --- a/sbin/geom/core/geom.8 +++ b/sbin/geom/core/geom.8 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD$ .\" -.Dd September 14, 2018 +.Dd October 30, 2021 .Dt GEOM 8 .Os .Sh NAME @@ -162,6 +162,8 @@ .It STRIPE .It +UNION +.It VIRSTOR .El .Sh ENVIRONMENT @@ -210,6 +212,7 @@ .Xr gsched 8 , .Xr gshsec 8 , .Xr gstripe 8 , +.Xr gunion 8 , .Xr gvirstor 8 .Sh HISTORY The diff --git a/sys/conf/files b/sys/conf/files --- a/sys/conf/files +++ b/sys/conf/files @@ -3703,6 +3703,7 @@ geom/raid3/g_raid3_ctl.c optional geom_raid3 geom/shsec/g_shsec.c optional geom_shsec geom/stripe/g_stripe.c optional geom_stripe +geom/union/g_union.c optional geom_union geom/uzip/g_uzip.c optional geom_uzip geom/uzip/g_uzip_lzma.c optional geom_uzip geom/uzip/g_uzip_wrkthr.c optional geom_uzip diff --git a/sys/geom/union/g_union.h b/sys/geom/union/g_union.h new file mode 100644 --- /dev/null +++ b/sys/geom/union/g_union.h @@ -0,0 +1,123 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Marshall Kirk McKusick + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
#ifndef _G_UNION_H_
#define _G_UNION_H_

/*
 * Class name, protocol version and the suffix appended to the name of
 * every union provider.  G_UNION_VERSION is the value a control request's
 * "version" parameter is compared against.
 */
#define G_UNION_CLASS_NAME "UNION"
#define G_UNION_VERSION 1
#define G_UNION_SUFFIX ".union"
/*
 * Special flag to instruct gunion to passthrough the underlying provider's
 * physical path
 */
#define G_UNION_PHYSPATH_PASSTHROUGH "\255"

#ifdef _KERNEL
/*
 * Logging wrappers around _GEOM_DEBUG().  All of them pass the
 * "GEOM_UNION" tag and the g_union_debug variable along with the
 * requested level:
 *   G_UNION_DEBUG      - passes NULL in the request slot.
 *   G_UNION_LOGREQLVL  - passes the caller's bp at the given level.
 *   G_UNION_LOGREQ     - G_UNION_LOGREQLVL with the level fixed at 3.
 */
#define G_UNION_DEBUG(lvl, ...) \
    _GEOM_DEBUG("GEOM_UNION", g_union_debug, (lvl), NULL, __VA_ARGS__)
#define G_UNION_LOGREQLVL(lvl, bp, ...) \
    _GEOM_DEBUG("GEOM_UNION", g_union_debug, (lvl), (bp), __VA_ARGS__)
#define G_UNION_LOGREQ(bp, ...) G_UNION_LOGREQLVL(3, bp, __VA_ARGS__)
/*
 * State maintained by each instance of a UNION GEOM.
 *
 * The write map is a two-level structure: sc_writemap_root points at
 * sc_root_size leaf arrays, each holding sc_leaf_size uint64_t entries.
 * The remaining fields record the geometry of the union, the consumers
 * attached to the two underlying providers, and per-operation statistics.
 */
struct g_union_softc {
	struct rwlock sc_rwlock; /* writemap lock */
	uint64_t **sc_writemap_root; /* root of write map */
	uint64_t *sc_leafused; /* 1 => leaf has allocation */
	uint64_t sc_map_size; /* size of write map, in bits */
	long sc_root_size; /* entries in root node */
	long sc_leaf_size; /* entries in leaf node */
	long sc_bits_per_leaf; /* bits held by one leaf node */
	long sc_writemap_memory; /* memory used by writemap, in bytes */
	off_t sc_offset; /* starting offset in lower */
	off_t sc_size; /* size of union geom */
	off_t sc_sectorsize; /* sector size of geom */
	struct g_consumer *sc_uppercp; /* consumer of upper-level provider */
	struct g_consumer *sc_lowercp; /* consumer of lower-level provider */
	long sc_flags; /* see flags below */
	long sc_reads; /* number of reads done */
	long sc_wrotebytes; /* number of bytes written */
	long sc_writes; /* number of writes done */
	long sc_readbytes; /* number of bytes read */
	long sc_deletes; /* number of deletes done */
	long sc_getattrs; /* number of getattrs done */
	long sc_flushes; /* number of flushes done */
	long sc_cmd0s; /* number of cmd0's done */
	long sc_cmd1s; /* number of cmd1's done */
	long sc_cmd2s; /* number of cmd2's done */
	long sc_speedups; /* number of speedups done */
};
/*
 * UNION flags
 *
 * Each flag has a mask and a matching bit number; the bit number is what
 * the atomic test-and-set/test-and-clear helpers operate on.
 */
#define DOING_COMMIT 0x00000001 /* a commit command is in progress */

#define DOING_COMMIT_BITNUM 0 /* a commit command is in progress */

/* Number of bits in one uint64_t write-map entry. */
#define BITS_PER_ENTRY (sizeof(uint64_t) * NBBY)
/* Shorthand for read/write locking and unlocking of sc_rwlock. */
#define G_RLOCK(sc) rw_rlock(&(sc)->sc_rwlock)
#define G_RUNLOCK(sc) rw_runlock(&(sc)->sc_rwlock)
#define G_WLOCK(sc) rw_wlock(&(sc)->sc_rwlock)
#define G_WUNLOCK(sc) rw_wunlock(&(sc)->sc_rwlock)
+ */ +static inline int +g_union_get_writelock(struct g_union_softc *sc) +{ + + return (atomic_testandset_long(&sc->sc_flags, DOING_COMMIT_BITNUM)); +} + +static inline void +g_union_rel_writelock(struct g_union_softc *sc) +{ + long ret __diagused; + + ret = atomic_testandclear_long(&sc->sc_flags, DOING_COMMIT_BITNUM); + KASSERT(ret != 0, ("UNION GEOM releasing unheld lock")); +} + +/* + * Used to track a set of read requests within a single BIO_READ request. + */ +struct g_union_iotrack { + long io_numios; /* number of I/O operations in progress */ + long io_error; /* non-I/O errors */ +}; + +#endif /* _KERNEL */ + +#endif /* _G_UNION_H_ */ diff --git a/sys/geom/union/g_union.c b/sys/geom/union/g_union.c new file mode 100644 --- /dev/null +++ b/sys/geom/union/g_union.c @@ -0,0 +1,1239 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Marshall Kirk McKusick + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +SYSCTL_DECL(_kern_geom); +static SYSCTL_NODE(_kern_geom, OID_AUTO, union, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "GEOM_UNION stuff"); +static u_int g_union_debug = 0; +SYSCTL_UINT(_kern_geom_union, OID_AUTO, debug, CTLFLAG_RW, &g_union_debug, 0, + "Debug level"); + +static void g_union_config(struct gctl_req *req, struct g_class *mp, + const char *verb); +static g_access_t g_union_access; +static g_start_t g_union_start; +static g_dumpconf_t g_union_dumpconf; +static g_orphan_t g_union_orphan; +static int g_union_destroy_geom(struct gctl_req *req, struct g_class *mp, + struct g_geom *gp); +static g_provgone_t g_union_providergone; +static g_resize_t g_union_resize; + +struct g_class g_union_class = { + .name = G_UNION_CLASS_NAME, + .version = G_VERSION, + .ctlreq = g_union_config, + .access = g_union_access, + .start = g_union_start, + .dumpconf = g_union_dumpconf, + .orphan = g_union_orphan, + .destroy_geom = g_union_destroy_geom, + .providergone = g_union_providergone, + .resize = g_union_resize, +}; + +static void g_union_ctl_create(struct gctl_req *req, struct g_class *mp, bool); +static intmax_t g_union_fetcharg(struct gctl_req *req, const char *name); +static bool g_union_verify_nprefix(const char *name); +static void g_union_ctl_destroy(struct gctl_req 
*req, struct g_class *mp, bool); +static struct g_geom *g_union_find_geom(struct g_class *mp, const char *name); +static void g_union_ctl_reset(struct gctl_req *req, struct g_class *mp, bool); +static void g_union_ctl_revert(struct gctl_req *req, struct g_class *mp, bool); +static void g_union_revert(struct g_union_softc *sc); +static void g_union_ctl_commit(struct gctl_req *req, struct g_class *mp, bool); +static void g_union_setmap(struct bio *bp, struct g_union_softc *sc); +static bool g_union_getmap(struct bio *bp, struct g_union_softc *sc, + off_t *len2read); +static void g_union_done(struct bio *bp); +static void g_union_kerneldump(struct bio *bp, struct g_union_softc *sc); +static int g_union_dumper(void *, void *, vm_offset_t, off_t, size_t); +static int g_union_destroy(struct g_geom *gp, bool force, bool verbose); + +/* + * Operate on union-specific configuration commands. + */ +static void +g_union_config(struct gctl_req *req, struct g_class *mp, const char *verb) +{ + uint32_t *version, *verbose; + + g_topology_assert(); + + version = gctl_get_paraml(req, "version", sizeof(*version)); + if (version == NULL) { + gctl_error(req, "No '%s' argument.", "version"); + return; + } + if (*version != G_UNION_VERSION) { + gctl_error(req, "Userland and kernel parts are out of sync."); + return; + } + verbose = gctl_get_paraml(req, "verbose", sizeof(*verbose)); + if (verbose == NULL) { + gctl_error(req, "No '%s' argument.", "verbose"); + return; + } + if (strcmp(verb, "create") == 0) { + g_union_ctl_create(req, mp, *verbose); + return; + } else if (strcmp(verb, "destroy") == 0) { + g_union_ctl_destroy(req, mp, *verbose); + return; + } else if (strcmp(verb, "reset") == 0) { + g_union_ctl_reset(req, mp, *verbose); + return; + } else if (strcmp(verb, "revert") == 0) { + g_union_ctl_revert(req, mp, *verbose); + return; + } else if (strcmp(verb, "commit") == 0) { + g_union_ctl_commit(req, mp, *verbose); + return; + } + + gctl_error(req, "Unknown verb."); +} + +/* + * 
Create a union device. + */ +static void +g_union_ctl_create(struct gctl_req *req, struct g_class *mp, bool verbose) +{ + struct g_provider *upperpp, *lowerpp, *newpp; + struct g_consumer *uppercp, *lowercp; + struct g_union_softc *sc; + struct g_geom_alias *gap; + struct g_geom *gp; + intmax_t offset, secsize, size, needed; + const char *gunionname; + int *nargs, error, i, n; + char name[64]; + + g_topology_assert(); + + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + if (nargs == NULL) { + gctl_error(req, "No '%s' argument", "nargs"); + return; + } + if (*nargs < 2) { + gctl_error(req, "Missing device(s)."); + return; + } + if (*nargs > 2) { + gctl_error(req, "Extra device(s)."); + return; + } + + offset = g_union_fetcharg(req, "offset"); + size = g_union_fetcharg(req, "size"); + secsize = g_union_fetcharg(req, "secsize"); + gunionname = gctl_get_asciiparam(req, "gunionname"); + + upperpp = gctl_get_provider(req, "arg0"); + lowerpp = gctl_get_provider(req, "arg1"); + if (upperpp == NULL || lowerpp == NULL) + /* error message provided by gctl_get_provider() */ + return; + /* Create the union */ + if (secsize == 0) + secsize = lowerpp->sectorsize; + else if ((secsize % lowerpp->sectorsize) != 0) { + gctl_error(req, "Sector size %jd is not a multiple of lower " + "provider %s's %jd sector size.", (intmax_t)secsize, + lowerpp->name, (intmax_t)lowerpp->sectorsize); + return; + } + if (secsize > maxphys) { + gctl_error(req, "Too big secsize %jd for lower provider %s.", + (intmax_t)secsize, lowerpp->name); + return; + } + if (secsize % upperpp->sectorsize != 0) { + gctl_error(req, "Sector size %jd is not a multiple of upper " + "provider %s's %jd sector size.", (intmax_t)secsize, + upperpp->name, (intmax_t)upperpp->sectorsize); + return; + } + if ((offset % secsize) != 0) { + gctl_error(req, "Offset %jd is not a multiple of lower " + "provider %s's %jd sector size.", (intmax_t)offset, + lowerpp->name, (intmax_t)lowerpp->sectorsize); + return; + } + if (size == 
0) + size = lowerpp->mediasize - offset; + else + size -= offset; + if ((size % secsize) != 0) { + gctl_error(req, "Size %jd is not a multiple of sector size " + "%jd.", (intmax_t)size, (intmax_t)secsize); + return; + } + if (offset + size < lowerpp->mediasize) { + gctl_error(req, "Size %jd is too small for lower provider %s, " + "needs %jd.", (intmax_t)(offset + size), lowerpp->name, + lowerpp->mediasize); + return; + } + if (size > upperpp->mediasize) { + gctl_error(req, "Upper provider %s size (%jd) is too small, " + "needs %jd.", upperpp->name, (intmax_t)upperpp->mediasize, + (intmax_t)size); + return; + } + if (gunionname != NULL && !g_union_verify_nprefix(gunionname)) { + gctl_error(req, "Gunion name %s must be alphanumeric.", + gunionname); + return; + } + if (gunionname != NULL) { + n = snprintf(name, sizeof(name), "%s%s", gunionname, + G_UNION_SUFFIX); + } else { + n = snprintf(name, sizeof(name), "%s-%s%s", upperpp->name, + lowerpp->name, G_UNION_SUFFIX); + } + if (n <= 0 || n >= sizeof(name)) { + gctl_error(req, "Invalid provider name."); + return; + } + LIST_FOREACH(gp, &mp->geom, geom) { + if (strcmp(gp->name, name) == 0) { + gctl_error(req, "Provider %s already exists.", name); + return; + } + } + gp = g_new_geomf(mp, "%s", name); + sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); + rw_init(&sc->sc_rwlock, "gunion"); + sc->sc_offset = offset; + sc->sc_size = size; + sc->sc_sectorsize = secsize; + sc->sc_reads = 0; + sc->sc_writes = 0; + sc->sc_deletes = 0; + sc->sc_getattrs = 0; + sc->sc_flushes = 0; + sc->sc_speedups = 0; + sc->sc_cmd0s = 0; + sc->sc_cmd1s = 0; + sc->sc_cmd2s = 0; + sc->sc_readbytes = 0; + sc->sc_wrotebytes = 0; + sc->sc_writemap_memory = 0; + gp->softc = sc; + + newpp = g_new_providerf(gp, "%s", gp->name); + newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; + newpp->mediasize = size; + newpp->sectorsize = secsize; + LIST_FOREACH(gap, &upperpp->aliases, ga_next) + g_provider_add_alias(newpp, "%s%s", gap->ga_alias, + 
G_UNION_SUFFIX); + LIST_FOREACH(gap, &lowerpp->aliases, ga_next) + g_provider_add_alias(newpp, "%s%s", gap->ga_alias, + G_UNION_SUFFIX); + lowercp = g_new_consumer(gp); + lowercp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; + if ((error = g_attach(lowercp, lowerpp)) != 0) { + gctl_error(req, "Error %d: cannot attach to provider %s.", + error, lowerpp->name); + goto fail1; + } + /* request read and exclusive access for lower */ + if ((error = g_access(lowercp, 1, 0, 1)) != 0) { + gctl_error(req, "Error %d: cannot obtain exclusive access to " + "%s.\n\tMust be unmounted or mounted read-only.", error, + lowerpp->name); + goto fail2; + } + uppercp = g_new_consumer(gp); + uppercp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; + if ((error = g_attach(uppercp, upperpp)) != 0) { + gctl_error(req, "Error %d: cannot attach to provider %s.", + error, upperpp->name); + goto fail3; + } + /* request read, write, and exclusive access for upper */ + if ((error = g_access(uppercp, 1, 1, 1)) != 0) { + gctl_error(req, "Error %d: cannot obtain write access to %s.", + error, upperpp->name); + goto fail4; + } + sc->sc_uppercp = uppercp; + sc->sc_lowercp = lowercp; + + newpp->flags |= (upperpp->flags & G_PF_ACCEPT_UNMAPPED) & + (lowerpp->flags & G_PF_ACCEPT_UNMAPPED); + g_error_provider(newpp, 0); + /* + * Allocate the map that tracks the sectors that have been written + * to the top layer. We use a 2-level hierarchy as that lets us + * map up to 1 petabyte using allocations of less than 33 Mb + * when using 4K byte sectors (or 268 Mb with 512 byte sectors). + * + * We totally populate the leaf nodes rather than allocating them + * as they are first used because their usage occurs in the + * g_union_start() routine that may be running in the g_down + * thread which cannot sleep. 
+ */ + sc->sc_map_size = roundup(size / secsize, BITS_PER_ENTRY); + needed = sc->sc_map_size / BITS_PER_ENTRY; + for (sc->sc_root_size = 1; + sc->sc_root_size * sc->sc_root_size < needed; + sc->sc_root_size++) + continue; + sc->sc_writemap_root = g_malloc(sc->sc_root_size * sizeof(uint64_t), + M_WAITOK | M_ZERO); + sc->sc_leaf_size = sc->sc_root_size; + sc->sc_bits_per_leaf = sc->sc_leaf_size * BITS_PER_ENTRY; + sc->sc_leafused = g_malloc(roundup(sc->sc_root_size, BITS_PER_ENTRY), + M_WAITOK | M_ZERO); + for (i = 0; i < sc->sc_root_size; i++) + sc->sc_writemap_root[i] = + g_malloc(sc->sc_leaf_size * sizeof(uint64_t), + M_WAITOK | M_ZERO); + sc->sc_writemap_memory = + (sc->sc_root_size + sc->sc_root_size * sc->sc_leaf_size) * + sizeof(uint64_t) + roundup(sc->sc_root_size, BITS_PER_ENTRY); + if (verbose) + printf("Device %s created with memory map size %jd.\n", + gp->name, sc->sc_writemap_memory); + G_UNION_DEBUG(1, "Device %s created with memory map size %jd.", + gp->name, sc->sc_writemap_memory); + return; + +fail4: + g_detach(uppercp); +fail3: + g_destroy_consumer(uppercp); + g_access(lowercp, -1, 0, -1); +fail2: + g_detach(lowercp); +fail1: + g_destroy_consumer(lowercp); + g_destroy_provider(newpp); + g_destroy_geom(gp); +} + +/* + * Fetch named option and verify that it is positive. + */ +static intmax_t +g_union_fetcharg(struct gctl_req *req, const char *name) +{ + intmax_t *val; + + val = gctl_get_paraml_opt(req, name, sizeof(*val)); + if (val == NULL) + return (0); + if (*val >= 0) + return (*val); + gctl_error(req, "Invalid '%s': negative value, using default", name); + return (0); +} + +/* + * Verify that a name is alphanumeric. + */ +static bool +g_union_verify_nprefix(const char *name) +{ + int i; + + for (i = 0; i < strlen(name); i++) { + if (isalpha(name[i]) == 0 && isdigit(name[i]) == 0) { + return (false); + } + } + return (true); +} + +/* + * Destroy a union device. 
+ *
+ * Each "argN" request parameter names a union geom, optionally prefixed
+ * with _PATH_DEV.  Processing stops at the first name that cannot be
+ * resolved or destroyed, with the failure reported via gctl_error().
+ */
+static void
+g_union_ctl_destroy(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	int *nargs, *force, error, i;
+	struct g_geom *gp;
+	const char *name;
+	char param[16];
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument", "nargs");
+		return;
+	}
+	if (*nargs <= 0) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+	force = gctl_get_paraml(req, "force", sizeof(*force));
+	if (force == NULL) {
+		gctl_error(req, "No 'force' argument");
+		return;
+	}
+
+	for (i = 0; i < *nargs; i++) {
+		snprintf(param, sizeof(param), "arg%d", i);
+		name = gctl_get_asciiparam(req, param);
+		if (name == NULL) {
+			gctl_error(req, "No 'arg%d' argument", i);
+			return;
+		}
+		/* Accept both "/dev/foo" and "foo". */
+		if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0)
+			name += strlen(_PATH_DEV);
+		gp = g_union_find_geom(mp, name);
+		if (gp == NULL) {
+			gctl_error(req, "Device %s is invalid.", name);
+			return;
+		}
+		/*
+		 * The assignment must be parenthesized on its own so that
+		 * error holds the value returned by g_union_destroy()
+		 * rather than the result of the comparison.
+		 */
+		if ((error = g_union_destroy(gp, *force, verbose)) != 0) {
+			gctl_error(req, "Error %d: cannot destroy device %s.",
+			    error, gp->name);
+			return;
+		}
+	}
+}
+
+/*
+ * Find a union geom.
+ *
+ * Walks the geoms of class mp and returns the one whose name equals
+ * name, or NULL when there is none.
+ */
+static struct g_geom *
+g_union_find_geom(struct g_class *mp, const char *name)
+{
+	struct g_geom *gp;
+
+	LIST_FOREACH(gp, &mp->geom, geom) {
+		if (strcmp(gp->name, name) == 0)
+			return (gp);
+	}
+	return (NULL);
+}
+
+/*
+ * Zero out all the statistics associated with a union device.
+ *
+ * Every "argN" request parameter names a provider that must belong to
+ * a geom of class mp.  Processing stops at the first provider that
+ * cannot be resolved or that belongs to another class.
+ */
+static void
+g_union_ctl_reset(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	struct g_union_softc *sc;
+	struct g_provider *pp;
+	struct g_geom *gp;
+	char argname[16];
+	int *nargs, argno;
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument", "nargs");
+		return;
+	}
+	if (*nargs <= 0) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+
+	for (argno = 0; argno < *nargs; argno++) {
+		snprintf(argname, sizeof(argname), "arg%d", argno);
+		if ((pp = gctl_get_provider(req, argname)) == NULL)
+			return;
+		gp = pp->geom;
+		if (gp->class != mp) {
+			gctl_error(req, "Provider %s is invalid.", pp->name);
+			return;
+		}
+		sc = gp->softc;
+
+		/* Per-command request counters. */
+		sc->sc_reads = 0;
+		sc->sc_writes = 0;
+		sc->sc_deletes = 0;
+		sc->sc_getattrs = 0;
+		sc->sc_flushes = 0;
+		sc->sc_speedups = 0;
+		sc->sc_cmd0s = 0;
+		sc->sc_cmd1s = 0;
+		sc->sc_cmd2s = 0;
+
+		/* Byte counters. */
+		sc->sc_readbytes = 0;
+		sc->sc_wrotebytes = 0;
+
+		if (verbose)
+			printf("Device %s has been reset.\n", gp->name);
+		G_UNION_DEBUG(1, "Device %s has been reset.", gp->name);
+	}
+}
+
+/*
+ * Revert all write requests made to the top layer of the union.
+ *
+ * Every "argN" request parameter names a provider that must belong to
+ * a geom of class mp; its write map is cleared with g_union_revert().
+ * Processing stops at the first provider that cannot be resolved or
+ * that belongs to another class.
+ */
+static void
+g_union_ctl_revert(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	struct g_provider *pp;
+	struct g_geom *gp;
+	char argname[16];
+	int *nargs, argno;
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument", "nargs");
+		return;
+	}
+	if (*nargs <= 0) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+
+	for (argno = 0; argno < *nargs; argno++) {
+		snprintf(argname, sizeof(argname), "arg%d", argno);
+		if ((pp = gctl_get_provider(req, argname)) == NULL)
+			return;
+		gp = pp->geom;
+		if (gp->class != mp) {
+			gctl_error(req, "Provider %s is invalid.", pp->name);
+			return;
+		}
+		g_union_revert(gp->softc);
+		if (verbose)
+			printf("Device %s has been reverted.\n", gp->name);
+		G_UNION_DEBUG(1, "Device %s has been reverted.", gp->name);
+	}
+}
+
+/*
+ * Forget every recorded write: with the write lock held, clear each
+ * leaf of the write map and then the leaf-in-use bitmap.
+ */
+static void
+g_union_revert(struct g_union_softc *sc)
+{
+	int leafno;
+
+	G_WLOCK(sc);
+	for (leafno = 0; leafno < sc->sc_root_size; leafno++) {
+		memset(sc->sc_writemap_root[leafno], 0,
+		    sc->sc_leaf_size * sizeof(uint64_t));
+	}
+	memset(sc->sc_leafused, 0, roundup(sc->sc_root_size, BITS_PER_ENTRY));
+	G_WUNLOCK(sc);
+}
+
+/*
+ * Commit all the writes made in the top layer to the lower layer.
+ *
+ * For every provider named in the request: take the union's write lock,
+ * check that neither the union nor its lower provider is otherwise in
+ * use (read-only use is tolerated when the force flag is set), add write
+ * access to the lower consumer, and copy each range recorded in the write
+ * map from the upper consumer to the lower one in chunks of at most
+ * MAXBSIZE bytes.  On success the write map is cleared.  An I/O error
+ * aborts the commit of that provider, leaves its write map untouched and
+ * is reported through gctl_error().
+ */
+static void
+g_union_ctl_commit(struct gctl_req *req, struct g_class *mp, bool verbose)
+{
+	struct g_union_softc *sc;
+	struct g_provider *pp, *lowerpp;
+	struct g_consumer *lowercp;
+	struct g_geom *gp;
+	struct bio *bp;
+	char param[16];
+	off_t len2rd, len2wt, savelen;
+	int i, error, accerr, *nargs, *force;
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument", "nargs");
+		return;
+	}
+	if (*nargs <= 0) {
+		gctl_error(req, "Missing device(s).");
+		return;
+	}
+	force = gctl_get_paraml(req, "force", sizeof(*force));
+	if (force == NULL) {
+		gctl_error(req, "No 'force' argument");
+		return;
+	}
+
+	/* Get a bio buffer to do our I/O */
+	bp = g_alloc_bio();
+	bp->bio_data = g_malloc(MAXBSIZE, M_WAITOK);
+	bp->bio_done = biodone;
+	for (i = 0; i < *nargs; i++) {
+		snprintf(param, sizeof(param), "arg%d", i);
+		pp = gctl_get_provider(req, param);
+		if (pp == NULL)
+			continue;
+		gp = pp->geom;
+		if (gp->class != mp) {
+			gctl_error(req, "Provider %s is invalid.", pp->name);
+			continue;
+		}
+		sc = gp->softc;
+		if (g_union_get_writelock(sc) != 0) {
+			gctl_error(req, "Commit already in progress for "
+			    "provider %s.", pp->name);
+			continue;
+		}
+
+		/* upgrade to write access for lower */
+		lowercp = sc->sc_lowercp;
+		lowerpp = lowercp->provider;
+		/*
+		 * No mount or other use of union is allowed, unless the
+		 * -f flag is given which allows read-only mount or usage.
+		 */
+		if ((*force == false && pp->acr > 0) || pp->acw > 0 ||
+		     pp->ace > 0) {
+			gctl_error(req, "Unable to get exclusive access for "
+			    "writing of %s.\n\tNote that %s cannot be mounted "
+			    "or otherwise\n\topen during a commit unless the "
+			    "-f flag is used.", pp->name, pp->name);
+			g_union_rel_writelock(sc);
+			continue;
+		}
+		/*
+		 * No mount or other use of lower media is allowed, unless the
+		 * -f flag is given which allows read-only mount or usage.
+		 * Our own consumer's counts are excluded from the check.
+		 */
+		if ((*force == false && lowerpp->acr > lowercp->acr) ||
+		     lowerpp->acw > lowercp->acw ||
+		     lowerpp->ace > lowercp->ace) {
+			gctl_error(req, "provider %s is unable to get "
+			    "exclusive access to %s\n\tfor writing. Note that "
+			    "%s cannot be mounted or otherwise open\n\tduring "
+			    "a commit unless the -f flag is used.", pp->name,
+			    lowerpp->name, lowerpp->name);
+			g_union_rel_writelock(sc);
+			continue;
+		}
+		if ((error = g_access(lowercp, 0, 1, 0)) != 0) {
+			gctl_error(req, "Error %d: provider %s is unable to "
+			    "access %s for writing.", error, pp->name,
+			    lowerpp->name);
+			g_union_rel_writelock(sc);
+			continue;
+		}
+		/*
+		 * Loop over write map copying across written blocks.
+		 * From here on error is zero unless an I/O fails.
+		 */
+		bp->bio_offset = 0;
+		bp->bio_length = sc->sc_map_size * sc->sc_sectorsize;
+		G_RLOCK(sc);
+		while (bp->bio_length > 0) {
+			if (!g_union_getmap(bp, sc, &len2rd)) {
+				/* not written, so skip */
+				bp->bio_offset += len2rd;
+				bp->bio_length -= len2rd;
+				continue;
+			}
+			G_RUNLOCK(sc);
+			/* need to read then write len2rd sectors */
+			for ( ; len2rd > 0; len2rd -= len2wt) {
+				/* limit ourselves to MAXBSIZE size I/Os */
+				len2wt = len2rd;
+				if (len2wt > MAXBSIZE)
+					len2wt = MAXBSIZE;
+				savelen = bp->bio_length;
+				bp->bio_length = len2wt;
+				bp->bio_cmd = BIO_READ;
+				g_io_request(bp, sc->sc_uppercp);
+				if ((error = biowait(bp, "rdunion")) != 0) {
+					gctl_error(req, "Commit read error %d "
+					    "in provider %s, commit aborted.",
+					    error, pp->name);
+					goto cleanup;
+				}
+				bp->bio_flags &= ~BIO_DONE;
+				bp->bio_cmd = BIO_WRITE;
+				g_io_request(bp, lowercp);
+				if ((error = biowait(bp, "wtunion")) != 0) {
+					gctl_error(req, "Commit write error %d "
+					    "in provider %s, commit aborted.",
+					    error, pp->name);
+					goto cleanup;
+				}
+				bp->bio_flags &= ~BIO_DONE;
+				bp->bio_offset += len2wt;
+				bp->bio_length = savelen - len2wt;
+			}
+			G_RLOCK(sc);
+		}
+		G_RUNLOCK(sc);
+		/* clear the write map */
+		g_union_revert(sc);
+cleanup:
+		/*
+		 * The error paths jump here without clearing BIO_DONE the
+		 * way the success paths do.  Clear it so that the shared bio
+		 * can be waited on again for the next provider.
+		 */
+		bp->bio_flags &= ~BIO_DONE;
+		/*
+		 * Return lower to previous access.  A separate variable is
+		 * used so that the outcome of the commit itself is kept.
+		 */
+		if ((accerr = g_access(lowercp, 0, -1, 0)) != 0) {
+			G_UNION_DEBUG(2, "Error %d: device %s could not reset "
+			    "access to %s (r=0 w=-1 e=0).", accerr, pp->name,
+			    lowerpp->name);
+		}
+		g_union_rel_writelock(sc);
+		/* Only announce a commit that actually completed. */
+		if (error == 0) {
+			if (verbose)
+				printf("Device %s has been committed.\n",
+				    gp->name);
+			G_UNION_DEBUG(1, "Device %s has been committed.",
+			    gp->name);
+		}
+	}
+	g_free(bp->bio_data);
+	g_destroy_bio(bp);
+}
+
+/*
+ * Generally allow access unless a commit is in progress.
+ *
+ * r, w and e are the requested changes to the provider's access counts.
+ * While the write lock cannot be obtained, only a change that brings an
+ * open provider to a combined count of zero is accepted; anything else
+ * gets EBUSY.  A geom without a softc only accepts requests that do not
+ * raise any count, and answers ENXIO otherwise.
+ */
+static int
+g_union_access(struct g_provider *pp, int r, int w, int e)
+{
+	struct g_union_softc *sc;
+
+	sc = pp->geom->softc;
+	if (sc == NULL) {
+		if (r <= 0 && w <= 0 && e <= 0)
+			return (0);
+		return (ENXIO);
+	}
+	/* Turn the deltas into the counts that would result. */
+	r += pp->acr;
+	w += pp->acw;
+	e += pp->ace;
+	if (g_union_get_writelock(sc) != 0) {
+		if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0)
+			return (0);
+		return (EBUSY);
+	}
+	/* The lock was only taken as a probe; give it back. */
+	g_union_rel_writelock(sc);
+	return (0);
+}
+
+/*
+ * Initiate an I/O operation on the union device.
+ *
+ * Reads are routed to the upper or lower consumer according to the
+ * write map, and are split into several requests when the range is
+ * only partly written.  Writes go to the upper consumer after being
+ * recorded in the write map.  A GEOM::kerneldump attribute request is
+ * answered locally.  Every other request is counted and passed to the
+ * upper consumer.
+ */
+static void
+g_union_start(struct bio *bp)
+{
+	struct g_union_softc *sc;
+	struct g_consumer *cp, *firstcp;
+	struct bio *cbp, *firstbp;
+	struct g_union_iotrack *iotrackp;
+	off_t rdlen, len2rd, offset;
+	char *level;
+	int iocnt;
+
+	cbp = g_clone_bio(bp);
+	if (cbp == NULL) {
+		g_io_deliver(bp, ENOMEM);
+		return;
+	}
+	sc = bp->bio_to->geom->softc;
+	cbp->bio_offset += sc->sc_offset;
+	cbp->bio_done = g_std_done;
+
+	switch (bp->bio_cmd) {
+	case BIO_READ:
+		/*
+		 * The usual read case is that we either read the top layer
+		 * if the block has been previously written or the bottom layer
+		 * if it has not been written. However, it is possible that
+		 * only part of the block has been written. For example we may
+		 * have written a UFS/FFS file fragment comprising several
+		 * sectors out of an 8-sector block. Here, if the entire
+		 * 8-sector block is read for example by a snapshot needing
+		 * to copy the full block, then we need to read the written
+		 * sectors from the upper level and the unwritten sectors from
+		 * the lower level. We do this by alternately reading from the
+		 * top and bottom layers until we complete the read. As
+		 * requests for partially written blocks are uncommon, we
+		 * make no attempt to optimize the code.
+		 *
+		 * NOTE(review): the clones made for the second and later
+		 * pieces only get a new offset and length here; it looks
+		 * like their data pointer still refers to the start of the
+		 * caller's buffer.  Confirm, and advance it by the bytes
+		 * already covered if so.
+		 */
+		atomic_add_long(&sc->sc_reads, 1);
+		atomic_add_long(&sc->sc_readbytes, bp->bio_length);
+		rdlen = cbp->bio_length;
+		G_RLOCK(sc);
+		for (iocnt = 0; ; iocnt++) {
+			if (g_union_getmap(cbp, sc, &len2rd)) {
+				/* read top */
+				cp = sc->sc_uppercp;
+				level = "upper";
+			} else {
+				/* read bottom */
+				cp = sc->sc_lowercp;
+				level = "lower";
+			}
+			/* Check if only a single read is required */
+			if (iocnt == 0 && rdlen == len2rd) {
+				G_UNION_LOGREQ(cbp, "Sending %jd byte read "
+				    "request to %s level.", len2rd, level);
+				g_io_request(cbp, cp);
+				G_RUNLOCK(sc);
+				return;
+			}
+			/*
+			 * io_numios counts the requests that will complete
+			 * through g_union_done(), including the deferred
+			 * first one.
+			 */
+			if (iocnt == 0) {
+				iotrackp = g_malloc(sizeof(*iotrackp),
+				    M_NOWAIT | M_ZERO);
+				if (iotrackp == NULL) {
+					cbp->bio_error = ENOMEM;
+					g_std_done(cbp);
+					G_RUNLOCK(sc);
+					return;
+				}
+				iotrackp->io_numios = 1;
+			} else {
+				atomic_add_long(&iotrackp->io_numios, 1);
+			}
+			cbp->bio_done = g_union_done;
+			cbp->bio_caller1 = iotrackp;
+			cbp->bio_length = len2rd;
+			offset = cbp->bio_offset + len2rd;
+			rdlen -= len2rd;
+			G_UNION_LOGREQ(cbp, "Sending %jd byte read "
+			    "request to %s level.", len2rd, level);
+			/*
+			 * To avoid prematurely notifying our consumer
+			 * that their I/O has completed, we have to delay
+			 * issuing our first I/O request until we have
+			 * issued all the additional I/O requests.
+			 */
+			if (iocnt > 0) {
+				g_io_request(cbp, cp);
+			} else {
+				firstbp = cbp;
+				firstcp = cp;
+			}
+			/* Done once nothing is left to read. */
+			if (rdlen == 0)
+				break;
+			/* set up for next read */
+			cbp = g_clone_bio(bp);
+			if (cbp == NULL) {
+				/*
+				 * The piece that could not be cloned was
+				 * never counted in io_numios, so only the
+				 * error needs to be recorded.
+				 */
+				iotrackp->io_error = ENOMEM;
+				break;
+			}
+			cbp->bio_offset = offset;
+			cbp->bio_length = rdlen;
+			atomic_add_long(&sc->sc_reads, 1);
+		}
+		/* We have issued all our I/O, so start the first one */
+		g_io_request(firstbp, firstcp);
+		G_RUNLOCK(sc);
+		return;
+	case BIO_WRITE:
+		/*
+		 * Writes are always done to the top level. Here, we
+		 * record the blocks that we are writing.
+		 */
+		G_UNION_LOGREQ(cbp, "Sending %jd byte write request to upper "
+		    "level.", cbp->bio_length);
+		atomic_add_long(&sc->sc_writes, 1);
+		atomic_add_long(&sc->sc_wrotebytes, bp->bio_length);
+		g_union_setmap(cbp, sc);
+		break;
+	case BIO_DELETE:
+		G_UNION_LOGREQ(bp, "Delete request received.");
+		atomic_add_long(&sc->sc_deletes, 1);
+		break;
+	case BIO_GETATTR:
+		G_UNION_LOGREQ(bp, "Getattr request received.");
+		atomic_add_long(&sc->sc_getattrs, 1);
+		if (strcmp(bp->bio_attribute, "GEOM::kerneldump") != 0)
+			/* forward other attributes to the upper level */
+			break;
+		/* Answered locally, so the clone is not needed. */
+		g_destroy_bio(cbp);
+		g_union_kerneldump(bp, sc);
+		return;
+	case BIO_FLUSH:
+		G_UNION_LOGREQ(bp, "Flush request received.");
+		atomic_add_long(&sc->sc_flushes, 1);
+		break;
+	case BIO_SPEEDUP:
+		G_UNION_LOGREQ(bp, "Speedup request received.");
+		atomic_add_long(&sc->sc_speedups, 1);
+		break;
+	case BIO_CMD0:
+		G_UNION_LOGREQ(bp, "Cmd0 request received.");
+		atomic_add_long(&sc->sc_cmd0s, 1);
+		break;
+	case BIO_CMD1:
+		G_UNION_LOGREQ(bp, "Cmd1 request received.");
+		atomic_add_long(&sc->sc_cmd1s, 1);
+		break;
+	case BIO_CMD2:
+		G_UNION_LOGREQ(bp, "Cmd2 request received.");
+		atomic_add_long(&sc->sc_cmd2s, 1);
+		break;
+	}
+	/*
+	 * All commands other than read are passed through to the
+	 * upper-level device since it is writable and thus able to
+	 * respond to delete, flush, and speedup requests.
+	 */
+	g_io_request(cbp, sc->sc_uppercp);
+}
+
+/*
+ * Used only when completing a BIO_READ operation that was split into
+ * several requests.  Transfers an error recorded in the shared tracking
+ * structure to this request, releases the structure when this is the
+ * last outstanding request, and completes the request.
+ */
+static void
+g_union_done(struct bio *bp)
+{
+	struct g_union_iotrack *iotrackp;
+
+	iotrackp = bp->bio_caller1;
+	if (iotrackp->io_error != 0 && bp->bio_error == 0)
+		bp->bio_error = iotrackp->io_error;
+	iotrackp->io_error = 0;
+	/*
+	 * atomic_fetchadd_long() returns the value held before the
+	 * addition, so the last request to complete sees 1.
+	 */
+	if (atomic_fetchadd_long(&iotrackp->io_numios, -1) == 1)
+		g_free(iotrackp);
+	g_std_done(bp);
+}
+
+/*
+ * Record blocks that have been written in the map.
+ *
+ * With the write lock held, sets the bit of every sector covered by bp
+ * in its leaf and marks that leaf as used.
+ */
+static void
+g_union_setmap(struct bio *bp, struct g_union_softc *sc)
+{
+	size_t root_idx;
+	uint64_t **leaf;
+	uint64_t *wordp;
+	off_t start, numsec;
+
+	KASSERT(bp->bio_offset % sc->sc_sectorsize == 0,
+	    ("g_union_setmap: offset not on sector boundry"));
+	KASSERT(bp->bio_length % sc->sc_sectorsize == 0,
+	    ("g_union_setmap: length not a multiple of sectors"));
+	start = bp->bio_offset / sc->sc_sectorsize;
+	numsec = bp->bio_length / sc->sc_sectorsize;
+	/* A request may end exactly at the end of the map. */
+	KASSERT(start + numsec <= sc->sc_map_size,
+	    ("g_union_setmap: block %jd is out of range", start + numsec));
+	G_WLOCK(sc);
+	for ( ; numsec > 0; numsec--, start++) {
+		root_idx = start / sc->sc_bits_per_leaf;
+		leaf = &sc->sc_writemap_root[root_idx];
+		wordp = &(*leaf)
+		    [(start % sc->sc_bits_per_leaf) / BITS_PER_ENTRY];
+		*wordp |= 1ULL << (start % BITS_PER_ENTRY);
+		/* Same 64-bit mask that g_union_getmap() tests. */
+		sc->sc_leafused[root_idx / BITS_PER_ENTRY] |=
+		    1ULL << (root_idx % BITS_PER_ENTRY);
+	}
+	G_WUNLOCK(sc);
+}
+
+/*
+ * Check map to determine whether blocks have been written.
+ *
+ * Return true if they have been written so should be read from the top
+ * layer. Return false if they have not been written so should be read
+ * from the bottom layer. Return in len2read the bytes to be read. See
+ * the comment above the BIO_READ implementation in g_union_start() for
+ * an explanation of why len2read may be shorter than the buffer length.
+ *
+ * The state of the first sector decides the answer; the scan then
+ * extends over the following sectors for as long as they are in the
+ * same state.  A request covering no sectors reports "not written"
+ * with a len2read of zero.
+ */
+static bool
+g_union_getmap(struct bio *bp, struct g_union_softc *sc, off_t *len2read)
+{
+	off_t start, numsec, leafresid, bitloc;
+	bool first, maptype, retval;
+	uint64_t *leaf, word;
+	size_t root_idx;
+
+	KASSERT(bp->bio_offset % sc->sc_sectorsize == 0,
+	    ("g_union_getmap: offset not on sector boundry"));
+	KASSERT(bp->bio_length % sc->sc_sectorsize == 0,
+	    ("g_union_getmap: length not a multiple of sectors"));
+	start = bp->bio_offset / sc->sc_sectorsize;
+	numsec = bp->bio_length / sc->sc_sectorsize;
+	G_UNION_DEBUG(3, "g_union_getmap: check %jd sectors starting at %jd\n",
+	    numsec, start);
+	KASSERT(start + numsec <= sc->sc_map_size,
+	    ("g_union_getmap: block %jd is out of range", start + numsec));
+	/*
+	 * Give maptype a defined value: when numsec is zero the loop below
+	 * never runs, yet maptype is logged and returned.
+	 */
+	maptype = false;
+	first = true;
+	while (numsec > 0) {
+		/* Check first if the leaf records any written sectors */
+		root_idx = start / sc->sc_bits_per_leaf;
+		leafresid = sc->sc_bits_per_leaf -
+		    (start % sc->sc_bits_per_leaf);
+		if (((sc->sc_leafused[root_idx / BITS_PER_ENTRY]) &
+		    (1ULL << (root_idx % BITS_PER_ENTRY))) == 0) {
+			/* Nothing written in this leaf. */
+			if (first) {
+				maptype = false;
+				first = false;
+			}
+			if (maptype)
+				break;
+			numsec -= leafresid;
+			start += leafresid;
+			continue;
+		}
+		/* Check up to a word boundary, then check word by word */
+		leaf = sc->sc_writemap_root[root_idx];
+		word = leaf[(start % sc->sc_bits_per_leaf) / BITS_PER_ENTRY];
+		bitloc = start % BITS_PER_ENTRY;
+		if (bitloc == 0 && (word == 0 || word == ~0)) {
+			/* A whole word that is uniformly clear or set. */
+			if (first) {
+				if (word == 0)
+					maptype = false;
+				else
+					maptype = true;
+				first = false;
+			}
+			if ((word == 0 && maptype) ||
+			    (word == ~0 && !maptype))
+				break;
+			numsec -= BITS_PER_ENTRY;
+			start += BITS_PER_ENTRY;
+			continue;
+		}
+		for ( ; bitloc < BITS_PER_ENTRY; bitloc ++) {
+			retval = (word & (1ULL << bitloc)) != 0;
+			if (first) {
+				maptype = retval;
+				first = false;
+			}
+			if (maptype == retval) {
+				numsec--;
+				start++;
+				continue;
+			}
+			goto out;
+		}
+	}
+out:
+	/*
+	 * The scan advances in whole leaves, words and bits and so may run
+	 * past the end of the request; never report more than was asked.
+	 */
+	if (numsec < 0)
+		numsec = 0;
+	*len2read = bp->bio_length - (numsec * sc->sc_sectorsize);
+	G_UNION_DEBUG(3, "g_union_getmap: return maptype %swritten for %jd "
+	    "sectors ending at %jd\n", maptype ? "" : "NOT ",
+	    *len2read / sc->sc_sectorsize, start - 1);
+	return (maptype);
+}
+
+/*
+ * Fill in details for a BIO_GETATTR request.
+ *
+ * Handles the GEOM::kerneldump attribute: fills in the dumper
+ * description carried in the request, trims the requested length to
+ * the size of the union, and completes the request.  An offset beyond
+ * the union fails the request with ENODEV.
+ */
+static void
+g_union_kerneldump(struct bio *bp, struct g_union_softc *sc)
+{
+	struct g_kerneldump *gkd;
+	struct g_geom *gp;
+	struct g_provider *pp;
+
+	gkd = (struct g_kerneldump *)bp->bio_data;
+	gp = bp->bio_to->geom;
+	g_trace(G_T_TOPOLOGY, "%s(%s, %jd, %jd)", __func__, gp->name,
+	    (intmax_t)gkd->offset, (intmax_t)gkd->length);
+
+	pp = LIST_FIRST(&gp->provider);
+
+	gkd->di.dumper = g_union_dumper;
+	gkd->di.priv = sc;
+	gkd->di.blocksize = pp->sectorsize;
+	gkd->di.maxiosize = DFLTPHYS;
+	gkd->di.mediaoffset = sc->sc_offset + gkd->offset;
+	if (gkd->offset > sc->sc_size) {
+		g_io_deliver(bp, ENODEV);
+		return;
+	}
+	if (gkd->offset + gkd->length > sc->sc_size)
+		gkd->length = sc->sc_size - gkd->offset;
+	gkd->di.mediasize = gkd->length;
+	g_io_deliver(bp, 0);
+}
+
+/*
+ * Handler for g_union_kerneldump().
+ *
+ * Ignores its arguments and reports success.
+ */
+static int
+g_union_dumper(void *priv, void *virtual, vm_offset_t physical, off_t offset,
+    size_t length)
+{
+
+	return (0);
+}
+
+/*
+ * List union statistics.
+ *
+ * Only the geom itself is described: nothing is emitted when called
+ * for a provider or a consumer, or when the geom has no softc.
+ *
+ * NOTE(review): the format strings below carry no element names around
+ * the values, and sc_wrotebytes is printed twice; confirm both against
+ * the intended output.
+ */
+static void
+g_union_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
+    struct g_consumer *cp, struct g_provider *pp)
+{
+	struct g_union_softc *sc;
+
+	if (pp != NULL || cp != NULL)
+		return;
+	sc = gp->softc;
+	if (sc == NULL)
+		return;
+
+	/* Request counters. */
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_reads);
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_writes);
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_deletes);
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_getattrs);
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_flushes);
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_speedups);
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cmd0s);
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cmd1s);
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_cmd2s);
+
+	/* Byte counters and the offset into the underlying media. */
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_readbytes);
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_wrotebytes);
+	sbuf_printf(sb, "%s%ju\n", indent, sc->sc_wrotebytes);
+	sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)sc->sc_offset);
+}
+
+/*
+ * A consumer of the union has been orphaned: forcibly and quietly
+ * destroy the geom it belongs to.
+ */
+static void
+g_union_orphan(struct g_consumer *cp)
+{
+
+	g_topology_assert();
+	g_union_destroy(cp->geom, 1, false);
+}
+
+/*
+ * Destroy a union geom on behalf of the class: a quiet, non-forced
+ * g_union_destroy() whose result is passed back to the caller.
+ */
+static int
+g_union_destroy_geom(struct gctl_req *req, struct g_class *mp,
+    struct g_geom *gp)
+{
+
+	return (g_union_destroy(gp, 0, false));
+}
+
+/*
+ * Clean up a union device.
+ *
+ * Refuses with EBUSY when a commit is in progress or the provider is
+ * open, unless force is set.  Otherwise drops the access counts held on
+ * the lower and upper consumers and withers the geom.  Returns ENXIO
+ * when the geom has no softc and 0 on success.
+ */
+static int
+g_union_destroy(struct g_geom *gp, bool force, bool verbose)
+{
+	struct g_union_softc *sc;
+	struct g_provider *pp;
+	const char *name;
+	int acr, acw, ace, error;
+
+	g_topology_assert();
+	sc = gp->softc;
+	if (sc == NULL)
+		return (ENXIO);
+	pp = LIST_FIRST(&gp->provider);
+	/*
+	 * The geom may have no provider, and DOING_COMMIT alone is enough
+	 * to take the busy path below, so collect what the messages need
+	 * up front instead of dereferencing pp there.
+	 */
+	if (pp != NULL) {
+		name = pp->name;
+		acr = pp->acr;
+		acw = pp->acw;
+		ace = pp->ace;
+	} else {
+		name = gp->name;
+		acr = acw = ace = 0;
+	}
+	if ((sc->sc_flags & DOING_COMMIT) != 0 ||
+	    acr != 0 || acw != 0 || ace != 0) {
+		if (force) {
+			if (verbose)
+				printf("Device %s is still in use, so "
+				    "is being forcibly removed.\n", name);
+			G_UNION_DEBUG(1, "Device %s is still in use, so "
+			    "is being forcibly removed.", name);
+		} else {
+			if (verbose)
+				printf("Device %s is still open "
+				    "(r=%d w=%d e=%d).\n", name, acr,
+				    acw, ace);
+			G_UNION_DEBUG(1, "Device %s is still open "
+			    "(r=%d w=%d e=%d).", name, acr,
+			    acw, ace);
+			return (EBUSY);
+		}
+	} else {
+		if (verbose)
+			printf("Device %s removed.\n", gp->name);
+		G_UNION_DEBUG(1, "Device %s removed.", gp->name);
+	}
+	/* Close consumers */
+	if ((error = g_access(sc->sc_lowercp, -1, 0, -1)) != 0)
+		G_UNION_DEBUG(2, "Error %d: device %s could not reset access "
+		    "to %s.", error, name, sc->sc_lowercp->provider->name);
+	if ((error = g_access(sc->sc_uppercp, -1, -1, -1)) != 0)
+		G_UNION_DEBUG(2, "Error %d: device %s could not reset access "
+		    "to %s.", error, name, sc->sc_uppercp->provider->name);
+
+	g_wither_geom(gp, ENXIO);
+
+	return (0);
+}
+
+/*
+ * Clean up a union provider.
+ *
+ * Detaches the softc from the geom and releases everything it owns:
+ * the leaves and root of the write map, the leaf-in-use bitmap, the
+ * lock and the softc itself.
+ */
+static void
+g_union_providergone(struct g_provider *pp)
+{
+	struct g_geom *gp;
+	struct g_union_softc *sc;
+	size_t i;
+
+	gp = pp->geom;
+	sc = gp->softc;
+	gp->softc = NULL;
+	for (i = 0; i < sc->sc_root_size; i++)
+		g_free(sc->sc_writemap_root[i]);
+	g_free(sc->sc_writemap_root);
+	/* Allocated alongside the write map, so released with it. */
+	g_free(sc->sc_leafused);
+	rw_destroy(&sc->sc_rwlock);
+	g_free(sc);
+}
+
+/*
+ * Respond to a resized provider.
+ *
+ * cp is the consumer attached to the provider whose size changed.
+ */
+static void
+g_union_resize(struct g_consumer *cp)
+{
+	struct g_union_softc *sc;
+	struct g_geom *gp;
+
+	g_topology_assert();
+
+	gp = cp->geom;
+	sc = gp->softc;
+
+	/*
+	 * A provider that still has more room than we use needs no
+	 * action: we simply keep using the space we already had.
+	 * In every other case the union is forcibly destroyed.
+	 */
+	if (sc->sc_size >= cp->provider->mediasize - sc->sc_offset)
+		g_union_destroy(gp, 1, false);
+}
+
+DECLARE_GEOM_CLASS(g_union_class, g_union);
+MODULE_VERSION(geom_union, 0);
diff --git a/sys/modules/geom/Makefile b/sys/modules/geom/Makefile
--- a/sys/modules/geom/Makefile
+++ b/sys/modules/geom/Makefile
@@ -21,6 +21,7 @@
 	geom_raid3 \
 	geom_shsec \
 	geom_stripe \
+	geom_union \
 	geom_uzip \
 	geom_vinum \
 	geom_virstor \
diff --git a/sys/modules/geom/geom_union/Makefile b/sys/modules/geom/geom_union/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/geom/geom_union/Makefile
@@ -0,0 +1,8 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/geom/union
+
+KMOD=	geom_union
+SRCS=	g_union.c
+
+.include