diff --git a/sbin/mdconfig/mdconfig.8 b/sbin/mdconfig/mdconfig.8 index 6d9da8159723..775823fd0bc7 100644 --- a/sbin/mdconfig/mdconfig.8 +++ b/sbin/mdconfig/mdconfig.8 @@ -1,359 +1,390 @@ .\" Copyright (c) 1993 University of Utah. .\" Copyright (c) 1980, 1989, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" Copyright (c) 2000 .\" Poul-Henning Kamp All rights reserved. .\" .\" This code is derived from software contributed to Berkeley by .\" the Systems Programming Group of the University of Utah Computer .\" Science Department. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)vnconfig.8 8.1 (Berkeley) 6/5/93 .\" from: src/usr.sbin/vnconfig/vnconfig.8,v 1.19 2000/12/27 15:30:29 .\" .\" $FreeBSD$ .\" -.Dd November 6, 2020 +.Dd August 27, 2021 .Dt MDCONFIG 8 .Os .Sh NAME .Nm mdconfig .Nd create and control memory disks .Sh SYNOPSIS .Nm .Fl a .Fl t Ar type .Op Fl n .Oo Fl o Oo Cm no Oc Ns Ar option Oc ... .Op Fl f Ar file .Op Fl s Ar size .Op Fl S Ar sectorsize .Op Fl u Ar unit .Op Fl x Ar sectors/track .Op Fl y Ar heads/cylinder .Op Fl L Ar label .Nm .Fl d .Fl u Ar unit .Op Fl o Oo Cm no Oc Ns Ar force .Nm .Fl r .Fl u Ar unit .Fl s Ar size .Op Fl o Oo Cm no Oc Ns Ar force .Nm .Fl l .Op Fl n .Op Fl v .Op Fl f Ar file .Op Fl u Ar unit .Nm .Ar file .Sh DESCRIPTION The .Nm utility creates and controls .Xr md 4 devices. .Pp Options indicate an action to be performed: .Bl -tag -width indent .It Fl a Attach a memory disk. This will configure a memory disk with the specified parameters and attach it to the system. If the .Fl u Ar unit option is not provided, the newly created device name will be printed on stdout. .It Fl d Detach a memory disk from the system and release all resources. .It Fl r Resize a memory disk. .It Fl t Ar type Select the type of the memory disk. .Bl -tag -width "malloc" .It Cm malloc Storage for this type of memory disk is allocated with .Xr malloc 9 . This limits the size to the malloc bucket limit in the kernel.
If the .Fl o Cm reserve option is not set, creating and filling a large malloc-backed memory disk is a very easy way to panic the system. .It Cm vnode A file specified with .Fl f Ar file becomes the backing store for this memory disk. .It Cm swap Storage for this type of memory disk is allocated from buffer memory. Pages get pushed out to swap when the system is under memory pressure; otherwise they stay in memory. Using .Cm swap backing is generally preferred over .Cm malloc backing. .It Cm null Bitsink; all writes do nothing, all reads return zeroes. .El .It Fl f Ar file Filename to use for the vnode type memory disk. The .Fl a and .Fl t Cm vnode options are implied if not specified. .It Fl l List configured devices. If given with .Fl u , display details about that particular device. If given with .Fl f Ar file , display .Xr md 4 device names of which .Ar file is used as the backing store. If both the .Fl u and .Fl f options are specified, display devices which match both conditions. If the .Fl v option is specified, show all details. .It Fl n When printing .Xr md 4 device names, print only the unit number without the .Xr md 4 prefix. .It Fl s Ar size Size of the memory disk. .Ar Size is the number of 512 byte sectors unless suffixed with a .Cm b , k , m , g , t , or .Cm p which denotes byte, kilobyte, megabyte, gigabyte, terabyte and petabyte respectively. When used without the .Fl r option, the .Fl a and .Fl t Cm swap options are implied if not specified. .It Fl S Ar sectorsize Sectorsize to use for the memory disk, in bytes. .It Fl x Ar sectors/track See the description of the .Fl y option below. .It Fl y Ar heads/cylinder For .Cm malloc or .Cm vnode backed devices, the .Fl x and .Fl y options can be used to specify a synthetic geometry. This is useful for constructing bootable images for later download to other devices. .It Fl L Ar label Associate a label (arbitrary string) with the new memory disk. The label can then be inspected with .Bd -literal -offset indent .Nm Fl l v .Ed .It Fl o Oo Cm no Oc Ns Ar option Set or reset options. .Bl -tag -width indent .It Oo Cm no Oc Ns Cm async For .Cm vnode backed devices: avoid .Dv IO_SYNC for increased performance but at the risk of deadlocking the entire kernel. .It Oo Cm no Oc Ns Cm cache For .Cm vnode backed devices: enable/disable caching of data in system caches. The default is to not cache. .Pp Accesses via the device are converted to accesses via the vnode. The caching policy for the vnode is used initially. This is normally to cache. This caching policy is retained if the .Cm cache option is used. Otherwise, caching is limited by releasing data from caches soon after each access. The release has the same semantics as the .Dv POSIX_FADV_DONTNEED feature of .Xr posix_fadvise 2 . The result is that with normal (non-zfs) caching, buffers are released from the buffer cache soon after they are constructed, but their data is kept in the page cache at lower priority. .Pp The .Cm cache option tends to waste memory by giving unwanted double caching, but it saves time if there is memory to spare. .It Oo Cm no Oc Ns Cm reserve Allocate and reserve all needed storage from the start, rather than as needed. .It Oo Cm no Oc Ns Cm cluster Enable clustering on this disk. .It Oo Cm no Oc Ns Cm compress Enable/disable compression features to reduce memory usage. .It Oo Cm no Oc Ns Cm force Disable/enable extra sanity checks to prevent the user from doing something that might adversely affect the system.
This can be used with the .Fl d flag to forcibly destroy an .Xr md 4 disk that is still in use. +.It Oo Cm no Oc Ns Cm mustdealloc +For +.Cm vnode +backed devices: detect whether hole-punching is supported by the underlying +file system. +If it is, then to handle a +.Dv BIO_DELETE +request, some or all of the request's operation range may be turned into a hole +in the file used for backing store, and any parts which are not turned into +holes are zero-filled in the file. +If the file system does not support hole-punching, +.Dv BIO_DELETE +requests to the device are not handled and will fail with +.Er EOPNOTSUPP . +.Pp +When +.Cm mustdealloc +is not specified or +.Cm nomustdealloc +is specified, a +.Dv BIO_DELETE +request is handled the same way when the file system supports hole-punching, +but when it does not, the request's operation range is simply zero-filled in +the file. .It Oo Cm no Oc Ns Cm readonly Enable/disable readonly mode. .It Oo Cm no Oc Ns Cm verify For .Cm vnode backed devices: enable/disable requesting verification of the file used for backing store. The type of verification depends on which security features are available. One example of verification is testing file integrity with checksums or cryptographic signatures. .El .It Fl u Ar unit Request a specific unit number or device name for the .Xr md 4 device instead of automatic allocation. If a device name is specified, it must start with .Dq md followed by the unit number. .El .Pp The last form, .Nm .Ar file , is provided for convenience as an abbreviation of .Nm .Fl a .Fl t Cm vnode .Fl f Ar file . .Sh EXAMPLES Create a disk with .Pa /tmp/boot.flp as backing storage. The name of the allocated unit will be printed on stdout, such as .Dq Li md0 : .Bd -literal -offset indent mdconfig /tmp/boot.flp .Ed .Pp Create a 1 gigabyte swap backed memory disk named .Dq Li md3 : .Bd -literal -offset indent mdconfig -s 1g -u md3 .Ed .Pp Detach and free all resources used by .Pa /dev/md3 : .Bd -literal -offset indent mdconfig -du md3 .Ed .Pp Show detailed information on current memory disks: .Bd -literal -offset indent mdconfig -lv .Ed .Pp Resize the .Dq Li md3 memory disk to 2 gigabytes: .Bd -literal -offset indent mdconfig -rs 2g -u md3 .Ed .Pp Create a 1 gigabyte swap backed disk, initialize an .Xr ffs 7 file system on it, and mount it on .Pa /tmp : .Bd -literal -offset indent mdconfig -s 1g -u md10 newfs -U /dev/md10 mount /dev/md10 /tmp chmod 1777 /tmp .Ed .Pp Create a memory disk out of an ISO 9660 CD image file, using the first available .Xr md 4 device, and then mount it: .Bd -literal -offset indent mount -t cd9660 /dev/`mdconfig -f cdimage.iso` /mnt .Ed
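.Pp
The
.Cm mustdealloc
option described above relies on the hole-punching interfaces that
.Xr fpathconf 2
and
.Xr fspacectl 2
expose to userspace.
The following is a minimal illustrative sketch of the same detection and
deallocation steps performed directly on a backing file; the helper names
are hypothetical, the 64 kB range is arbitrary, and error handling is
omitted:
.Bd -literal -offset indent
#include <fcntl.h>
#include <unistd.h>

/* Non-zero when the file system backing fd can punch holes. */
static int
can_dealloc(int fd)
{
	return (fpathconf(fd, _PC_DEALLOC_PRESENT) > 0);
}

/* Deallocate (hole-punch or zero) the first 64 kB of the file. */
static int
dealloc_start(int fd)
{
	struct spacectl_range range = { .r_offset = 0, .r_len = 65536 };

	return (fspacectl(fd, SPACECTL_DEALLOC, &range, 0, NULL));
}
.Ed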
.Pp Create a file-backed device from a hard disk image that begins with 512K of raw header information. .Xr gnop 8 is used to skip over the header information, positioning .Pa md1.nop to the start of the file system in the image. .Bd -literal -offset indent mdconfig -u md1 -f diskimage.img gnop create -o 512K md1 mount /dev/md1.nop /mnt .Ed .Sh SEE ALSO +.Xr fpathconf 2 , +.Xr fspacectl 2 , .Xr open 2 , .Xr md 4 , .Xr ffs 7 , .Xr gpart 8 , .Xr mdmfs 8 , -.Xr malloc 9 +.Xr malloc 9 , +.Xr vn_deallocate 9 .Sh HISTORY The .Nm utility first appeared in .Fx 5.0 as a cleaner replacement for the vn kernel module and the vnconfig utility combo. .Sh AUTHORS The .Nm utility was written by .An Poul-Henning Kamp Aq Mt phk@FreeBSD.org . diff --git a/sbin/mdconfig/mdconfig.c b/sbin/mdconfig/mdconfig.c index 0f76ca6149f1..9f4f3fc3a392 100644 --- a/sbin/mdconfig/mdconfig.c +++ b/sbin/mdconfig/mdconfig.c @@ -1,597 +1,602 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2004 Poul-Henning Kamp * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Edward Tomasz Napierala * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct md_ioctl mdio; static enum {UNSET, ATTACH, DETACH, RESIZE, LIST} action = UNSET; static int nflag; static void usage(void); static void md_set_file(const char *); static int md_find(const char *, const char *); static int md_query(const char *, const int, const char *); static int md_list(const char *, int, const char *); static char *geom_config_get(struct gconf *g, const char *name); static void md_prthumanval(char *length); #define OPT_VERBOSE 0x01 #define OPT_UNIT 0x02 #define OPT_DONE 0x04 #define OPT_LIST 0x10 #define CLASS_NAME_MD "MD" static void usage(void) { fprintf(stderr, "usage: mdconfig -a -t type [-n] [-o [no]option] ...
[-f file]\n" " [-s size] [-S sectorsize] [-u unit] [-L label]\n" " [-x sectors/track] [-y heads/cylinder]\n" " mdconfig -d -u unit [-o [no]force]\n" " mdconfig -r -u unit -s size [-o [no]force]\n" " mdconfig -l [-v] [-n] [-f file] [-u unit]\n" " mdconfig file\n"); fprintf(stderr, "\t\ttype = {malloc, vnode, swap}\n"); fprintf(stderr, "\t\toption = {cache, cluster, compress, force,\n"); - fprintf(stderr, "\t\t readonly, reserve, ro, verify}\n"); + fprintf(stderr, "\t\t mustdealloc, readonly, reserve, ro,\n"); + fprintf(stderr, "\t\t verify}\n"); fprintf(stderr, "\t\tsize = %%d (512 byte blocks), %%db (B),\n"); fprintf(stderr, "\t\t %%dk (kB), %%dm (MB), %%dg (GB), \n"); fprintf(stderr, "\t\t %%dt (TB), or %%dp (PB)\n"); exit(1); } int main(int argc, char **argv) { int ch, fd, i, vflag; char *p; char *fflag = NULL, *sflag = NULL, *tflag = NULL, *uflag = NULL; bzero(&mdio, sizeof(mdio)); mdio.md_file = malloc(PATH_MAX); mdio.md_label = malloc(PATH_MAX); if (mdio.md_file == NULL || mdio.md_label == NULL) err(1, "could not allocate memory"); vflag = 0; bzero(mdio.md_file, PATH_MAX); bzero(mdio.md_label, PATH_MAX); if (argc == 1) usage(); while ((ch = getopt(argc, argv, "ab:df:lno:rs:S:t:u:vx:y:L:")) != -1) { switch (ch) { case 'a': if (action != UNSET && action != ATTACH) errx(1, "-a is mutually exclusive " "with -d, -r, and -l"); action = ATTACH; break; case 'd': if (action != UNSET && action != DETACH) errx(1, "-d is mutually exclusive " "with -a, -r, and -l"); action = DETACH; mdio.md_options |= MD_AUTOUNIT; break; case 'r': if (action != UNSET && action != RESIZE) errx(1, "-r is mutually exclusive " "with -a, -d, and -l"); action = RESIZE; mdio.md_options |= MD_AUTOUNIT; break; case 'l': if (action != UNSET && action != LIST) errx(1, "-l is mutually exclusive " "with -a, -r, and -d"); action = LIST; mdio.md_options |= MD_AUTOUNIT; break; case 'n': nflag = 1; break; case 't': if (tflag != NULL) errx(1, "-t can be passed only once"); tflag = optarg; if (!strcmp(optarg, "malloc")) { mdio.md_type = MD_MALLOC; mdio.md_options |= MD_AUTOUNIT | MD_COMPRESS; } else if (!strcmp(optarg, "vnode")) { mdio.md_type = MD_VNODE; mdio.md_options |= MD_CLUSTER | MD_AUTOUNIT | MD_COMPRESS; } else if (!strcmp(optarg, "swap")) { mdio.md_type = MD_SWAP; mdio.md_options |= MD_CLUSTER | MD_AUTOUNIT | MD_COMPRESS; } else if (!strcmp(optarg, "null")) { mdio.md_type = MD_NULL; mdio.md_options |= MD_CLUSTER | MD_AUTOUNIT | MD_COMPRESS; } else errx(1, "unknown type: %s", optarg); break; case 'f': if (fflag != NULL) errx(1, "-f can be passed only once"); fflag = realpath(optarg, NULL); if (fflag == NULL) err(1, "realpath"); break; case 'o': if (!strcmp(optarg, "async")) mdio.md_options |= MD_ASYNC; else if (!strcmp(optarg, "noasync")) mdio.md_options &= ~MD_ASYNC; else if (!strcmp(optarg, "cache")) mdio.md_options |= MD_CACHE; else if (!strcmp(optarg, "nocache")) mdio.md_options &= ~MD_CACHE; else if (!strcmp(optarg, "cluster")) mdio.md_options |= MD_CLUSTER; else if (!strcmp(optarg, "nocluster")) mdio.md_options &= ~MD_CLUSTER; else if (!strcmp(optarg, "compress")) mdio.md_options |= MD_COMPRESS; else if (!strcmp(optarg, "nocompress")) mdio.md_options &= ~MD_COMPRESS; else if (!strcmp(optarg, "force")) mdio.md_options |= MD_FORCE; else if (!strcmp(optarg, "noforce")) mdio.md_options &= ~MD_FORCE; + else if (!strcmp(optarg, "mustdealloc")) + mdio.md_options |= MD_MUSTDEALLOC; + else if (!strcmp(optarg, "nomustdealloc")) + mdio.md_options &= ~MD_MUSTDEALLOC; else if (!strcmp(optarg, "readonly")) mdio.md_options |= 
MD_READONLY; else if (!strcmp(optarg, "noreadonly")) mdio.md_options &= ~MD_READONLY; else if (!strcmp(optarg, "ro")) mdio.md_options |= MD_READONLY; else if (!strcmp(optarg, "noro")) mdio.md_options &= ~MD_READONLY; else if (!strcmp(optarg, "reserve")) mdio.md_options |= MD_RESERVE; else if (!strcmp(optarg, "noreserve")) mdio.md_options &= ~MD_RESERVE; else if (!strcmp(optarg, "verify")) mdio.md_options |= MD_VERIFY; else if (!strcmp(optarg, "noverify")) mdio.md_options &= ~MD_VERIFY; else errx(1, "unknown option: %s", optarg); break; case 'S': mdio.md_sectorsize = strtoul(optarg, &p, 0); break; case 's': if (sflag != NULL) errx(1, "-s can be passed only once"); sflag = optarg; mdio.md_mediasize = (off_t)strtoumax(optarg, &p, 0); if (p == NULL || *p == '\0') mdio.md_mediasize *= DEV_BSIZE; else if (*p == 'b' || *p == 'B') ; /* do nothing */ else if (*p == 'k' || *p == 'K') mdio.md_mediasize <<= 10; else if (*p == 'm' || *p == 'M') mdio.md_mediasize <<= 20; else if (*p == 'g' || *p == 'G') mdio.md_mediasize <<= 30; else if (*p == 't' || *p == 'T') { mdio.md_mediasize <<= 30; mdio.md_mediasize <<= 10; } else if (*p == 'p' || *p == 'P') { mdio.md_mediasize <<= 30; mdio.md_mediasize <<= 20; } else errx(1, "unknown suffix on -s argument"); break; case 'u': if (!strncmp(optarg, _PATH_DEV, sizeof(_PATH_DEV) - 1)) optarg += sizeof(_PATH_DEV) - 1; if (!strncmp(optarg, MD_NAME, sizeof(MD_NAME) - 1)) optarg += sizeof(MD_NAME) - 1; uflag = optarg; break; case 'v': vflag = OPT_VERBOSE; break; case 'x': mdio.md_fwsectors = strtoul(optarg, &p, 0); break; case 'y': mdio.md_fwheads = strtoul(optarg, &p, 0); break; case 'L': strlcpy(mdio.md_label, optarg, PATH_MAX); break; default: usage(); } } argc -= optind; argv += optind; if (action == UNSET) action = ATTACH; if (action == ATTACH) { if (tflag == NULL) { /* * Try to infer the type based on other arguments. 
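 * A file name (given with -f or as a trailing argument) implies -t vnode, while a bare -s size implies -t swap.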
*/ if (fflag != NULL || argc > 0) { /* Imply ``-t vnode'' */ mdio.md_type = MD_VNODE; mdio.md_options |= MD_CLUSTER | MD_AUTOUNIT | MD_COMPRESS; } else if (sflag != NULL) { /* Imply ``-t swap'' */ mdio.md_type = MD_SWAP; mdio.md_options |= MD_CLUSTER | MD_AUTOUNIT | MD_COMPRESS; } else errx(1, "unable to determine type"); } if ((fflag != NULL || argc > 0) && mdio.md_type != MD_VNODE) errx(1, "only -t vnode can be used with file name"); if (mdio.md_type == MD_VNODE) { if (fflag != NULL) { if (argc != 0) usage(); md_set_file(fflag); } else { if (argc != 1) usage(); md_set_file(*argv); } if ((mdio.md_options & MD_READONLY) == 0 && access(mdio.md_file, W_OK) < 0 && (errno == EACCES || errno == EPERM || errno == EROFS)) { warnx("WARNING: opening backing store: %s " "readonly", mdio.md_file); mdio.md_options |= MD_READONLY; } } if ((mdio.md_type == MD_MALLOC || mdio.md_type == MD_SWAP || mdio.md_type == MD_NULL) && sflag == NULL) errx(1, "must specify -s for -t malloc, -t swap, " "or -t null"); if (mdio.md_type == MD_VNODE && mdio.md_file[0] == '\0') errx(1, "must specify -f for -t vnode"); } else { if (mdio.md_sectorsize != 0) errx(1, "-S can only be used with -a"); if (action != RESIZE && sflag != NULL) errx(1, "-s can only be used with -a and -r"); if (mdio.md_fwsectors != 0) errx(1, "-x can only be used with -a"); if (mdio.md_fwheads != 0) errx(1, "-y can only be used with -a"); if (fflag != NULL && action != LIST) errx(1, "-f can only be used with -a and -l"); if (tflag != NULL) errx(1, "-t can only be used with -a"); if (argc > 0) errx(1, "file can only be used with -a"); if ((action != DETACH && action != RESIZE) && (mdio.md_options & ~MD_AUTOUNIT) != 0) errx(1, "-o can only be used with -a, -d, and -r"); if (action == DETACH && (mdio.md_options & ~(MD_FORCE | MD_AUTOUNIT)) != 0) errx(1, "only -o [no]force can be used with -d"); if (action == RESIZE && (mdio.md_options & ~(MD_FORCE | MD_RESERVE | MD_AUTOUNIT)) != 0) errx(1, "only -o [no]force and -o [no]reserve can be used with -r"); } if (action == RESIZE && sflag == NULL) errx(1, "must specify -s for -r"); if (action != LIST && vflag == OPT_VERBOSE) errx(1, "-v can only be used with -l"); if (uflag != NULL) { mdio.md_unit = strtoul(uflag, &p, 0); if (mdio.md_unit == (unsigned)ULONG_MAX || *p != '\0') errx(1, "bad unit: %s", uflag); mdio.md_options &= ~MD_AUTOUNIT; } mdio.md_version = MDIOVERSION; if (!kld_isloaded("g_md") && kld_load("geom_md") == -1) err(1, "failed to load geom_md module"); fd = open(_PATH_DEV MDCTL_NAME, O_RDWR, 0); if (fd < 0) err(1, "open(%s%s)", _PATH_DEV, MDCTL_NAME); if (action == ATTACH) { i = ioctl(fd, MDIOCATTACH, &mdio); if (i < 0) err(1, "ioctl(%s%s)", _PATH_DEV, MDCTL_NAME); if (mdio.md_options & MD_AUTOUNIT) printf("%s%d\n", nflag ? "" : MD_NAME, mdio.md_unit); } else if (action == DETACH) { if (mdio.md_options & MD_AUTOUNIT) errx(1, "-d requires -u"); i = ioctl(fd, MDIOCDETACH, &mdio); if (i < 0) err(1, "ioctl(%s%s)", _PATH_DEV, MDCTL_NAME); } else if (action == RESIZE) { if (mdio.md_options & MD_AUTOUNIT) errx(1, "-r requires -u"); i = ioctl(fd, MDIOCRESIZE, &mdio); if (i < 0) err(1, "ioctl(%s%s)", _PATH_DEV, MDCTL_NAME); } else if (action == LIST) { if (mdio.md_options & MD_AUTOUNIT) { /* * Listing all devices. This is why we pass NULL * together with OPT_LIST. 
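 * With a NULL unit list, md_list() walks the GEOM statistics snapshot and reports every provider of the MD class.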
*/ return (md_list(NULL, OPT_LIST | vflag, fflag)); } else return (md_query(uflag, vflag, fflag)); } else usage(); close(fd); return (0); } static void md_set_file(const char *fn) { struct stat sb; int fd; if (realpath(fn, mdio.md_file) == NULL) err(1, "could not find full path for %s", fn); fd = open(mdio.md_file, O_RDONLY); if (fd < 0) err(1, "could not open %s", fn); if (fstat(fd, &sb) == -1) err(1, "could not stat %s", fn); if (!S_ISREG(sb.st_mode)) errx(1, "%s is not a regular file", fn); if (mdio.md_mediasize == 0) mdio.md_mediasize = sb.st_size; close(fd); } /* * Lists md(4) disks. It is also used as a query routine, since it handles the * XML interface. 'units' can be NULL for listing memory disks. It might be * a comma-separated string containing md(4) disk names. 'opt' distinguishes * between list and query mode. */ static int md_list(const char *units, int opt, const char *fflag) { struct gmesh gm; struct gprovider *pp; struct gconf *gc; struct gident *gid; struct devstat *gsp; struct ggeom *gg; struct gclass *gcl; void *sq; int retcode, ffound, ufound; char *length; const char *type, *file, *label; type = file = length = NULL; retcode = geom_gettree(&gm); if (retcode != 0) return (-1); retcode = geom_stats_open(); if (retcode != 0) return (-1); sq = geom_stats_snapshot_get(); if (sq == NULL) return (-1); ffound = ufound = 0; while ((gsp = geom_stats_snapshot_next(sq)) != NULL) { gid = geom_lookupid(&gm, gsp->id); if (gid == NULL) continue; if (gid->lg_what == ISPROVIDER) { pp = gid->lg_ptr; gg = pp->lg_geom; gcl = gg->lg_class; if (strcmp(gcl->lg_name, CLASS_NAME_MD) != 0) continue; if ((opt & OPT_UNIT) && (units != NULL)) { retcode = md_find(units, pp->lg_name); if (retcode != 1) continue; else ufound = 1; } gc = &pp->lg_config; type = geom_config_get(gc, "type"); if (type != NULL && (strcmp(type, "vnode") == 0 || strcmp(type, "preload") == 0)) { file = geom_config_get(gc, "file"); if (fflag != NULL && strcmp(fflag, file) != 0) continue; else ffound = 1; } else if (fflag != NULL) continue; if (nflag && strncmp(pp->lg_name, MD_NAME, 2) == 0) printf("%s", pp->lg_name + 2); else printf("%s", pp->lg_name); if (opt & OPT_VERBOSE || ((opt & OPT_UNIT) && fflag == NULL)) { length = geom_config_get(gc, "length"); printf("\t%s\t", type); if (length != NULL) md_prthumanval(length); if (file == NULL) file = "-"; printf("\t%s", file); file = NULL; label = geom_config_get(gc, "label"); if (label == NULL) label = ""; printf("\t%s", label); } opt |= OPT_DONE; if ((opt & OPT_LIST) && !(opt & OPT_VERBOSE)) printf(" "); else printf("\n"); } } if ((opt & OPT_LIST) && (opt & OPT_DONE) && !(opt & OPT_VERBOSE)) printf("\n"); /* XXX: Check if it's enough to clean everything. */ geom_stats_snapshot_free(sq); if (opt & OPT_UNIT) { if (((fflag == NULL) && ufound) || ((fflag == NULL) && (units != NULL) && ufound) || ((fflag != NULL) && ffound) || ((fflag != NULL) && (units != NULL) && ufound && ffound)) return (0); } else if (opt & OPT_LIST) { if ((fflag == NULL) || ((fflag != NULL) && ffound)) return (0); } return (-1); } /* * Returns the value of 'name' from the gconfig structure. */ static char * geom_config_get(struct gconf *g, const char *name) { struct gconfig *gce; LIST_FOREACH(gce, g, lg_config) { if (strcmp(gce->lg_name, name) == 0) return (gce->lg_val); } return (NULL); } /* * 'list' is a comma-separated list of md(4) disk names; 'name' is the * device name we look for. Returns 1 if found and 0 * otherwise.
*/ static int md_find(const char *list, const char *name) { int ret; char num[PATH_MAX]; char *ptr, *p, *u; ret = 0; ptr = strdup(list); if (ptr == NULL) return (-1); for (p = ptr; (u = strsep(&p, ",")) != NULL;) { if (strncmp(u, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) u += sizeof(_PATH_DEV) - 1; /* Just in case user specified number instead of full name */ snprintf(num, sizeof(num), "%s%s", MD_NAME, u); if (strcmp(u, name) == 0 || strcmp(num, name) == 0) { ret = 1; break; } } free(ptr); return (ret); } static void md_prthumanval(char *length) { char buf[6]; uintmax_t bytes; char *endptr; errno = 0; bytes = strtoumax(length, &endptr, 10); if (errno != 0 || *endptr != '\0' || bytes > INT64_MAX) return; humanize_number(buf, sizeof(buf), (int64_t)bytes, "", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); (void)printf("%6s", buf); } static int md_query(const char *name, const int opt, const char *fflag) { return (md_list(name, opt | OPT_UNIT, fflag)); } diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c index 08fb4b0c6574..92d93f0f0f1d 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -1,2156 +1,2170 @@ /*- * SPDX-License-Identifier: (Beerware AND BSD-3-Clause) * * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ * */ /*- * The following functions are based on the vn(4) driver: mdstart_swap(), * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(), * and as such under the following copyright: * * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah Hdr: vn.c 1.13 94/04/02 * * from: @(#)vn.c 8.6 (Berkeley) 4/1/94 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03 */ #include "opt_rootdevname.h" #include "opt_geom.h" #include "opt_md.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #define MD_MODVER 1 #define MD_SHUTDOWN 0x10000 /* Tell worker thread to terminate. */ #define MD_EXITING 0x20000 /* Worker thread is exiting. */ #define MD_PROVIDERGONE 0x40000 /* Safe to free the softc */ #ifndef MD_NSECT #define MD_NSECT (10000 * 2) #endif struct md_req { unsigned md_unit; /* unit number */ enum md_types md_type; /* type of disk */ off_t md_mediasize; /* size of disk in bytes */ unsigned md_sectorsize; /* sectorsize */ unsigned md_options; /* options */ int md_fwheads; /* firmware heads */ int md_fwsectors; /* firmware sectors */ char *md_file; /* pathname of file to mount */ enum uio_seg md_file_seg; /* location of md_file */ char *md_label; /* label of the device (userspace) */ int *md_units; /* pointer to units array (kernel) */ size_t md_units_nitems; /* items in md_units array */ }; #ifdef COMPAT_FREEBSD32 struct md_ioctl32 { unsigned md_version; unsigned md_unit; enum md_types md_type; uint32_t md_file; off_t md_mediasize; unsigned md_sectorsize; unsigned md_options; uint64_t md_base; int md_fwheads; int md_fwsectors; uint32_t md_label; int md_pad[MDNPAD]; } __attribute__((__packed__)); CTASSERT((sizeof(struct md_ioctl32)) == 436); #define MDIOCATTACH_32 _IOC_NEWTYPE(MDIOCATTACH, struct md_ioctl32) #define MDIOCDETACH_32 _IOC_NEWTYPE(MDIOCDETACH, struct md_ioctl32) #define MDIOCQUERY_32 _IOC_NEWTYPE(MDIOCQUERY, struct md_ioctl32) #define MDIOCRESIZE_32 _IOC_NEWTYPE(MDIOCRESIZE, struct md_ioctl32) #endif /* COMPAT_FREEBSD32 */ static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk"); static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors"); static int md_debug; SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "Enable md(4) debug messages"); static int md_malloc_wait; SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0, "Allow malloc to wait for memory allocations"); #if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE) #define MD_ROOT_FSTYPE "ufs" #endif #if defined(MD_ROOT) /* * Preloaded image gets put here. */ #if defined(MD_ROOT_SIZE) /* * We put the mfs_root symbol into the oldmfs section of the kernel object file. * Applications that patch the object with the image can determine * the size looking at the oldmfs section size within the kernel. 
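 * Note that MD_ROOT_SIZE is given in kilobytes; the mfs_root declaration below scales it by 1024.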
*/ u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs"))); const int mfs_root_size = sizeof(mfs_root); #elif defined(MD_ROOT_MEM) /* MD region already mapped in the memory */ u_char *mfs_root; int mfs_root_size; #else extern volatile u_char __weak_symbol mfs_root; extern volatile u_char __weak_symbol mfs_root_end; #define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root)) #endif #endif static g_init_t g_md_init; static g_fini_t g_md_fini; static g_start_t g_md_start; static g_access_t g_md_access; static void g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp); static g_provgone_t g_md_providergone; static struct cdev *status_dev = NULL; static struct sx md_sx; static struct unrhdr *md_uh; static d_ioctl_t mdctlioctl; static struct cdevsw mdctl_cdevsw = { .d_version = D_VERSION, .d_ioctl = mdctlioctl, .d_name = MD_NAME, }; struct g_class g_md_class = { .name = "MD", .version = G_VERSION, .init = g_md_init, .fini = g_md_fini, .start = g_md_start, .access = g_md_access, .dumpconf = g_md_dumpconf, .providergone = g_md_providergone, }; DECLARE_GEOM_CLASS(g_md_class, g_md); static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list); #define NINDIR (PAGE_SIZE / sizeof(uintptr_t)) #define NMASK (NINDIR-1) static int nshift; static uma_zone_t md_pbuf_zone; struct indir { uintptr_t *array; u_int total; u_int used; u_int shift; }; struct md_s { int unit; LIST_ENTRY(md_s) list; struct bio_queue_head bio_queue; struct mtx queue_mtx; struct cdev *dev; enum md_types type; off_t mediasize; unsigned sectorsize; unsigned opencount; unsigned fwheads; unsigned fwsectors; char ident[32]; unsigned flags; char name[20]; struct proc *procp; struct g_geom *gp; struct g_provider *pp; int (*start)(struct md_s *sc, struct bio *bp); struct devstat *devstat; + bool candelete; /* MD_MALLOC related fields */ struct indir *indir; uma_zone_t uma; /* MD_PRELOAD related fields */ u_char *pl_ptr; size_t pl_len; /* MD_VNODE related fields */ struct vnode *vnode; char file[PATH_MAX]; char label[PATH_MAX]; struct ucred *cred; /* MD_SWAP related fields */ vm_object_t object; }; static struct indir * new_indir(u_int shift) { struct indir *ip; ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO); if (ip == NULL) return (NULL); ip->array = malloc(sizeof(uintptr_t) * NINDIR, M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO); if (ip->array == NULL) { free(ip, M_MD); return (NULL); } ip->total = NINDIR; ip->shift = shift; return (ip); } static void del_indir(struct indir *ip) { free(ip->array, M_MDSECT); free(ip, M_MD); } static void destroy_indir(struct md_s *sc, struct indir *ip) { int i; for (i = 0; i < NINDIR; i++) { if (!ip->array[i]) continue; if (ip->shift) destroy_indir(sc, (struct indir*)(ip->array[i])); else if (ip->array[i] > 255) uma_zfree(sc->uma, (void *)(ip->array[i])); } del_indir(ip); } /* * This function does the math and allocates the top level "indir" structure * for a device of "size" sectors. */ static struct indir * dimension(off_t size) { off_t rcnt; struct indir *ip; int layer; rcnt = size; layer = 0; while (rcnt > NINDIR) { rcnt /= NINDIR; layer++; } /* * XXX: the top layer is probably not fully populated, so we allocate * too much space for ip->array in here. 
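 * As a worked example: with 4 kB pages and 8-byte pointers, NINDIR is 512, so a disk of up to 512 sectors needs only the leaf layer, and up to 512 * 512 sectors needs a single indirect layer (shift 9).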
*/ ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO); ip->array = malloc(sizeof(uintptr_t) * NINDIR, M_MDSECT, M_WAITOK | M_ZERO); ip->total = NINDIR; ip->shift = layer * nshift; return (ip); } /* * Read a given sector */ static uintptr_t s_read(struct indir *ip, off_t offset) { struct indir *cip; int idx; uintptr_t up; if (md_debug > 1) printf("s_read(%jd)\n", (intmax_t)offset); up = 0; for (cip = ip; cip != NULL;) { if (cip->shift) { idx = (offset >> cip->shift) & NMASK; up = cip->array[idx]; cip = (struct indir *)up; continue; } idx = offset & NMASK; return (cip->array[idx]); } return (0); } /* * Write a given sector, prune the tree if the value is 0 */ static int s_write(struct indir *ip, off_t offset, uintptr_t ptr) { struct indir *cip, *lip[10]; int idx, li; uintptr_t up; if (md_debug > 1) printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr); up = 0; li = 0; cip = ip; for (;;) { lip[li++] = cip; if (cip->shift) { idx = (offset >> cip->shift) & NMASK; up = cip->array[idx]; if (up != 0) { cip = (struct indir *)up; continue; } /* Allocate branch */ cip->array[idx] = (uintptr_t)new_indir(cip->shift - nshift); if (cip->array[idx] == 0) return (ENOSPC); cip->used++; up = cip->array[idx]; cip = (struct indir *)up; continue; } /* leafnode */ idx = offset & NMASK; up = cip->array[idx]; if (up != 0) cip->used--; cip->array[idx] = ptr; if (ptr != 0) cip->used++; break; } if (cip->used != 0 || li == 1) return (0); li--; while (cip->used == 0 && cip != ip) { li--; idx = (offset >> lip[li]->shift) & NMASK; up = lip[li]->array[idx]; KASSERT(up == (uintptr_t)cip, ("md screwed up")); del_indir(cip); lip[li]->array[idx] = 0; lip[li]->used--; cip = lip[li]; } return (0); } static int g_md_access(struct g_provider *pp, int r, int w, int e) { struct md_s *sc; sc = pp->geom->softc; if (sc == NULL) { if (r <= 0 && w <= 0 && e <= 0) return (0); return (ENXIO); } r += pp->acr; w += pp->acw; e += pp->ace; if ((sc->flags & MD_READONLY) != 0 && w > 0) return (EROFS); if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) { sc->opencount = 1; } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) { sc->opencount = 0; } return (0); } static void g_md_start(struct bio *bp) { struct md_s *sc; sc = bp->bio_to->geom->softc; if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) { devstat_start_transaction_bio(sc->devstat, bp); } mtx_lock(&sc->queue_mtx); bioq_disksort(&sc->bio_queue, bp); wakeup(sc); mtx_unlock(&sc->queue_mtx); } #define MD_MALLOC_MOVE_ZERO 1 #define MD_MALLOC_MOVE_FILL 2 #define MD_MALLOC_MOVE_READ 3 #define MD_MALLOC_MOVE_WRITE 4 #define MD_MALLOC_MOVE_CMP 5 static int md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize, void *ptr, u_char fill, int op) { struct sf_buf *sf; vm_page_t m, *mp1; char *p, first; off_t *uc; unsigned n; int error, i, ma_offs1, sz, first_read; m = NULL; error = 0; sf = NULL; /* if (op == MD_MALLOC_MOVE_CMP) { gcc */ first = 0; first_read = 0; uc = ptr; mp1 = *mp; ma_offs1 = *ma_offs; /* } */ sched_pin(); for (n = sectorsize; n != 0; n -= sz) { sz = imin(PAGE_SIZE - *ma_offs, n); if (m != **mp) { if (sf != NULL) sf_buf_free(sf); m = **mp; sf = sf_buf_alloc(m, SFB_CPUPRIVATE | (md_malloc_wait ? 
0 : SFB_NOWAIT)); if (sf == NULL) { error = ENOMEM; break; } } p = (char *)sf_buf_kva(sf) + *ma_offs; switch (op) { case MD_MALLOC_MOVE_ZERO: bzero(p, sz); break; case MD_MALLOC_MOVE_FILL: memset(p, fill, sz); break; case MD_MALLOC_MOVE_READ: bcopy(ptr, p, sz); cpu_flush_dcache(p, sz); break; case MD_MALLOC_MOVE_WRITE: bcopy(p, ptr, sz); break; case MD_MALLOC_MOVE_CMP: for (i = 0; i < sz; i++, p++) { if (!first_read) { *uc = (u_char)*p; first = *p; first_read = 1; } else if (*p != first) { error = EDOOFUS; break; } } break; default: KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op)); break; } if (error != 0) break; *ma_offs += sz; *ma_offs %= PAGE_SIZE; if (*ma_offs == 0) (*mp)++; ptr = (char *)ptr + sz; } if (sf != NULL) sf_buf_free(sf); sched_unpin(); if (op == MD_MALLOC_MOVE_CMP && error != 0) { *mp = mp1; *ma_offs = ma_offs1; } return (error); } static int md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs, unsigned len, void *ptr, u_char fill, int op) { bus_dma_segment_t *vlist; uint8_t *p, *end, first; off_t *uc; int ma_offs, seg_len; vlist = *pvlist; ma_offs = *pma_offs; uc = ptr; for (; len != 0; len -= seg_len) { seg_len = imin(vlist->ds_len - ma_offs, len); p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs; switch (op) { case MD_MALLOC_MOVE_ZERO: bzero(p, seg_len); break; case MD_MALLOC_MOVE_FILL: memset(p, fill, seg_len); break; case MD_MALLOC_MOVE_READ: bcopy(ptr, p, seg_len); cpu_flush_dcache(p, seg_len); break; case MD_MALLOC_MOVE_WRITE: bcopy(p, ptr, seg_len); break; case MD_MALLOC_MOVE_CMP: end = p + seg_len; first = *uc = *p; /* Confirm all following bytes match the first */ while (++p < end) { if (*p != first) return (EDOOFUS); } break; default: KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op)); break; } ma_offs += seg_len; if (ma_offs == vlist->ds_len) { ma_offs = 0; vlist++; } ptr = (uint8_t *)ptr + seg_len; } *pvlist = vlist; *pma_offs = ma_offs; return (0); } static int mdstart_malloc(struct md_s *sc, struct bio *bp) { u_char *dst; vm_page_t *m; bus_dma_segment_t *vlist; int i, error, error1, ma_offs, notmapped; off_t secno, nsec, uc; uintptr_t sp, osp; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; default: return (EOPNOTSUPP); } notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0; vlist = (bp->bio_flags & BIO_VLIST) != 0 ? 
(bus_dma_segment_t *)bp->bio_data : NULL; if (notmapped) { m = bp->bio_ma; ma_offs = bp->bio_ma_offset; dst = NULL; KASSERT(vlist == NULL, ("vlists cannot be unmapped")); } else if (vlist != NULL) { ma_offs = bp->bio_ma_offset; dst = NULL; } else { dst = bp->bio_data; } nsec = bp->bio_length / sc->sectorsize; secno = bp->bio_offset / sc->sectorsize; error = 0; while (nsec--) { osp = s_read(sc->indir, secno); if (bp->bio_cmd == BIO_DELETE) { if (osp != 0) error = s_write(sc->indir, secno, 0); } else if (bp->bio_cmd == BIO_READ) { if (osp == 0) { if (notmapped) { error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, NULL, 0, MD_MALLOC_MOVE_ZERO); } else if (vlist != NULL) { error = md_malloc_move_vlist(&vlist, &ma_offs, sc->sectorsize, NULL, 0, MD_MALLOC_MOVE_ZERO); } else bzero(dst, sc->sectorsize); } else if (osp <= 255) { if (notmapped) { error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, NULL, osp, MD_MALLOC_MOVE_FILL); } else if (vlist != NULL) { error = md_malloc_move_vlist(&vlist, &ma_offs, sc->sectorsize, NULL, osp, MD_MALLOC_MOVE_FILL); } else memset(dst, osp, sc->sectorsize); } else { if (notmapped) { error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, (void *)osp, 0, MD_MALLOC_MOVE_READ); } else if (vlist != NULL) { error = md_malloc_move_vlist(&vlist, &ma_offs, sc->sectorsize, (void *)osp, 0, MD_MALLOC_MOVE_READ); } else { bcopy((void *)osp, dst, sc->sectorsize); cpu_flush_dcache(dst, sc->sectorsize); } } osp = 0; } else if (bp->bio_cmd == BIO_WRITE) { if (sc->flags & MD_COMPRESS) { if (notmapped) { error1 = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, &uc, 0, MD_MALLOC_MOVE_CMP); i = error1 == 0 ? sc->sectorsize : 0; } else if (vlist != NULL) { error1 = md_malloc_move_vlist(&vlist, &ma_offs, sc->sectorsize, &uc, 0, MD_MALLOC_MOVE_CMP); i = error1 == 0 ? sc->sectorsize : 0; } else { uc = dst[0]; for (i = 1; i < sc->sectorsize; i++) { if (dst[i] != uc) break; } } } else { i = 0; uc = 0; } if (i == sc->sectorsize) { if (osp != uc) error = s_write(sc->indir, secno, uc); } else { if (osp <= 255) { sp = (uintptr_t)uma_zalloc(sc->uma, md_malloc_wait ? 
M_WAITOK : M_NOWAIT); if (sp == 0) { error = ENOSPC; break; } if (notmapped) { error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, (void *)sp, 0, MD_MALLOC_MOVE_WRITE); } else if (vlist != NULL) { error = md_malloc_move_vlist( &vlist, &ma_offs, sc->sectorsize, (void *)sp, 0, MD_MALLOC_MOVE_WRITE); } else { bcopy(dst, (void *)sp, sc->sectorsize); } error = s_write(sc->indir, secno, sp); } else { if (notmapped) { error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, (void *)osp, 0, MD_MALLOC_MOVE_WRITE); } else if (vlist != NULL) { error = md_malloc_move_vlist( &vlist, &ma_offs, sc->sectorsize, (void *)osp, 0, MD_MALLOC_MOVE_WRITE); } else { bcopy(dst, (void *)osp, sc->sectorsize); } osp = 0; } } } else { error = EOPNOTSUPP; } if (osp > 255) uma_zfree(sc->uma, (void*)osp); if (error != 0) break; secno++; if (!notmapped && vlist == NULL) dst += sc->sectorsize; } bp->bio_resid = 0; return (error); } static void mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len) { off_t seg_len; while (offset >= vlist->ds_len) { offset -= vlist->ds_len; vlist++; } while (len != 0) { seg_len = omin(len, vlist->ds_len - offset); bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset), seg_len); offset = 0; src = (uint8_t *)src + seg_len; len -= seg_len; vlist++; } } static void mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len) { off_t seg_len; while (offset >= vlist->ds_len) { offset -= vlist->ds_len; vlist++; } while (len != 0) { seg_len = omin(len, vlist->ds_len - offset); bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst, seg_len); offset = 0; dst = (uint8_t *)dst + seg_len; len -= seg_len; vlist++; } } static int mdstart_preload(struct md_s *sc, struct bio *bp) { uint8_t *p; p = sc->pl_ptr + bp->bio_offset; switch (bp->bio_cmd) { case BIO_READ: if ((bp->bio_flags & BIO_VLIST) != 0) { mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data, bp->bio_ma_offset, bp->bio_length); } else { bcopy(p, bp->bio_data, bp->bio_length); } cpu_flush_dcache(bp->bio_data, bp->bio_length); break; case BIO_WRITE: if ((bp->bio_flags & BIO_VLIST) != 0) { mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data, bp->bio_ma_offset, p, bp->bio_length); } else { bcopy(bp->bio_data, p, bp->bio_length); } break; } bp->bio_resid = 0; return (0); } static int mdstart_vnode(struct md_s *sc, struct bio *bp) { int error; struct uio auio; struct iovec aiov; struct iovec *piov; struct mount *mp; struct vnode *vp; struct buf *pb; bus_dma_segment_t *vlist; struct thread *td; off_t iolen, iostart, off, len; int ma_offs, npages; switch (bp->bio_cmd) { case BIO_READ: auio.uio_rw = UIO_READ; break; case BIO_WRITE: auio.uio_rw = UIO_WRITE; break; - case BIO_DELETE: case BIO_FLUSH: break; + case BIO_DELETE: + if (sc->candelete) + break; + /* FALLTHROUGH */ default: return (EOPNOTSUPP); } td = curthread; vp = sc->vnode; pb = NULL; piov = NULL; ma_offs = bp->bio_ma_offset; off = bp->bio_offset; len = bp->bio_length; /* * VNODE I/O * * If an error occurs, we set BIO_ERROR but we do not set * B_INVAL because (for a write anyway), the buffer is * still valid. */ if (bp->bio_cmd == BIO_FLUSH) { (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } else if (bp->bio_cmd == BIO_DELETE) {
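/* vn_deallocate(9) punches a hole where the backing file system supports it and zero-fills otherwise; on return, len holds any part of the range that was not processed, which is reported back via bio_resid. */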
error = vn_deallocate(vp, &off, &len, 0, sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred, NOCRED); bp->bio_resid = len; return (error); } auio.uio_offset = (vm_ooffset_t)bp->bio_offset; auio.uio_resid = bp->bio_length; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; if ((bp->bio_flags & BIO_VLIST) != 0) { piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK); auio.uio_iov = piov; vlist = (bus_dma_segment_t *)bp->bio_data; while (len > 0) { piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr + ma_offs); piov->iov_len = vlist->ds_len - ma_offs; if (piov->iov_len > len) piov->iov_len = len; len -= piov->iov_len; ma_offs = 0; vlist++; piov++; } auio.uio_iovcnt = piov - auio.uio_iov; piov = auio.uio_iov; } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { pb = uma_zalloc(md_pbuf_zone, M_WAITOK); MPASS((pb->b_flags & B_MAXPHYS) != 0); bp->bio_resid = len; unmapped_step: npages = atop(min(maxphys, round_page(len + (ma_offs & PAGE_MASK)))); iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len); KASSERT(iolen > 0, ("zero iolen")); pmap_qenter((vm_offset_t)pb->b_data, &bp->bio_ma[atop(ma_offs)], npages); aiov.iov_base = (void *)((vm_offset_t)pb->b_data + (ma_offs & PAGE_MASK)); aiov.iov_len = iolen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = iolen; } else { aiov.iov_base = bp->bio_data; aiov.iov_len = bp->bio_length; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; } iostart = auio.uio_offset; if (auio.uio_rw == UIO_READ) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(vp, &auio, 0, sc->cred); VOP_UNLOCK(vp); } else { (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred); VOP_UNLOCK(vp); vn_finished_write(mp); if (error == 0) sc->flags &= ~MD_VERIFY; } /* When MD_CACHE is set, try to avoid double-caching the data. */ if (error == 0 && (sc->flags & MD_CACHE) == 0) VOP_ADVISE(vp, iostart, auio.uio_offset - 1, POSIX_FADV_DONTNEED); if (pb != NULL) { pmap_qremove((vm_offset_t)pb->b_data, npages); if (error == 0) { len -= iolen; bp->bio_resid -= iolen; ma_offs += iolen; if (len > 0) goto unmapped_step; } uma_zfree(md_pbuf_zone, pb); } else { bp->bio_resid = auio.uio_resid; } free(piov, M_MD); return (error); } static int mdstart_swap(struct md_s *sc, struct bio *bp) { vm_page_t m; u_char *p; vm_pindex_t i, lastp; bus_dma_segment_t *vlist; int rv, ma_offs, offs, len, lastend; switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: break; default: return (EOPNOTSUPP); } p = bp->bio_data; ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ? bp->bio_ma_offset : 0; vlist = (bp->bio_flags & BIO_VLIST) != 0 ? (bus_dma_segment_t *)bp->bio_data : NULL; /* * offs is the offset at which to start operating on the * next (ie, first) page. lastp is the last page on * which we're going to operate. lastend is the ending * position within that last page (ie, PAGE_SIZE if * we're operating on complete aligned pages). */ offs = bp->bio_offset % PAGE_SIZE; lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE; lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1; rv = VM_PAGER_OK; vm_object_pip_add(sc->object, 1); for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) { len = ((i == lastp) ?
lastend : PAGE_SIZE) - offs; m = vm_page_grab_unlocked(sc->object, i, VM_ALLOC_SYSTEM); if (bp->bio_cmd == BIO_READ) { if (vm_page_all_valid(m)) rv = VM_PAGER_OK; else rv = vm_pager_get_pages(sc->object, &m, 1, NULL, NULL); if (rv == VM_PAGER_ERROR) { VM_OBJECT_WLOCK(sc->object); vm_page_free(m); VM_OBJECT_WUNLOCK(sc->object); break; } else if (rv == VM_PAGER_FAIL) { /* * Pager does not have the page. Zero * the allocated page, and mark it as * valid. Do not set dirty, the page * can be recreated if thrown out. */ pmap_zero_page(m); vm_page_valid(m); } if ((bp->bio_flags & BIO_UNMAPPED) != 0) { pmap_copy_pages(&m, offs, bp->bio_ma, ma_offs, len); } else if ((bp->bio_flags & BIO_VLIST) != 0) { physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs, vlist, ma_offs, len); cpu_flush_dcache(p, len); } else { physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len); cpu_flush_dcache(p, len); } } else if (bp->bio_cmd == BIO_WRITE) { if (len == PAGE_SIZE || vm_page_all_valid(m)) rv = VM_PAGER_OK; else rv = vm_pager_get_pages(sc->object, &m, 1, NULL, NULL); if (rv == VM_PAGER_ERROR) { VM_OBJECT_WLOCK(sc->object); vm_page_free(m); VM_OBJECT_WUNLOCK(sc->object); break; } else if (rv == VM_PAGER_FAIL) pmap_zero_page(m); if ((bp->bio_flags & BIO_UNMAPPED) != 0) { pmap_copy_pages(bp->bio_ma, ma_offs, &m, offs, len); } else if ((bp->bio_flags & BIO_VLIST) != 0) { physcopyin_vlist(vlist, ma_offs, VM_PAGE_TO_PHYS(m) + offs, len); } else { physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len); } vm_page_valid(m); vm_page_set_dirty(m); } else if (bp->bio_cmd == BIO_DELETE) { if (len == PAGE_SIZE || vm_page_all_valid(m)) rv = VM_PAGER_OK; else rv = vm_pager_get_pages(sc->object, &m, 1, NULL, NULL); VM_OBJECT_WLOCK(sc->object); if (rv == VM_PAGER_ERROR) { vm_page_free(m); VM_OBJECT_WUNLOCK(sc->object); break; } else if (rv == VM_PAGER_FAIL) { vm_page_free(m); m = NULL; } else { /* Page is valid. */ if (len != PAGE_SIZE) { pmap_zero_page_area(m, offs, len); vm_page_set_dirty(m); } else { vm_pager_page_unswapped(m); vm_page_free(m); m = NULL; } } VM_OBJECT_WUNLOCK(sc->object); } if (m != NULL) { /* * The page may be deactivated prior to setting * PGA_REFERENCED, but in this case it will be * reactivated by the page daemon. */ if (vm_page_active(m)) vm_page_reference(m); else vm_page_activate(m); vm_page_xunbusy(m); } /* Actions on further pages start at offset 0 */ p += PAGE_SIZE - offs; offs = 0; ma_offs += len; } vm_object_pip_wakeup(sc->object); return (rv != VM_PAGER_ERROR ? 
0 : ENOSPC); } static int mdstart_null(struct md_s *sc, struct bio *bp) { switch (bp->bio_cmd) { case BIO_READ: bzero(bp->bio_data, bp->bio_length); cpu_flush_dcache(bp->bio_data, bp->bio_length); break; case BIO_WRITE: break; } bp->bio_resid = 0; return (0); } static void md_handleattr(struct md_s *sc, struct bio *bp) { if (sc->fwsectors && sc->fwheads && (g_handleattr_int(bp, "GEOM::fwsectors", sc->fwsectors) != 0 || g_handleattr_int(bp, "GEOM::fwheads", sc->fwheads) != 0)) return; - if (g_handleattr_int(bp, "GEOM::candelete", 1) != 0) + if (g_handleattr_int(bp, "GEOM::candelete", sc->candelete) != 0) return; if (sc->ident[0] != '\0' && g_handleattr_str(bp, "GEOM::ident", sc->ident) != 0) return; if (g_handleattr_int(bp, "MNT::verified", (sc->flags & MD_VERIFY) != 0)) return; g_io_deliver(bp, EOPNOTSUPP); } static void md_kthread(void *arg) { struct md_s *sc; struct bio *bp; int error; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); if (sc->type == MD_VNODE) curthread->td_pflags |= TDP_NORUNNINGBUF; for (;;) { mtx_lock(&sc->queue_mtx); if (sc->flags & MD_SHUTDOWN) { sc->flags |= MD_EXITING; mtx_unlock(&sc->queue_mtx); kproc_exit(0); } bp = bioq_takefirst(&sc->bio_queue); if (!bp) { msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0); continue; } mtx_unlock(&sc->queue_mtx); if (bp->bio_cmd == BIO_GETATTR) { md_handleattr(sc, bp); } else { error = sc->start(sc, bp); if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) { /* * Devstat uses (bio_bcount, bio_resid) for * determining the length of the completed part * of the i/o. g_io_deliver() will translate * from bio_completed to that, but it also * destroys the bio so we must do our own * translation. */ bp->bio_bcount = bp->bio_length; devstat_end_transaction_bio(sc->devstat, bp); } bp->bio_completed = bp->bio_length - bp->bio_resid; g_io_deliver(bp, error); } } } static struct md_s * mdfind(int unit) { struct md_s *sc; LIST_FOREACH(sc, &md_softc_list, list) { if (sc->unit == unit) break; } return (sc); } static struct md_s * mdnew(int unit, int *errp, enum md_types type) { struct md_s *sc; int error; *errp = 0; if (unit == -1) unit = alloc_unr(md_uh); else unit = alloc_unr_specific(md_uh, unit); if (unit == -1) { *errp = EBUSY; return (NULL); } sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO); sc->type = type; bioq_init(&sc->bio_queue); mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF); sc->unit = unit; sprintf(sc->name, "md%d", unit); LIST_INSERT_HEAD(&md_softc_list, sc, list); error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name); if (error == 0) return (sc); LIST_REMOVE(sc, list); mtx_destroy(&sc->queue_mtx); free_unr(md_uh, sc->unit); free(sc, M_MD); *errp = error; return (NULL); } static void mdinit(struct md_s *sc) { struct g_geom *gp; struct g_provider *pp; g_topology_lock(); gp = g_new_geomf(&g_md_class, "md%d", sc->unit); gp->softc = sc; pp = g_new_providerf(gp, "md%d", sc->unit); devstat_remove_entry(pp->stat); pp->stat = NULL; pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; pp->mediasize = sc->mediasize; pp->sectorsize = sc->sectorsize; switch (sc->type) { case MD_MALLOC: case MD_VNODE: case MD_SWAP: pp->flags |= G_PF_ACCEPT_UNMAPPED; break; case MD_PRELOAD: case MD_NULL: break; } sc->gp = gp; sc->pp = pp; sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); sc->devstat->id = pp; g_error_provider(pp, 0); g_topology_unlock(); } static int mdcreate_malloc(struct 
md_s *sc, struct md_req *mdr) { uintptr_t sp; int error; off_t u; error = 0; if (mdr->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE)) return (EINVAL); if (mdr->md_sectorsize != 0 && !powerof2(mdr->md_sectorsize)) return (EINVAL); /* Compression doesn't make sense if we have reserved space */ if (mdr->md_options & MD_RESERVE) mdr->md_options &= ~MD_COMPRESS; if (mdr->md_fwsectors != 0) sc->fwsectors = mdr->md_fwsectors; if (mdr->md_fwheads != 0) sc->fwheads = mdr->md_fwheads; sc->flags = mdr->md_options & (MD_COMPRESS | MD_FORCE); sc->indir = dimension(sc->mediasize / sc->sectorsize); sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL, 0x1ff, 0); if (mdr->md_options & MD_RESERVE) { off_t nsectors; nsectors = sc->mediasize / sc->sectorsize; for (u = 0; u < nsectors; u++) { sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO); if (sp != 0) error = s_write(sc->indir, u, sp); else error = ENOMEM; if (error != 0) break; } } return (error); } static int mdsetcred(struct md_s *sc, struct ucred *cred) { char *tmpbuf; int error = 0; /* * Set credits in our softc */ if (sc->cred) crfree(sc->cred); sc->cred = crhold(cred); /* * Horrible kludge to establish credentials for NFS XXX. */ if (sc->vnode) { struct uio auio; struct iovec aiov; tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK); bzero(&auio, sizeof(auio)); aiov.iov_base = tmpbuf; aiov.iov_len = sc->sectorsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_resid = aiov.iov_len; vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(sc->vnode, &auio, 0, sc->cred); VOP_UNLOCK(sc->vnode); free(tmpbuf, M_TEMP); } return (error); } static int mdcreate_vnode(struct md_s *sc, struct md_req *mdr, struct thread *td) { struct vattr vattr; struct nameidata nd; char *fname; int error, flags; + long v; fname = mdr->md_file; if (mdr->md_file_seg == UIO_USERSPACE) { error = copyinstr(fname, sc->file, sizeof(sc->file), NULL); if (error != 0) return (error); } else if (mdr->md_file_seg == UIO_SYSSPACE) strlcpy(sc->file, fname, sizeof(sc->file)); else return (EDOOFUS); /* * If the user specified that this is a read only device, don't * set the FWRITE mask before trying to open the backing store. */ flags = FREAD | ((mdr->md_options & MD_READONLY) ? 0 : FWRITE) \ | ((mdr->md_options & MD_VERIFY) ? O_VERIFY : 0); NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td); error = vn_open(&nd, &flags, 0, NULL); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) { error = EINVAL; goto bad; } error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred); if (error != 0) goto bad; + if ((mdr->md_options & MD_MUSTDEALLOC) != 0) { + error = VOP_PATHCONF(nd.ni_vp, _PC_DEALLOC_PRESENT, &v); + if (error != 0) + goto bad; + if (v == 0) + sc->candelete = false; + } if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) { vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(nd.ni_vp)) { /* Forced unmount. 
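 * The vnode was reclaimed while we slept waiting for the lock upgrade.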
			error = EBADF;
			goto bad;
		}
	}
	nd.ni_vp->v_vflag |= VV_MD;
	VOP_UNLOCK(nd.ni_vp);

	if (mdr->md_fwsectors != 0)
		sc->fwsectors = mdr->md_fwsectors;
	if (mdr->md_fwheads != 0)
		sc->fwheads = mdr->md_fwheads;
	snprintf(sc->ident, sizeof(sc->ident), "MD-DEV%ju-INO%ju",
	    (uintmax_t)vattr.va_fsid, (uintmax_t)vattr.va_fileid);
	sc->flags = mdr->md_options & (MD_ASYNC | MD_CACHE | MD_FORCE |
	    MD_VERIFY);
	if (!(flags & FWRITE))
		sc->flags |= MD_READONLY;
	sc->vnode = nd.ni_vp;

	error = mdsetcred(sc, td->td_ucred);
	if (error != 0) {
		sc->vnode = NULL;
		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
		nd.ni_vp->v_vflag &= ~VV_MD;
		goto bad;
	}
	return (0);
bad:
	VOP_UNLOCK(nd.ni_vp);
	(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
	return (error);
}

static void
g_md_providergone(struct g_provider *pp)
{
	struct md_s *sc = pp->geom->softc;

	mtx_lock(&sc->queue_mtx);
	sc->flags |= MD_PROVIDERGONE;
	wakeup(&sc->flags);
	mtx_unlock(&sc->queue_mtx);
}

static int
mddestroy(struct md_s *sc, struct thread *td)
{

	if (sc->gp) {
		g_topology_lock();
		g_wither_geom(sc->gp, ENXIO);
		g_topology_unlock();

		mtx_lock(&sc->queue_mtx);
		while (!(sc->flags & MD_PROVIDERGONE))
			msleep(&sc->flags, &sc->queue_mtx, PRIBIO,
			    "mddestroy", 0);
		mtx_unlock(&sc->queue_mtx);
	}
	if (sc->devstat) {
		devstat_remove_entry(sc->devstat);
		sc->devstat = NULL;
	}
	mtx_lock(&sc->queue_mtx);
	sc->flags |= MD_SHUTDOWN;
	wakeup(sc);
	while (!(sc->flags & MD_EXITING))
		msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy",
		    hz / 10);
	mtx_unlock(&sc->queue_mtx);
	mtx_destroy(&sc->queue_mtx);
	if (sc->vnode != NULL) {
		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
		sc->vnode->v_vflag &= ~VV_MD;
		VOP_UNLOCK(sc->vnode);
		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
		    FREAD : (FREAD|FWRITE), sc->cred, td);
	}
	if (sc->cred != NULL)
		crfree(sc->cred);
	if (sc->object != NULL)
		vm_object_deallocate(sc->object);
	if (sc->indir)
		destroy_indir(sc, sc->indir);
	if (sc->uma)
		uma_zdestroy(sc->uma);

	LIST_REMOVE(sc, list);
	free_unr(md_uh, sc->unit);
	free(sc, M_MD);
	return (0);
}

static int
mdresize(struct md_s *sc, struct md_req *mdr)
{
	int error, res;
	vm_pindex_t oldpages, newpages;

	switch (sc->type) {
	case MD_VNODE:
	case MD_NULL:
		break;
	case MD_SWAP:
		if (mdr->md_mediasize <= 0 ||
		    (mdr->md_mediasize % PAGE_SIZE) != 0)
			return (EDOM);
		oldpages = OFF_TO_IDX(sc->mediasize);
		newpages = OFF_TO_IDX(mdr->md_mediasize);
		if (newpages < oldpages) {
			VM_OBJECT_WLOCK(sc->object);
			vm_object_page_remove(sc->object, newpages, 0, 0);
			swap_release_by_cred(IDX_TO_OFF(oldpages - newpages),
			    sc->cred);
			sc->object->charge = IDX_TO_OFF(newpages);
			sc->object->size = newpages;
			VM_OBJECT_WUNLOCK(sc->object);
		} else if (newpages > oldpages) {
			res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
			    oldpages), sc->cred);
			if (!res)
				return (ENOMEM);
			if ((mdr->md_options & MD_RESERVE) ||
			    (sc->flags & MD_RESERVE)) {
				error = swap_pager_reserve(sc->object,
				    oldpages, newpages - oldpages);
				if (error < 0) {
					swap_release_by_cred(
					    IDX_TO_OFF(newpages - oldpages),
					    sc->cred);
					return (EDOM);
				}
			}
			VM_OBJECT_WLOCK(sc->object);
			sc->object->charge = IDX_TO_OFF(newpages);
			sc->object->size = newpages;
			VM_OBJECT_WUNLOCK(sc->object);
		}
		break;
	default:
		return (EOPNOTSUPP);
	}

	sc->mediasize = mdr->md_mediasize;

	g_topology_lock();
	g_resize_provider(sc->pp, sc->mediasize);
	g_topology_unlock();
	return (0);
}
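mdcreate_vnode() above probes the backing vnode with
VOP_PATHCONF(_PC_DEALLOC_PRESENT) when MD_MUSTDEALLOC is requested, and clears
sc->candelete when the file system cannot deallocate blocks.  The same
property should be visible from userland through pathconf(2); a minimal
sketch, assuming a backing file at the illustrative path /var/tmp/md.img:

#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	long v;

	/*
	 * _PC_DEALLOC_PRESENT reports whether the file system can
	 * deallocate (hole-punch) file ranges, e.g. via fspacectl(2).
	 */
	v = pathconf("/var/tmp/md.img", _PC_DEALLOC_PRESENT);
	if (v == -1)
		err(1, "pathconf");
	printf("deallocation %s\n", v != 0 ? "present" : "absent");
	return (0);
}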
static int
mdcreate_swap(struct md_s *sc, struct md_req *mdr, struct thread *td)
{
	vm_ooffset_t npage;
	int error;

	/*
	 * Range check.  Disallow negative sizes and sizes not being
	 * multiple of page size.
	 */
	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
		return (EDOM);

	/*
	 * Allocate an OBJT_SWAP object.
	 *
	 * Note the truncation.
	 */
	if ((mdr->md_options & MD_VERIFY) != 0)
		return (EINVAL);
	npage = mdr->md_mediasize / PAGE_SIZE;
	if (mdr->md_fwsectors != 0)
		sc->fwsectors = mdr->md_fwsectors;
	if (mdr->md_fwheads != 0)
		sc->fwheads = mdr->md_fwheads;
	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
	    VM_PROT_DEFAULT, 0, td->td_ucred);
	if (sc->object == NULL)
		return (ENOMEM);
	sc->flags = mdr->md_options & (MD_FORCE | MD_RESERVE);
	if (mdr->md_options & MD_RESERVE) {
		if (swap_pager_reserve(sc->object, 0, npage) < 0) {
			error = EDOM;
			goto finish;
		}
	}
	error = mdsetcred(sc, td->td_ucred);
finish:
	if (error != 0) {
		vm_object_deallocate(sc->object);
		sc->object = NULL;
	}
	return (error);
}

static int
mdcreate_null(struct md_s *sc, struct md_req *mdr, struct thread *td)
{

	/*
	 * Range check.  Disallow negative sizes and sizes not being
	 * multiple of page size.
	 */
	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
		return (EDOM);

	return (0);
}

static int
kern_mdattach_locked(struct thread *td, struct md_req *mdr)
{
	struct md_s *sc;
	unsigned sectsize;
	int error, i;

	sx_assert(&md_sx, SA_XLOCKED);

	switch (mdr->md_type) {
	case MD_MALLOC:
	case MD_PRELOAD:
	case MD_VNODE:
	case MD_SWAP:
	case MD_NULL:
		break;
	default:
		return (EINVAL);
	}
	if (mdr->md_sectorsize == 0)
		sectsize = DEV_BSIZE;
	else
		sectsize = mdr->md_sectorsize;
	if (sectsize > maxphys || mdr->md_mediasize < sectsize)
		return (EINVAL);
	if (mdr->md_options & MD_AUTOUNIT)
		sc = mdnew(-1, &error, mdr->md_type);
	else {
		if (mdr->md_unit > INT_MAX)
			return (EINVAL);
		sc = mdnew(mdr->md_unit, &error, mdr->md_type);
	}
	if (sc == NULL)
		return (error);
	if (mdr->md_label != NULL)
		error = copyinstr(mdr->md_label, sc->label,
		    sizeof(sc->label), NULL);
	if (error != 0)
		goto err_after_new;
	if (mdr->md_options & MD_AUTOUNIT)
		mdr->md_unit = sc->unit;
	sc->mediasize = mdr->md_mediasize;
	sc->sectorsize = sectsize;
+	sc->candelete = true;
	error = EDOOFUS;
	switch (sc->type) {
	case MD_MALLOC:
		sc->start = mdstart_malloc;
		error = mdcreate_malloc(sc, mdr);
		break;
	case MD_PRELOAD:
		/*
		 * We disallow attaching preloaded memory disks via
		 * ioctl.  Preloaded memory disks are automatically
		 * attached in g_md_init().
		 */
		error = EOPNOTSUPP;
		break;
	case MD_VNODE:
		sc->start = mdstart_vnode;
		error = mdcreate_vnode(sc, mdr, td);
		break;
	case MD_SWAP:
		sc->start = mdstart_swap;
		error = mdcreate_swap(sc, mdr, td);
		break;
	case MD_NULL:
		sc->start = mdstart_null;
		error = mdcreate_null(sc, mdr, td);
		break;
	}
err_after_new:
	if (error != 0) {
		mddestroy(sc, td);
		return (error);
	}

	/* Prune off any residual fractional sector */
	i = sc->mediasize % sc->sectorsize;
	sc->mediasize -= i;

	mdinit(sc);
	return (0);
}

static int
kern_mdattach(struct thread *td, struct md_req *mdr)
{
	int error;

	sx_xlock(&md_sx);
	error = kern_mdattach_locked(td, mdr);
	sx_xunlock(&md_sx);
	return (error);
}

static int
kern_mddetach_locked(struct thread *td, struct md_req *mdr)
{
	struct md_s *sc;

	sx_assert(&md_sx, SA_XLOCKED);

	if (mdr->md_mediasize != 0 ||
	    (mdr->md_options & ~MD_FORCE) != 0)
		return (EINVAL);

	sc = mdfind(mdr->md_unit);
	if (sc == NULL)
		return (ENOENT);
	if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
	    !(mdr->md_options & MD_FORCE))
		return (EBUSY);
	return (mddestroy(sc, td));
}

static int
kern_mddetach(struct thread *td, struct md_req *mdr)
{
	int error;

	sx_xlock(&md_sx);
	error = kern_mddetach_locked(td, mdr);
	sx_xunlock(&md_sx);
	return (error);
}

static int
kern_mdresize_locked(struct md_req *mdr)
{
	struct md_s *sc;

	sx_assert(&md_sx, SA_XLOCKED);

	if ((mdr->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
		return (EINVAL);

	sc = mdfind(mdr->md_unit);
	if (sc == NULL)
		return (ENOENT);
	if (mdr->md_mediasize < sc->sectorsize)
		return (EINVAL);
	mdr->md_mediasize -= mdr->md_mediasize % sc->sectorsize;
	if (mdr->md_mediasize < sc->mediasize &&
	    !(sc->flags & MD_FORCE) &&
	    !(mdr->md_options & MD_FORCE))
		return (EBUSY);
	return (mdresize(sc, mdr));
}

static int
kern_mdresize(struct md_req *mdr)
{
	int error;

	sx_xlock(&md_sx);
	error = kern_mdresize_locked(mdr);
	sx_xunlock(&md_sx);
	return (error);
}

static int
kern_mdquery_locked(struct md_req *mdr)
{
	struct md_s *sc;
	int error;

	sx_assert(&md_sx, SA_XLOCKED);

	sc = mdfind(mdr->md_unit);
	if (sc == NULL)
		return (ENOENT);
	mdr->md_type = sc->type;
	mdr->md_options = sc->flags;
	mdr->md_mediasize = sc->mediasize;
	mdr->md_sectorsize = sc->sectorsize;
	error = 0;
	if (mdr->md_label != NULL) {
		error = copyout(sc->label, mdr->md_label,
		    strlen(sc->label) + 1);
		if (error != 0)
			return (error);
	}
	if (sc->type == MD_VNODE ||
	    (sc->type == MD_PRELOAD && mdr->md_file != NULL))
		error = copyout(sc->file, mdr->md_file,
		    strlen(sc->file) + 1);
	return (error);
}

static int
kern_mdquery(struct md_req *mdr)
{
	int error;

	sx_xlock(&md_sx);
	error = kern_mdquery_locked(mdr);
	sx_xunlock(&md_sx);
	return (error);
}
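kern_mdquery() copies a unit's current configuration back into the request,
which reaches userland as the MDIOCQUERY ioctl declared in sys/mdioctl.h (see
the header hunk below).  A minimal sketch, querying unit 0 via /dev/mdctl;
error handling is reduced to err(3):

#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/mdioctl.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct md_ioctl mdio;
	char fname[MAXPATHLEN];
	int fd;

	fd = open("/dev/" MDCTL_NAME, O_RDWR, 0);
	if (fd < 0)
		err(1, "open(/dev/%s)", MDCTL_NAME);
	memset(&mdio, 0, sizeof(mdio));
	memset(fname, 0, sizeof(fname));
	mdio.md_version = MDIOVERSION;
	mdio.md_unit = 0;		/* query md0 */
	mdio.md_file = fname;		/* receives the backing path, if any */
	if (ioctl(fd, MDIOCQUERY, &mdio) < 0)
		err(1, "MDIOCQUERY");
	printf("md%u: type %d, %jd bytes, sector %u, options %#x\n",
	    mdio.md_unit, (int)mdio.md_type, (intmax_t)mdio.md_mediasize,
	    mdio.md_sectorsize, mdio.md_options);
	if (mdio.md_type == MD_VNODE)
		printf("backing store: %s\n", fname);
	return (0);
}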
/* Copy members that are not userspace pointers. */
#define	MD_IOCTL2REQ(mdio, mdr) do {					\
	(mdr)->md_unit = (mdio)->md_unit;				\
	(mdr)->md_type = (mdio)->md_type;				\
	(mdr)->md_mediasize = (mdio)->md_mediasize;			\
	(mdr)->md_sectorsize = (mdio)->md_sectorsize;			\
	(mdr)->md_options = (mdio)->md_options;				\
	(mdr)->md_fwheads = (mdio)->md_fwheads;				\
	(mdr)->md_fwsectors = (mdio)->md_fwsectors;			\
	(mdr)->md_units = &(mdio)->md_pad[0];				\
	(mdr)->md_units_nitems = nitems((mdio)->md_pad);		\
} while(0)

/* Copy members that might have been updated */
#define	MD_REQ2IOCTL(mdr, mdio) do {					\
	(mdio)->md_unit = (mdr)->md_unit;				\
	(mdio)->md_type = (mdr)->md_type;				\
	(mdio)->md_mediasize = (mdr)->md_mediasize;			\
	(mdio)->md_sectorsize = (mdr)->md_sectorsize;			\
	(mdio)->md_options = (mdr)->md_options;				\
	(mdio)->md_fwheads = (mdr)->md_fwheads;				\
	(mdio)->md_fwsectors = (mdr)->md_fwsectors;			\
} while(0)

static int
mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
    struct thread *td)
{
	struct md_req mdr;
	int error;

	if (md_debug)
		printf("mdctlioctl(%s %lx %p %x %p)\n",
			devtoname(dev), cmd, addr, flags, td);

	bzero(&mdr, sizeof(mdr));
	switch (cmd) {
	case MDIOCATTACH:
	case MDIOCDETACH:
	case MDIOCRESIZE:
	case MDIOCQUERY: {
		struct md_ioctl *mdio = (struct md_ioctl *)addr;

		if (mdio->md_version != MDIOVERSION)
			return (EINVAL);
		MD_IOCTL2REQ(mdio, &mdr);
		mdr.md_file = mdio->md_file;
		mdr.md_file_seg = UIO_USERSPACE;
		/* If the file is adjacent to the md_ioctl it's in kernel. */
		if ((void *)mdio->md_file == (void *)(mdio + 1))
			mdr.md_file_seg = UIO_SYSSPACE;
		mdr.md_label = mdio->md_label;
		break;
	}
#ifdef COMPAT_FREEBSD32
	case MDIOCATTACH_32:
	case MDIOCDETACH_32:
	case MDIOCRESIZE_32:
	case MDIOCQUERY_32: {
		struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;

		if (mdio->md_version != MDIOVERSION)
			return (EINVAL);
		MD_IOCTL2REQ(mdio, &mdr);
		mdr.md_file = (void *)(uintptr_t)mdio->md_file;
		mdr.md_file_seg = UIO_USERSPACE;
		mdr.md_label = (void *)(uintptr_t)mdio->md_label;
		break;
	}
#endif
	default:
		/* Fall through to handler switch. */
		break;
	}

	error = 0;
	switch (cmd) {
	case MDIOCATTACH:
#ifdef COMPAT_FREEBSD32
	case MDIOCATTACH_32:
#endif
		error = kern_mdattach(td, &mdr);
		break;
	case MDIOCDETACH:
#ifdef COMPAT_FREEBSD32
	case MDIOCDETACH_32:
#endif
		error = kern_mddetach(td, &mdr);
		break;
	case MDIOCRESIZE:
#ifdef COMPAT_FREEBSD32
	case MDIOCRESIZE_32:
#endif
		error = kern_mdresize(&mdr);
		break;
	case MDIOCQUERY:
#ifdef COMPAT_FREEBSD32
	case MDIOCQUERY_32:
#endif
		error = kern_mdquery(&mdr);
		break;
	default:
		error = ENOIOCTL;
	}

	switch (cmd) {
	case MDIOCATTACH:
	case MDIOCQUERY: {
		struct md_ioctl *mdio = (struct md_ioctl *)addr;

		MD_REQ2IOCTL(&mdr, mdio);
		break;
	}
#ifdef COMPAT_FREEBSD32
	case MDIOCATTACH_32:
	case MDIOCQUERY_32: {
		struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;

		MD_REQ2IOCTL(&mdr, mdio);
		break;
	}
#endif
	default:
		/* Other commands do not alter mdr. */
		break;
	}

	return (error);
}

static void
md_preloaded(u_char *image, size_t length, const char *name)
{
	struct md_s *sc;
	int error;

	sc = mdnew(-1, &error, MD_PRELOAD);
	if (sc == NULL)
		return;
	sc->mediasize = length;
	sc->sectorsize = DEV_BSIZE;
	sc->pl_ptr = image;
	sc->pl_len = length;
	sc->start = mdstart_preload;
	if (name != NULL)
		strlcpy(sc->file, name, sizeof(sc->file));
#ifdef MD_ROOT
	if (sc->unit == 0) {
#ifndef ROOTDEVNAME
		rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
#endif
#ifdef MD_ROOT_READONLY
		sc->flags |= MD_READONLY;
#endif
	}
#endif
	mdinit(sc);
	if (name != NULL) {
		printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
		    MD_NAME, sc->unit, name, length, image);
	} else {
		printf("%s%d: Embedded image %zd bytes at %p\n",
		    MD_NAME, sc->unit, length, image);
	}
}

static void
g_md_init(struct g_class *mp __unused)
{
	caddr_t mod;
	u_char *ptr, *name, *type;
	unsigned len;
	int i;

	/* figure out log2(NINDIR) */
	for (i = NINDIR, nshift = -1; i; nshift++)
		i >>= 1;

	mod = NULL;
	sx_init(&md_sx, "MD config lock");
	g_topology_unlock();
	md_uh = new_unrhdr(0, INT_MAX, NULL);
#ifdef MD_ROOT
	if (mfs_root_size != 0) {
		sx_xlock(&md_sx);
#ifdef MD_ROOT_MEM
		md_preloaded(mfs_root, mfs_root_size, NULL);
#else
		md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
		    NULL);
#endif
		sx_xunlock(&md_sx);
	}
#endif
	/* XXX: are preload_* static or do they need Giant ? */
	while ((mod = preload_search_next_name(mod)) != NULL) {
		name = (char *)preload_search_info(mod, MODINFO_NAME);
		if (name == NULL)
			continue;
		type = (char *)preload_search_info(mod, MODINFO_TYPE);
		if (type == NULL)
			continue;
		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
			continue;
		ptr = preload_fetch_addr(mod);
		len = preload_fetch_size(mod);
		if (ptr != NULL && len != 0) {
			sx_xlock(&md_sx);
			md_preloaded(ptr, len, name);
			sx_xunlock(&md_sx);
		}
	}
	md_pbuf_zone = pbuf_zsecond_create("mdpbuf", nswbuf / 10);
	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
	    0600, MDCTL_NAME);
	g_topology_lock();
}

static void
g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp __unused, struct g_provider *pp)
{
	struct md_s *mp;
	char *type;

	mp = gp->softc;
	if (mp == NULL)
		return;

	switch (mp->type) {
	case MD_MALLOC:
		type = "malloc";
		break;
	case MD_PRELOAD:
		type = "preload";
		break;
	case MD_VNODE:
		type = "vnode";
		break;
	case MD_SWAP:
		type = "swap";
		break;
	case MD_NULL:
		type = "null";
		break;
	default:
		type = "unknown";
		break;
	}
	if (pp != NULL) {
		if (indent == NULL) {
			sbuf_printf(sb, " u %d", mp->unit);
			sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
			sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
			sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
			sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
			sbuf_printf(sb, " t %s", type);
			if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
			    (mp->type == MD_PRELOAD && mp->file[0] != '\0'))
				sbuf_printf(sb, " file %s", mp->file);
			sbuf_printf(sb, " label %s", mp->label);
		} else {
			sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
			    mp->unit);
			sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
			    indent, (uintmax_t) mp->sectorsize);
			sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
			    indent, (uintmax_t) mp->fwheads);
			sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
			    indent, (uintmax_t) mp->fwsectors);
			if (mp->ident[0] != '\0') {
				sbuf_printf(sb, "%s<ident>", indent);
				g_conf_printf_escaped(sb, "%s", mp->ident);
				sbuf_printf(sb, "</ident>\n");
			}
			sbuf_printf(sb, "%s<length>%ju</length>\n",
			    indent, (uintmax_t) mp->mediasize);
			sbuf_printf(sb, "%s<compression>%s</compression>\n",
			    indent,
			    (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
			sbuf_printf(sb, "%s<access>%s</access>\n", indent,
			    (mp->flags & MD_READONLY) == 0 ?
"read-write": "read-only"); sbuf_printf(sb, "%s%s\n", indent, type); if ((mp->type == MD_VNODE && mp->vnode != NULL) || (mp->type == MD_PRELOAD && mp->file[0] != '\0')) { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", mp->file); sbuf_printf(sb, "\n"); } if (mp->type == MD_VNODE) sbuf_printf(sb, "%s%s\n", indent, (mp->flags & MD_CACHE) == 0 ? "off": "on"); sbuf_printf(sb, "%s\n"); } } } static void g_md_fini(struct g_class *mp __unused) { sx_destroy(&md_sx); if (status_dev != NULL) destroy_dev(status_dev); uma_zdestroy(md_pbuf_zone); delete_unrhdr(md_uh); } diff --git a/sys/sys/mdioctl.h b/sys/sys/mdioctl.h index 9e0b0c995341..97464c11149b 100644 --- a/sys/sys/mdioctl.h +++ b/sys/sys/mdioctl.h @@ -1,96 +1,97 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 University of Utah. * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: fdioctl.h 1.1 90/07/09$ * * @(#)vnioctl.h 8.1 (Berkeley) 6/10/93 * * From: src/sys/sys/vnioctl.h,v 1.4 * * $FreeBSD$ */ #ifndef _SYS_MDIOCTL_H_ #define _SYS_MDIOCTL_H_ enum md_types {MD_MALLOC, MD_PRELOAD, MD_VNODE, MD_SWAP, MD_NULL}; /* * Ioctl definitions for memory disk pseudo-device. */ #define MDNPAD 96 struct md_ioctl { unsigned md_version; /* Structure layout version */ unsigned md_unit; /* unit number */ enum md_types md_type ; /* type of disk */ char *md_file; /* pathname of file to mount */ off_t md_mediasize; /* size of disk in bytes */ unsigned md_sectorsize; /* sectorsize */ unsigned md_options; /* options */ u_int64_t md_base; /* base address */ int md_fwheads; /* firmware heads */ int md_fwsectors; /* firmware sectors */ char *md_label; /* label of the device */ int md_pad[MDNPAD]; /* padding */ }; #define MD_NAME "md" #define MDCTL_NAME "mdctl" #define MDIOVERSION 0 /* * Before you can use a unit, it must be configured with MDIOCSET. 
 * The configuration persists across opens and closes of the device;
 * an MDIOCDETACH must be used to reset a configuration.  An attempt to
 * MDIOCATTACH an already active unit will return EBUSY.
 */

#define MDIOCATTACH	_IOWR('m', 0, struct md_ioctl)	/* attach disk */
#define MDIOCDETACH	_IOWR('m', 1, struct md_ioctl)	/* detach disk */
#define MDIOCQUERY	_IOWR('m', 2, struct md_ioctl)	/* query status */
#define MDIOCRESIZE	_IOWR('m', 4, struct md_ioctl)	/* resize disk */

#define MD_CLUSTER	0x01	/* Don't cluster */
#define MD_RESERVE	0x02	/* Pre-reserve swap */
#define MD_AUTOUNIT	0x04	/* Assign next free unit */
#define MD_READONLY	0x08	/* Readonly mode */
#define MD_COMPRESS	0x10	/* Compression mode */
#define MD_FORCE	0x20	/* Don't try to prevent foot-shooting */
#define MD_ASYNC	0x40	/* Asynchronous mode */
#define MD_VERIFY	0x80	/* Open file with O_VERIFY (vnode only) */
#define MD_CACHE	0x100	/* Cache vnode data */
+#define MD_MUSTDEALLOC	0x200	/* BIO_DELETE only if dealloc is available */

#endif	/* _SYS_MDIOCTL_H_*/
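A minimal userland sketch (illustrative, not part of the patch) exercising the
new flag: it attaches a vnode-backed memory disk with MD_MUSTDEALLOC set, so
the resulting device advertises GEOM::candelete only when the backing file
system reports _PC_DEALLOC_PRESENT.  The backing file path is an assumption
and the file must already exist:

#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/mdioctl.h>
#include <sys/stat.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct md_ioctl mdio;
	struct stat sb;
	char path[] = "/var/tmp/md.img";	/* assumed backing store */
	int fd;

	if (stat(path, &sb) < 0)
		err(1, "stat(%s)", path);
	fd = open("/dev/" MDCTL_NAME, O_RDWR, 0);
	if (fd < 0)
		err(1, "open(/dev/%s)", MDCTL_NAME);
	memset(&mdio, 0, sizeof(mdio));
	mdio.md_version = MDIOVERSION;
	mdio.md_type = MD_VNODE;
	mdio.md_file = path;
	mdio.md_mediasize = sb.st_size;	/* size taken from the backing file */
	mdio.md_options = MD_AUTOUNIT | MD_MUSTDEALLOC;
	if (ioctl(fd, MDIOCATTACH, &mdio) < 0)
		err(1, "MDIOCATTACH");
	printf("attached md%u\n", mdio.md_unit);
	return (0);
}

The equivalent with the mdconfig(8) front end would presumably grow a matching
option; the raw ioctl is shown here because the header above defines it
directly.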