Index: head/sys/kern/vfs_mount.c =================================================================== --- head/sys/kern/vfs_mount.c (revision 186196) +++ head/sys/kern/vfs_mount.c (revision 186197) @@ -1,2323 +1,2306 @@ /*- * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_rootdevname.h" #include "opt_mac.h" #define ROOTNAME "root_device" #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, int fsflags, void *fsdata); static int vfs_mountroot_ask(void); static int vfs_mountroot_try(const char *mountfrom); static void free_mntarg(struct mntarg *ma); static int vfs_getopt_pos(struct vfsoptlist *opts, const char *name); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static uma_zone_t mount_zone; /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; }; /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) 
*/ struct vnode *rootvnode; /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store */ /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "ro", "rw", "nosuid", "noexec", NULL }; /* * The root specifiers we will try if RB_CDROM is specified. */ static char *cdrom_rootdevnames[] = { "cd9660:cd0", "cd9660:acd0", NULL }; /* legacy find-root code */ char *rootdevnames[2] = {NULL, NULL}; #ifndef ROOTDEVNAME # define ROOTDEVNAME NULL #endif static const char *ctrootdevname = ROOTDEVNAME; /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); #ifdef INVARIANTS else if (opt->len != 0) panic("%s: mount option with NULL value but length != 0", __func__); #endif free(opt, M_MOUNT); } /* Release all resources related to the mount options. */ void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } void vfs_deleteopt(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt, *temp; if (opts == NULL) return; TAILQ_FOREACH_SAFE(opt, opts, link, temp) { if (strcmp(opt->name, name) == 0) vfs_freeopt(opts, opt); } } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. 
"opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. "noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ static int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused; unsigned int i, iovcnt; int error, namelen, optlen; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. 
*/ if (opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. * * XXX This function will keep a "nofoo" option in the * new options if there is no matching "foo" option * to be cancelled in the old options. This is a bug * if the option's canonical name is "foo". E.g., "noro" * shouldn't end up in the mount point's active options, * but it can. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *new; TAILQ_FOREACH(opt, opts, link) { /* * Check that this option hasn't been redefined * nor cancelled with a "no" mount option. */ opt2 = TAILQ_FIRST(toopts); while (opt2 != NULL) { if (strcmp(opt2->name, opt->name) == 0) goto next; if (strncmp(opt2->name, "no", 2) == 0 && strcmp(opt2->name + 2, opt->name) == 0) { vfs_freeopt(toopts, opt2); goto next; } opt2 = TAILQ_NEXT(opt2, link); } /* We want this option, duplicate it. */ new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = malloc(strlen(opt->name) + 1, M_MOUNT, M_WAITOK); strcpy(new->name, opt->name); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else { new->value = NULL; } new->len = opt->len; TAILQ_INSERT_TAIL(toopts, new, link); next: continue; } } /* * Mount a filesystem. 
*/ int nmount(td, uap) struct thread *td; struct nmount_args /* { struct iovec *iovp; unsigned int iovcnt; int flags; } */ *uap; { struct uio *auio; struct iovec *iov; unsigned int i; int error; u_int iovcnt; AUDIT_ARG(fflags, uap->flags); /* * Filter out MNT_ROOTFS. We do not want clients of nmount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try(). */ uap->flags &= ~MNT_ROOTFS; iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) return (EINVAL); error = copyinuio(uap->iovp, iovcnt, &auio); if (error) return (error); iov = auio->uio_iov; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > MMAXOPTIONLEN) { free(auio, M_IOV); return (EINVAL); } iov++; } error = vfs_donmount(td, uap->flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ void vfs_ref(struct mount *mp) { MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); } void vfs_rel(struct mount *mp) { MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); } static int mount_init(void *mem, int size, int flags) { struct mount *mp; mp = (struct mount *)mem; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); return (0); } static void mount_fini(void *mem, int size) { struct mount *mp; mp = (struct mount *)mem; lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_mtx); } /* * Allocate and initialize the mount point struct. 
*/ struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct ucred *cred) { struct mount *mp; mp = uma_zalloc(mount_zone, M_WAITOK); bzero(&mp->mnt_startzero, __rangeof(struct mount, mnt_startzero, mnt_endzero)); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; mp->mnt_ref = 0; (void) vfs_busy(mp, MBF_NOWAIT); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; vfsp->vfc_refcount++; /* XXX Unlocked */ mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_gen++; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(cred); mp->mnt_stat.f_owner = cred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_mount_init(mp); mac_mount_create(cred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). */ void vfs_mount_destroy(struct mount *mp) { MNT_ILOCK(mp); + mp->mnt_kern_flag |= MNTK_REFEXPIRE; + if (mp->mnt_kern_flag & MNTK_MWAIT) { + mp->mnt_kern_flag &= ~MNTK_MWAIT; + wakeup(mp); + } while (mp->mnt_ref) msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); - if (mp->mnt_writeopcount > 0) { - printf("Waiting for mount point write ops\n"); - while (mp->mnt_writeopcount > 0) { - mp->mnt_kern_flag |= MNTK_SUSPEND; - msleep(&mp->mnt_writeopcount, - MNT_MTX(mp), - PZERO, "mntdestroy2", 0); - } - printf("mount point write ops completed\n"); - } - if (mp->mnt_secondary_writes > 0) { - printf("Waiting for mount point secondary write ops\n"); - while (mp->mnt_secondary_writes > 0) { - mp->mnt_kern_flag |= MNTK_SUSPEND; - msleep(&mp->mnt_secondary_writes, - MNT_MTX(mp), - PZERO, "mntdestroy3", 0); - } - printf("mount point secondary write ops completed\n"); - } - MNT_IUNLOCK(mp); + KASSERT(mp->mnt_ref == 0, + ("%s: invalid refcount in the drain path @ %s:%d", __func__, + __FILE__, __LINE__)); + if (mp->mnt_writeopcount != 0) + 
panic("vfs_mount_destroy: nonzero writeopcount"); + if (mp->mnt_secondary_writes != 0) + panic("vfs_mount_destroy: nonzero secondary_writes"); mp->mnt_vfc->vfc_refcount--; if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) vprint("", vp); panic("unmount: dangling vnode"); } - MNT_ILOCK(mp); - if (mp->mnt_kern_flag & MNTK_MWAIT) - wakeup(mp); - if (mp->mnt_writeopcount != 0) - panic("vfs_mount_destroy: nonzero writeopcount"); - if (mp->mnt_secondary_writes != 0) - panic("vfs_mount_destroy: nonzero secondary_writes"); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); - mp->mnt_writeopcount = -1000; - mp->mnt_nvnodelistsize = -1000; - mp->mnt_secondary_writes = -1000; + if (mp->mnt_lockref != 0) + panic("vfs_mount_destroy: nonzero lock refcount"); MNT_IUNLOCK(mp); #ifdef MAC mac_mount_destroy(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } int vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; struct vfsopt *opt, *noro_opt, *tmp_opt; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; int has_rw, has_noro; errmsg = fspath = NULL; errmsg_len = has_noro = has_rw = fspathlen = 0; errmsg_pos = -1; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. 
*/ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. */ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { if (strcmp(opt->name, "update") == 0) { fsflags |= MNT_UPDATE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) { fsflags |= MNT_FORCE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "reload") == 0) { fsflags |= MNT_RELOAD; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; else if (strcmp(opt->name, "noasync") == 0) fsflags &= ~MNT_ASYNC; else if (strcmp(opt->name, "noatime") == 0) fsflags |= MNT_NOATIME; else if (strcmp(opt->name, "atime") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoatime", M_MOUNT); } else if (strcmp(opt->name, "noclusterr") == 0) fsflags |= MNT_NOCLUSTERR; else if (strcmp(opt->name, "clusterr") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterr", M_MOUNT); } else if (strcmp(opt->name, "noclusterw") == 0) fsflags |= MNT_NOCLUSTERW; else if (strcmp(opt->name, "clusterw") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterw", M_MOUNT); } else if (strcmp(opt->name, "noexec") == 0) fsflags |= MNT_NOEXEC; else if (strcmp(opt->name, "exec") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoexec", M_MOUNT); } else if (strcmp(opt->name, "nosuid") == 
0) fsflags |= MNT_NOSUID; else if (strcmp(opt->name, "suid") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosuid", M_MOUNT); } else if (strcmp(opt->name, "nosymfollow") == 0) fsflags |= MNT_NOSYMFOLLOW; else if (strcmp(opt->name, "symfollow") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosymfollow", M_MOUNT); } else if (strcmp(opt->name, "noro") == 0) { fsflags &= ~MNT_RDONLY; has_noro = 1; } else if (strcmp(opt->name, "rw") == 0) { fsflags &= ~MNT_RDONLY; has_rw = 1; } else if (strcmp(opt->name, "ro") == 0) fsflags |= MNT_RDONLY; else if (strcmp(opt->name, "rdonly") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("ro", M_MOUNT); fsflags |= MNT_RDONLY; } else if (strcmp(opt->name, "suiddir") == 0) fsflags |= MNT_SUIDDIR; else if (strcmp(opt->name, "sync") == 0) fsflags |= MNT_SYNCHRONOUS; else if (strcmp(opt->name, "union") == 0) fsflags |= MNT_UNION; } /* * If "rw" was specified as a mount option, and we * are trying to update a mount-point from "ro" to "rw", * we need a mount option "noro", since in vfs_mergeopts(), * "noro" will cancel "ro", but "rw" will not do anything. */ if (has_rw && !has_noro) { noro_opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); noro_opt->name = strdup("noro", M_MOUNT); noro_opt->value = NULL; noro_opt->len = 0; TAILQ_INSERT_TAIL(optlist, noro_opt, link); } /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. 
*/ if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) { error = ENAMETOOLONG; goto bail; } mtx_lock(&Giant); error = vfs_domount(td, fstype, fspath, fsflags, optlist); mtx_unlock(&Giant); bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { bcopy(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copyout(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } } if (error != 0) vfs_freeopts(optlist); return (error); } /* * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int mount(td, uap) struct thread *td; struct mount_args /* { char *type; char *path; int flags; caddr_t data; } */ *uap; { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; int error; AUDIT_ARG(fflags, uap->flags); /* * Filter out MNT_ROOTFS. We do not want clients of mount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try(). 
*/ uap->flags &= ~MNT_ROOTFS; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (error) { free(fstype, M_TEMP); return (error); } AUDIT_ARG(text, fstype); mtx_lock(&Giant); vfsp = vfs_byname_kld(fstype, td, &error); free(fstype, M_TEMP); if (vfsp == NULL) { mtx_unlock(&Giant); return (ENOENT); } if (vfsp->vfc_vfsops->vfs_cmount == NULL) { mtx_unlock(&Giant); return (EOPNOTSUPP); } ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec"); error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td); mtx_unlock(&Giant); return (error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Calling thread. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ int fsflags, /* Flags common to all filesystems. */ void *fsdata /* Options local to the filesystem. */ ) { struct vnode *vp; struct mount *mp; struct vfsconf *vfsp; struct oexport_args oexport; struct export_args export; int error, flag = 0; struct vattr va; struct nameidata nd; mtx_assert(&Giant, MA_OWNED); /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred) || usermount == 0) { if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. 
*/ if (fsflags & MNT_EXPORTED) { error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); if (error) return (error); } if (fsflags & MNT_SUIDDIR) { error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. */ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) fsflags |= MNT_NOSUID | MNT_USER; } /* Load KLDs before we lock the covered vnode to avoid reversals. */ vfsp = NULL; if ((fsflags & MNT_UPDATE) == 0) { /* Don't try to load KLDs if we're mounting the root. */ if (fsflags & MNT_ROOTFS) vfsp = vfs_byname(fstype); else vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL)) return (EPERM); } /* * Get vnode to be covered */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, fspath, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (fsflags & MNT_UPDATE) { if ((vp->v_vflag & VV_ROOT) == 0) { vput(vp); return (EINVAL); } mp = vp->v_mount; MNT_ILOCK(mp); flag = mp->mnt_flag; /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ if ((fsflags & MNT_RELOAD) && ((mp->mnt_flag & MNT_RDONLY) == 0)) { MNT_IUNLOCK(mp); vput(vp); return (EOPNOTSUPP); /* Needs translation */ } MNT_IUNLOCK(mp); /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. 
*/ error = vfs_suser(mp, td); if (error) { vput(vp); return (error); } if (vfs_busy(mp, MBF_NOWAIT)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); MNT_ILOCK(mp); mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS); MNT_IUNLOCK(mp); VOP_UNLOCK(vp, 0); mp->mnt_optnew = fsdata; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); } else { /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_ucred); if (error) { vput(vp); return (error); } if (va.va_uid != td->td_ucred->cr_uid) { error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0); if (error) { vput(vp); return (error); } } error = vinvalbuf(vp, V_SAVE, 0, 0); if (error != 0) { vput(vp); return (error); } if (vp->v_type != VDIR) { vput(vp); return (ENOTDIR); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); /* * Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); VOP_UNLOCK(vp, 0); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = fsdata; } /* * Set the mount level flags. */ MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & ~MNT_UPDATEMASK) | (fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS | MNT_RDONLY)); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp, td); /* * Process the export option only if we are * updating mount options. 
*/ if (!error && (fsflags & MNT_UPDATE)) { if (vfs_copyopt(mp->mnt_optnew, "export", &export, sizeof(export)) == 0) error = vfs_export(mp, &export); else if (vfs_copyopt(mp->mnt_optnew, "export", &oexport, sizeof(oexport)) == 0) { export.ex_flags = oexport.ex_flags; export.ex_root = oexport.ex_root; export.ex_anon = oexport.ex_anon; export.ex_addr = oexport.ex_addr; export.ex_addrlen = oexport.ex_addrlen; export.ex_mask = oexport.ex_mask; export.ex_masklen = oexport.ex_masklen; export.ex_indexfile = oexport.ex_indexfile; export.ex_numsecflavors = 0; error = vfs_export(mp, &export); } } if (!error) { if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; (void)VFS_STATFS(mp, &mp->mnt_stat, td); } /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if (mp->mnt_flag & MNT_UPDATE) { MNT_ILOCK(mp); if (error) mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); else mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0) { if (mp->mnt_syncer == NULL) error = vfs_allocate_syncvnode(mp); } else { if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); mp->mnt_syncer = NULL; } vfs_unbusy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error); } MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * Put the new filesystem on the mount list after root. 
*/ cache_purge(vp); if (!error) { struct vnode *newdp; VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp, td)) panic("mount: lost mount"); mountcheckdirs(vp, newdp); vput(newdp); VOP_UNLOCK(vp, 0); if ((mp->mnt_flag & MNT_RDONLY) == 0) error = vfs_allocate_syncvnode(mp); vfs_unbusy(mp); if (error) vrele(vp); } else { VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vfs_unbusy(mp); vfs_mount_destroy(mp); vput(vp); } return (error); } /* * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, not * special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int unmount(td, uap) struct thread *td; register struct unmount_args /* { char *path; int flags; } */ *uap; { struct mount *mp; char *pathbuf; int error, id0, id1; if (jailed(td->td_ucred) || usermount == 0) { error = priv_check(td, PRIV_VFS_UNMOUNT); if (error) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } AUDIT_ARG(upath, td, pathbuf, ARG_UPATH1); mtx_lock(&Giant); if (uap->flags & MNT_BYFSID) { /* Decode the filesystem ID. 
*/ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { mtx_unlock(&Giant); free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) break; } mtx_unlock(&mountlist_mtx); } else { mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) break; } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ mtx_unlock(&Giant); return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) { mtx_unlock(&Giant); return (EINVAL); } error = dounmount(mp, uap->flags, td); mtx_unlock(&Giant); return (error); } /* * Do the actual filesystem unmount. */ int dounmount(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct vnode *coveredvp, *fsrootvp; int error; int async_flag; int mnt_gen_r; mtx_assert(&Giant, MA_OWNED); if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); vholdl(coveredvp); vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); vdrop(coveredvp); /* * Check for mp being unmounted while waiting for the * covered vnode lock. */ if (coveredvp->v_mountedhere != mp || coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { VOP_UNLOCK(coveredvp, 0); return (EBUSY); } } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. 
*/ error = vfs_suser(mp, td); if (error) { if (coveredvp) VOP_UNLOCK(coveredvp, 0); return (error); } MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_UNMOUNT) { MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ; /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) mp->mnt_kern_flag |= MNTK_UNMOUNTF; error = 0; if (mp->mnt_lockref) { if ((flags & MNT_FORCE) == 0) { mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ | MNTK_UNMOUNTF); if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0); return (EBUSY); } mp->mnt_kern_flag |= MNTK_DRAINING; error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, "mount drain", 0); } MNT_IUNLOCK(mp); KASSERT(mp->mnt_lockref == 0, ("%s: invalid lock refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); KASSERT(error == 0, ("%s: invalid return value for msleep in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); vn_start_write(NULL, &mp, V_WAIT); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_msync(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); cache_purgevfs(mp); /* remove cache entries for this file sys */ if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); /* * For forced unmounts, move process cdir/rdir refs on the fs root * vnode to the covered vnode. For non-forced unmounts we want * such references to cause an EBUSY error. 
*/ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) { if (mp->mnt_vnodecovered != NULL) mountcheckdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT, td)) == 0) || (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags, td); } vn_finished_write(mp); /* * If we failed to flush the dirty blocks for this mount point, * undo all the cdir/rdir and rootvnode changes we made above. * Unless we failed to do so because the device is reporting that * it doesn't exist anymore. */ if (error && error != ENXIO) { if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) { if (mp->mnt_vnodecovered != NULL) mountcheckdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ; if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) { MNT_IUNLOCK(mp); (void) vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); } mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); if (coveredvp != NULL) { coveredvp->v_mountedhere = NULL; vput(coveredvp); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); vfs_mount_destroy(mp); return (0); } /* * --------------------------------------------------------------------- * Mounting of root filesystem * */ struct root_hold_token { const char *who; LIST_ENTRY(root_hold_token) list; }; static LIST_HEAD(, root_hold_token) root_holds = LIST_HEAD_INITIALIZER(&root_holds); static int root_mount_complete; /* * Hold 
root mount. */
/*
 * Register a hold on the root mount.
 *
 * Allocates a zeroed token (sleeping allocation), records the caller's
 * identifier pointer in it (the string is not copied, so it must outlive
 * the token) and links it at the head of root_holds under mountlist_mtx.
 * The returned token is what root_mount_rel() expects.
 */
struct root_hold_token *
root_mount_hold(const char *identifier)
{
	struct root_hold_token *h;

	h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
	h->who = identifier;
	mtx_lock(&mountlist_mtx);
	LIST_INSERT_HEAD(&root_holds, h, list);
	mtx_unlock(&mountlist_mtx);
	return (h);
}

/*
 * Release root mount.
 *
 * Unlinks the token from root_holds, wakes any thread sleeping on
 * &root_holds (see root_mount_prepare()) and frees the token.
 */
void
root_mount_rel(struct root_hold_token *h)
{

	mtx_lock(&mountlist_mtx);
	LIST_REMOVE(h, list);
	wakeup(&root_holds);
	mtx_unlock(&mountlist_mtx);
	free(h, M_DEVBUF);
}

/*
 * Wait for all subsystems to release root mount.
 *
 * Each pass first calls g_waitidle() with Giant dropped, then checks
 * root_holds under mountlist_mtx.  While holds remain, their identifiers
 * are printed and the thread sleeps on &root_holds for up to hz ticks.
 */
static void
root_mount_prepare(void)
{
	struct root_hold_token *h;

	for (;;) {
		DROP_GIANT();
		g_waitidle();
		PICKUP_GIANT();
		mtx_lock(&mountlist_mtx);
		if (LIST_EMPTY(&root_holds)) {
			mtx_unlock(&mountlist_mtx);
			break;
		}
		printf("Root mount waiting for:");
		LIST_FOREACH(h, &root_holds, list)
			printf(" %s", h->who);
		printf("\n");
		/*
		 * PDROP is passed, and the next pass takes mountlist_mtx
		 * again, so the mutex is not expected to be held when
		 * msleep() returns.
		 */
		msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold",
		    hz);
	}
}

/*
 * Root was mounted, share the good news.
 *
 * Sets root_mount_complete and wakes threads sleeping in
 * root_mount_wait().
 */
static void
root_mount_done(void)
{

	/*
	 * Use a mutex to prevent the wakeup being missed and waiting for
	 * an extra 1 second sleep.
	 */
	mtx_lock(&mountlist_mtx);
	root_mount_complete = 1;
	wakeup(&root_mount_complete);
	mtx_unlock(&mountlist_mtx);
}

/*
 * Return true if root is already mounted.
 */
int
root_mounted(void)
{

	/* No mutex is acquired here because int stores are atomic. */
	return (root_mount_complete);
}

/*
 * Wait until root is mounted.
 *
 * Sleeps on &root_mount_complete (re-checking at least every hz ticks)
 * until root_mount_done() has set the flag.
 */
void
root_mount_wait(void)
{

	/*
	 * Panic on an obvious deadlock - the function can't be called from
	 * a thread which is doing the whole SYSINIT stuff.
	 */
	KASSERT(curthread->td_proc->p_pid != 0,
	    ("root_mount_wait: cannot be called from the swapper thread"));
	mtx_lock(&mountlist_mtx);
	while (!root_mount_complete) {
		msleep(&root_mount_complete, &mountlist_mtx, PZERO, "rootwait",
		    hz);
	}
	mtx_unlock(&mountlist_mtx);
}

/*
 * Make the root of the first mountlist entry the global rootvnode.
 *
 * Panics if VFS_ROOT() fails.  The current and root directories of td's
 * process are both pointed at rootvnode (taking one reference for each
 * and releasing whatever they referenced before), rootvnode is unlocked
 * and the mountroot event handlers are invoked.
 */
static void
set_rootvnode(struct thread *td)
{
	struct proc *p;

	if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode, td))
		panic("Cannot find root vnode");

	p = td->td_proc;
	FILEDESC_XLOCK(p->p_fd);

	if (p->p_fd->fd_cdir != NULL)
		vrele(p->p_fd->fd_cdir);
	p->p_fd->fd_cdir = rootvnode;
	VREF(rootvnode);

	if (p->p_fd->fd_rdir != NULL)
		vrele(p->p_fd->fd_rdir);
	p->p_fd->fd_rdir = rootvnode;
	VREF(rootvnode);

	FILEDESC_XUNLOCK(p->p_fd);

	/* VFS_ROOT() was asked for LK_EXCLUSIVE above; drop that lock. */
	VOP_UNLOCK(rootvnode, 0);

	EVENTHANDLER_INVOKE(mountroot);
}

/*
 * Mount /devfs as our root filesystem, but do not put it on the mountlist
 * yet.  Create a /dev -> / symlink so that absolute pathnames will lookup.
 *
 * NOTE(review): the code below does insert the new mount at the head of
 * mountlist before calling set_rootvnode(); the "not on the mountlist yet"
 * wording above looks stale - confirm.
 */
static void
devfs_first(void)
{
	struct thread *td = curthread;
	struct vfsoptlist *opts;
	struct vfsconf *vfsp;
	struct mount *mp = NULL;
	int error;

	vfsp = vfs_byname("devfs");
	KASSERT(vfsp != NULL, ("Could not find devfs by name"));
	if (vfsp == NULL)
		return;

	mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);

	error = VFS_MOUNT(mp, td);
	KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
	if (error)
		return;

	/* Give the mount an empty option list. */
	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
	TAILQ_INIT(opts);
	mp->mnt_opt = opts;

	mtx_lock(&mountlist_mtx);
	TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
	mtx_unlock(&mountlist_mtx);

	set_rootvnode(td);

	/* A failure here is only reported, not propagated. */
	error = kern_symlink(td, "/", "dev", UIO_SYSSPACE);
	if (error)
		printf("kern_symlink /dev -> / returns %d\n", error);
}

/*
 * Surgically move our devfs to be mounted on /dev.
*/ static void devfs_fixup(struct thread *td) { struct nameidata nd; int error; struct vnode *vp, *dvp; struct mount *mp; /* Remove our devfs mount from the mountlist and purge the cache */ mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); cache_purgevfs(mp); VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td); VI_LOCK(dvp); dvp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(dvp); dvp->v_mountedhere = NULL; /* Set up the real rootvnode, and purge the cache */ TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL; set_rootvnode(td); cache_purgevfs(rootvnode->v_mount); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td); error = namei(&nd); if (error) { printf("Lookup of /dev for devfs, error: %d\n", error); return; } NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type != VDIR) { vput(vp); } error = vinvalbuf(vp, V_SAVE, 0, 0); if (error) { vput(vp); } cache_purge(vp); mp->mnt_vnodecovered = vp; vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); VOP_UNLOCK(vp, 0); vput(dvp); vfs_unbusy(mp); /* Unlink the no longer needed /dev/dev -> / symlink */ kern_unlink(td, "/dev/dev", UIO_SYSSPACE); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * Find and mount the root filesystem */ void vfs_mountroot(void) { char *cp; int error, i, asked = 0; root_mount_prepare(); mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); devfs_first(); /* * We are booted with instructions to prompt for the root filesystem. 
*/ if (boothowto & RB_ASKNAME) { if (!vfs_mountroot_ask()) goto mounted; asked = 1; } /* * The root filesystem information is compiled in, and we are * booted with instructions to use it. */ if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) { if (!vfs_mountroot_try(ctrootdevname)) goto mounted; ctrootdevname = NULL; } /* * We've been given the generic "use CDROM as root" flag. This is * necessary because one media may be used in many different * devices, so we need to search for them. */ if (boothowto & RB_CDROM) { for (i = 0; cdrom_rootdevnames[i] != NULL; i++) { if (!vfs_mountroot_try(cdrom_rootdevnames[i])) goto mounted; } } /* * Try to use the value read by the loader from /etc/fstab, or * supplied via some other means. This is the preferred * mechanism. */ cp = getenv("vfs.root.mountfrom"); if (cp != NULL) { error = vfs_mountroot_try(cp); freeenv(cp); if (!error) goto mounted; } /* * Try values that may have been computed by code during boot */ if (!vfs_mountroot_try(rootdevnames[0])) goto mounted; if (!vfs_mountroot_try(rootdevnames[1])) goto mounted; /* * If we (still) have a compiled-in default, try it. */ if (ctrootdevname != NULL) if (!vfs_mountroot_try(ctrootdevname)) goto mounted; /* * Everything so far has failed, prompt on the console if we haven't * already tried that. */ if (!asked) if (!vfs_mountroot_ask()) goto mounted; panic("Root mount failed, startup aborted."); mounted: root_mount_done(); } /* * Mount (mountfrom) as the root filesystem. 
*/ static int vfs_mountroot_try(const char *mountfrom) { struct mount *mp; char *vfsname, *path; time_t timebase; int error; char patt[32]; vfsname = NULL; path = NULL; mp = NULL; error = EINVAL; if (mountfrom == NULL) return (error); /* don't complain */ printf("Trying to mount root from %s\n", mountfrom); /* parse vfs name and path */ vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK); path = malloc(MNAMELEN, M_MOUNT, M_WAITOK); vfsname[0] = path[0] = 0; sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN); if (sscanf(mountfrom, patt, vfsname, path) < 1) goto out; if (path[0] == '\0') strcpy(path, ROOTNAME); error = kernel_vmount( MNT_RDONLY | MNT_ROOTFS, "fstype", vfsname, "fspath", "/", "from", path, NULL); if (error == 0) { /* * We mount devfs prior to mounting the / FS, so the first * entry will typically be devfs. */ mp = TAILQ_FIRST(&mountlist); KASSERT(mp != NULL, ("%s: mountlist is empty", __func__)); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Typically devfs has no time stamp and the only other FS * is the actual / FS. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; do { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } while (mp != NULL); inittodr(timebase); devfs_fixup(curthread); } out: free(path, M_MOUNT); free(vfsname, M_MOUNT); return (error); } /* * --------------------------------------------------------------------- * Interactive root filesystem selection code. */ static int vfs_mountroot_ask(void) { char name[128]; for(;;) { printf("\nManual root filesystem specification:\n"); printf(" : Mount using filesystem \n"); #if defined(__amd64__) || defined(__i386__) || defined(__ia64__) printf(" eg. ufs:da0s1a\n"); #else printf(" eg. ufs:/dev/da0a\n"); #endif printf(" ? 
List valid disk boot devices\n"); printf(" Abort manual input\n"); printf("\nmountroot> "); gets(name, sizeof(name), 1); if (name[0] == '\0') return (1); if (name[0] == '?') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (!vfs_mountroot_try(name)) return (0); } } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. */ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; char errmsg[255]; const char **t, *p, *q; int ret = 0; TAILQ_FOREACH(opt, opts, link) { p = opt->name; q = NULL; if (p[0] == 'n' && p[1] == 'o') q = p + 2; for(t = global_opts; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; for(t = legal; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; snprintf(errmsg, sizeof(errmsg), "mount option <%s> is unknown", p); printf("%s\n", errmsg); ret = EINVAL; } if (ret != 0) { TAILQ_FOREACH(opt, opts, link) { if (strcmp(opt->name, "errmsg") == 0) { strncpy((char *)opt->value, errmsg, opt->len); } } } return (ret); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. 
*/ int vfs_getopt(opts, name, buf, len) struct vfsoptlist *opts; const char *name; void **buf; int *len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } static int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; int i; if (opts == NULL) return (-1); i = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) return (i); ++i; } return (-1); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; if (((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } *error = ENOENT; return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) { va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; if (opt->len == 0 || opt->value == NULL) return (0); if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. 
*/
int
vfs_copyopt(opts, name, dest, len)
	struct vfsoptlist *opts;
	const char *name;
	void *dest;
	int len;
{
	struct vfsopt *opt;

	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));

	TAILQ_FOREACH(opt, opts, link) {
		if (strcmp(name, opt->name) == 0) {
			/* Only an exact size match is accepted. */
			if (len != opt->len)
				return (EINVAL);
			bcopy(opt->value, dest, opt->len);
			return (0);
		}
	}
	return (ENOENT);
}

/*
 * This is a helper function for filesystems to traverse their
 * vnodes.  See MNT_VNODE_FOREACH() in sys/mount.h
 *
 * Called with the mount interlock held.  *mvp is the marker vnode that
 * records the iteration position; the function returns the next
 * non-marker vnode after it and moves the marker behind that vnode.  When
 * the list is exhausted the marker is freed and NULL is returned.
 */
struct vnode *
__mnt_vnode_next(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	/*
	 * When the marker's v_yield counter reaches 500, reset it and
	 * yield with the interlock dropped.
	 */
	if ((*mvp)->v_yield++ == 500) {
		MNT_IUNLOCK(mp);
		(*mvp)->v_yield = 0;
		uio_yield();
		MNT_ILOCK(mp);
	}
	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
	/* Skip markers (v_type == VMARKER) on the list. */
	while (vp != NULL && vp->v_type == VMARKER)
		vp = TAILQ_NEXT(vp, v_nmntvnodes);

	/* Check if we are done */
	if (vp == NULL) {
		__mnt_vnode_markerfree(mvp, mp);
		return (NULL);
	}
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	return (vp);
}

/*
 * Start a traversal of mp's vnode list.
 *
 * Called with the mount interlock held.  Returns the first non-marker
 * vnode and stores a freshly allocated marker, linked in after it, in
 * *mvp.  If the list has no such vnode, *mvp is set to NULL and NULL is
 * returned.  A mount reference is taken for the lifetime of the marker.
 */
struct vnode *
__mnt_vnode_first(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
	while (vp != NULL && vp->v_type == VMARKER)
		vp = TAILQ_NEXT(vp, v_nmntvnodes);

	/* Check if we are done */
	if (vp == NULL) {
		*mvp = NULL;
		return (NULL);
	}
	MNT_REF(mp);
	/* The interlock is dropped around the M_WAITOK allocation. */
	MNT_IUNLOCK(mp);
	*mvp = (struct vnode *) malloc(sizeof(struct vnode),
				       M_VNODE_MARKER,
				       M_WAITOK | M_ZERO);
	MNT_ILOCK(mp);
	(*mvp)->v_type = VMARKER;

	/* The list may have changed while unlocked; search again. */
	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
	while (vp != NULL && vp->v_type == VMARKER)
		vp = TAILQ_NEXT(vp, v_nmntvnodes);

	/* Check if we are done */
	if (vp == NULL) {
		MNT_IUNLOCK(mp);
		free(*mvp, M_VNODE_MARKER);
		MNT_ILOCK(mp);
		*mvp = NULL;
		MNT_REL(mp);
		return (NULL);
	}
	(*mvp)->v_mount = mp;
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	return (vp);
}


/*
 * Tear down a traversal marker: unlink it from mp's vnode list, free it,
 * clear *mvp and drop the mount reference taken by __mnt_vnode_first().
 * A NULL *mvp is a no-op.  The interlock is held on entry and on return
 * but is dropped around free().
 */
void
__mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
{

	if (*mvp == NULL)
		return;

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	MNT_IUNLOCK(mp);
	free(*mvp, M_VNODE_MARKER);
	MNT_ILOCK(mp);
	*mvp = NULL;
	MNT_REL(mp);
}


/*
 * Call the filesystem's vfs_statfs method to refresh mp->mnt_stat and,
 * if sbp points elsewhere, copy the result to *sbp.  The copy is made
 * even when the method reports an error; the error is returned.
 */
int
__vfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
{
	int error;

	error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat, td);
	if (sbp != &mp->mnt_stat)
		*sbp = mp->mnt_stat;
	return (error);
}

/*
 * Record "from" as the mounted-from name in mp->mnt_stat, zero-filling
 * the field first and truncating to its size.
 */
void
vfs_mountedfrom(struct mount *mp, const char *from)
{

	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
	strlcpy(mp->mnt_stat.f_mntfromname, from,
	    sizeof mp->mnt_stat.f_mntfromname);
}

/*
 * ---------------------------------------------------------------------
 * This is the api for building mount args and mounting filesystems from
 * inside the kernel.
 *
 * The API works by accumulation of individual args.  First error is
 * latched.
 *
 * XXX: should be documented in new manpage kernel_mount(9)
 */

/* A memory allocation which must be freed when we are done */
struct mntaarg {
	SLIST_ENTRY(mntaarg)	next;
};

/* The header for the mount arguments */
struct mntarg {
	struct iovec *v;	/* name/value iovec pairs */
	int len;		/* number of iovecs in v */
	int error;		/* first error latched while building */
	SLIST_HEAD(, mntaarg)	list;	/* allocations to free with the args */
};

/*
 * Add a boolean argument.
 *
 * flag is the boolean value.
 * name must start with "no".
 *
 * When flag is true the leading "no" is skipped, otherwise the name is
 * added as given.  The argument carries no value.
 */
struct mntarg *
mount_argb(struct mntarg *ma, int flag, const char *name)
{

	KASSERT(name[0] == 'n' && name[1] == 'o',
	    ("mount_argb(...,%s): name must start with 'no'", name));

	return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
}

/*
 * Add an argument printf style
 *
 * The formatted value is copied into storage owned by the argument list.
 * A NULL ma starts a new list.  If an error is already latched, ma is
 * returned unchanged.
 */
struct mntarg *
mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
{
	va_list ap;
	struct mntaarg *maa;
	struct sbuf *sb;
	int len;

	if (ma == NULL) {
		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
		SLIST_INIT(&ma->list);
	}
	if (ma->error)
		return (ma);

	/* Grow the iovec array by one name/value pair. */
	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
	    M_MOUNT, M_WAITOK);
	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
	ma->v[ma->len].iov_len = strlen(name) + 1;
	ma->len++;

	sb = sbuf_new_auto();
	va_start(ap, fmt);
	sbuf_vprintf(sb, fmt, ap);
	va_end(ap);
	sbuf_finish(sb);
	len = sbuf_len(sb) + 1;
	/* The string is stored directly behind the list header. */
	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
	SLIST_INSERT_HEAD(&ma->list, maa, next);
	bcopy(sbuf_data(sb), maa + 1, len);
	sbuf_delete(sb);

	ma->v[ma->len].iov_base = maa + 1;
	ma->v[ma->len].iov_len = len;
	ma->len++;

	return (ma);
}

/*
 * Add an argument which is a userland string.
 *
 * A NULL val adds nothing.  Otherwise up to len bytes are copied in with
 * copyinstr() into storage owned by the argument list; a copy failure is
 * latched in ma->error, which makes mount_arg() skip the argument.
 */
struct mntarg *
mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
{
	struct mntaarg *maa;
	char *tbuf;

	if (val == NULL)
		return (ma);
	if (ma == NULL) {
		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
		SLIST_INIT(&ma->list);
	}
	if (ma->error)
		return (ma);
	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
	SLIST_INSERT_HEAD(&ma->list, maa, next);
	tbuf = (void *)(maa + 1);
	ma->error = copyinstr(val, tbuf, len, NULL);
	return (mount_arg(ma, name, tbuf, -1));
}

/*
 * Plain argument.
 *
 * If length is -1, treat value as a C string.
*/
/*
 * Appends a name/value iovec pair that points at the caller's name and
 * val; nothing is copied, so both must stay valid until the list is
 * consumed.  A NULL ma starts a new list.  If an error is already
 * latched, ma is returned unchanged.
 */
struct mntarg *
mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
{

	if (ma == NULL) {
		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
		SLIST_INIT(&ma->list);
	}
	if (ma->error)
		return (ma);

	/* Grow the iovec array by one name/value pair. */
	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
	    M_MOUNT, M_WAITOK);
	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
	ma->v[ma->len].iov_len = strlen(name) + 1;
	ma->len++;

	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
	/* A negative len means val is a C string; it must not be NULL then. */
	if (len < 0)
		ma->v[ma->len].iov_len = strlen(val) + 1;
	else
		ma->v[ma->len].iov_len = len;
	ma->len++;
	return (ma);
}

/*
 * Free a mntarg structure
 *
 * Releases every allocation linked on ma->list, the iovec array and the
 * header itself.
 */
static void
free_mntarg(struct mntarg *ma)
{
	struct mntaarg *maa;

	while (!SLIST_EMPTY(&ma->list)) {
		maa = SLIST_FIRST(&ma->list);
		SLIST_REMOVE_HEAD(&ma->list, next);
		free(maa, M_MOUNT);
	}
	free(ma->v, M_MOUNT);
	free(ma, M_MOUNT);
}

/*
 * Mount a filesystem
 *
 * Hands the accumulated arguments to vfs_donmount() unless an error was
 * latched while building them, in which case that error is returned.
 * The argument list is always freed, so ma must not be used afterwards.
 */
int
kernel_mount(struct mntarg *ma, int flags)
{
	struct uio auio;
	int error;

	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
	KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));

	auio.uio_iov = ma->v;
	auio.uio_iovcnt = ma->len;
	auio.uio_segflg = UIO_SYSSPACE;

	error = ma->error;
	if (!error)
		error = vfs_donmount(curthread, flags, &auio);
	free_mntarg(ma);
	return (error);
}

/*
 * A printflike function to mount a filesystem.
 *
 * The variable arguments are (const char *name, const void *value) pairs
 * terminated by a NULL name.  Non-NULL values are treated as C strings;
 * a NULL value adds the name with an empty value.
 */
int
kernel_vmount(int flags, ...)
{
	struct mntarg *ma = NULL;
	va_list ap;
	const char *cp;
	const void *vp;
	int error;

	va_start(ap, flags);
	for (;;) {
		cp = va_arg(ap, const char *);
		if (cp == NULL)
			break;
		vp = va_arg(ap, const void *);
		ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0));
	}
	va_end(ap);

	error = kernel_mount(ma, flags);
	return (error);
}
Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c (revision 186196)
+++ head/sys/kern/vfs_subr.c (revision 186197)
@@ -1,4223 +1,4223 @@
/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.
All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #define WI_MPSAFEQ 0 #define WI_GIANTQ 1 static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure"); static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void vbusy(struct vnode *vp); static void vinactive(struct vnode *, struct thread *); static void v_incr_usecount(struct vnode *); static void v_decr_usecount(struct vnode *); static void v_decr_useonly(struct vnode *); static void v_upgrade_usecount(struct vnode *); static void vfree(struct vnode *); static void vnlru_free(int); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static int vfs_knllocked(void *arg); static void destroy_vpollinfo(struct vpollinfo *vi); /* * Enable Giant pushdown based on whether or not the vm is mpsafe in this * build. Without mpsafevm the buffer cache can not run Giant free. */ int mpsafe_vfs = 1; TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs); SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0, "MPSAFE VFS"); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased on vdestroy() called on VI_DOOMed * vnode. 
*/ static unsigned long numvnodes; SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Free vnode target. Free vnodes may simply be files which have been stat'd * but not read. This is somewhat common, and a small cache of such files * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes; SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. */ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* Set to 1 to print out reclaim of active vnodes */ int prtactive; /* * The workitem queue. 
* * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, writes to mounted block * devices are delayed only about half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending[2]; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, 
CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

/*
 * When shutting down the syncer, run it at four times normal speed.
 */
#define SYNCER_SHUTDOWN_SPEEDUP		4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  (It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.)
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");
/*
 * NOTE(review): wantfreevnodes is exported here with SYSCTL_INT although
 * this chunk also exports it with SYSCTL_LONG as vfs.wantfreevnodes -
 * confirm its declared type against this int-sized export.
 */
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");

/*
 * Macros to control when a vnode is freed and recycled.  All require
 * the vnode interlock.
 */
#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)


/*
 * Initialize the vnode management data structures.
 *
 * Computes desiredvnodes (capped at MAXVNODES_MAX) and wantfreevnodes,
 * initializes the mntid and free-list mutexes, the free list and the
 * vnode UMA zones, and sets up the syncer work queues, mutex and
 * condition variable.  Registered via SYSINIT below.
 */
#ifndef MAXVNODES_MAX
#define MAXVNODES_MAX	100000
#endif
static void
vntblinit(void *dummy __unused)
{

	/*
	 * Desiredvnodes is a function of the physical memory size and
	 * the kernel's heap size.  Specifically, desiredvnodes scales
	 * in proportion to the physical memory size until two fifths
	 * of the kernel's heap size is consumed by vnodes and vm
	 * objects.
	 */
	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %d -> %d\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	/* Adopt the table size hashinit() reported through syncer_mask. */
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);


/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Eventually, mountlist_mtx is not released on failure.
 *
 * Returns 0 with mnt_lockref incremented and a mount reference held, or
 * ENOENT if the mount is being unmounted (MNTK_UNMOUNT).  In the ENOENT
 * case the reference taken here is dropped again and, if MBF_MNTLSTLOCK
 * was passed, mountlist_mtx is held on return; on success it has been
 * released.
 *
 * This text is part of a diff: the "-" and "+" lines below are the old
 * and new form of the same condition.
 */
int
vfs_busy(struct mount *mp, int flags)
{

	MPASS((flags & ~MBF_MASK) == 0);

	MNT_ILOCK(mp);
	MNT_REF(mp);
	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
-		if (flags & MBF_NOWAIT) {
+		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		/* Sleep until woken on mp, then still report ENOENT. */
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0);
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		return (ENOENT);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
*/ void vfs_unbusy(struct mount *mp) { MNT_ILOCK(mp); MNT_REL(mp); mp->mnt_lockref--; if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. */ struct mount * vfs_busyfs(fsid_t *fsid) { struct mount *mp; int error; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { mtx_unlock(&mountlist_mtx); return (NULL); } return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; /* * If the thread is jailed, but this is not a jail-friendly file * system, deny immediately. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred)) return (EPERM); /* * If the file system was mounted outside a jail and a jailed thread * tries to access it, deny immediately. */ if (!jailed(mp->mnt_cred) && jailed(td->td_ucred)) return (EPERM); /* * If the file system was mounted inside different jail that the jail of * the calling thread, deny immediately. 
*/ if (jailed(mp->mnt_cred) && jailed(td->td_ucred) && mp->mnt_cred->cr_prison != td->td_ucred->cr_prison) { return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static u_int16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. 
*/ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, ""); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desireable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. 
*/
static int
vlrureclaim(struct mount *mp)
{
	struct vnode *vp;
	int done;
	int trigger;
	int usevnodes;
	int count;

	/*
	 * Calculate the trigger point, don't allow user
	 * screwups to blow us up.   This prevents us from
	 * recycling vnodes with lots of resident pages.  We
	 * aren't trying to free memory, we are trying to
	 * free vnodes.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = cnt.v_page_count * 2 / usevnodes;
	done = 0;
	vn_start_write(NULL, &mp, V_WAIT);
	MNT_ILOCK(mp);
	/* Look at no more than a tenth (plus one) of this mount's vnodes. */
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count != 0) {
		/* Take the first non-marker vnode and rotate it to the tail. */
		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
		while (vp != NULL && vp->v_type == VMARKER)
			vp = TAILQ_NEXT(vp, v_nmntvnodes);
		if (vp == NULL)
			break;
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		--count;
		if (!VI_TRYLOCK(vp))
			goto next_iter;
		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 */
		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		MNT_IUNLOCK(mp);
		vholdl(vp);
		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
			vdrop(vp);
			goto next_iter_mntunlocked;
		}
		VI_LOCK(vp);
		/*
		 * v_usecount may have been bumped after VOP_LOCK() dropped
		 * the vnode interlock and before it was locked again.
		 *
		 * It is not necessary to recheck VI_DOOMED because it can
		 * only be set by another thread that holds both the vnode
		 * lock and vnode interlock.  If another thread has the
		 * vnode lock before we get to VOP_LOCK() and obtains the
		 * vnode interlock after VOP_LOCK() drops the vnode
		 * interlock, the other thread will be unable to drop the
		 * vnode lock before our VOP_LOCK() call fails.
		 */
		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
		    (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp, LK_INTERLOCK);
			goto next_iter_mntunlocked;
		}
		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
		vgonel(vp);
		VOP_UNLOCK(vp, 0);
		vdropl(vp);
		done++;
		/*
		 * Loop epilogue.  The two entry labels differ only in whether
		 * the mount interlock is still held; every 256th iteration
		 * (count % 256 == 0) yields the CPU before relocking.
		 */
next_iter_mntunlocked:
		if ((count % 256) != 0)
			goto relock_mnt;
		goto yield;
next_iter:
		if ((count % 256) != 0)
			continue;
		MNT_IUNLOCK(mp);
yield:
		uio_yield();
relock_mnt:
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vn_finished_write(mp);
	/* Number of vnodes passed to vgonel() above. */
	return done;
}

/*
 * Attempt to keep the free list at wantfreevnodes length.
 *
 * Tries to recycle up to 'count' vnodes taken from the head of the free
 * list.  Must be called with vnode_free_list_mtx held (asserted); the
 * mutex is dropped around each recycle attempt and held again on return.
 */
static void
vnlru_free(int count)
{
	struct vnode *vp;
	int vfslocked;

	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	for (; count > 0; count--) {
		vp = TAILQ_FIRST(&vnode_free_list);
		/*
		 * The list can be modified while the free_list_mtx
		 * has been dropped and vp could be NULL here.
		 */
		if (!vp)
			break;
		VNASSERT(vp->v_op != NULL, vp,
		    ("vnlru_free: vnode already reclaimed."));
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		/*
		 * Don't recycle if we can't get the interlock.
		 */
		if (!VI_TRYLOCK(vp)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			continue;
		}
		VNASSERT(VCANRECYCLE(vp), vp,
		    ("vp inconsistent on freelist"));
		freevnodes--;
		vp->v_iflag &= ~VI_FREE;
		vholdl(vp);
		mtx_unlock(&vnode_free_list_mtx);
		VI_UNLOCK(vp);
		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		vtryrecycle(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		/*
		 * If the recycle succeeded this vdrop will actually free
		 * the vnode.  If not it will simply place it back on
		 * the free list.
		 */
		vdrop(vp);
		mtx_lock(&vnode_free_list_mtx);
	}
}
/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrurecycle() from the bowels of filesystem code has some
 * interesting deadlock problems.
*/
static struct proc *vnlruproc;
/* Set to 1 by getnewvnode() when it wakes vnlru; cleared by vnlru_proc(). */
static int vnlruproc_sig;

/*
 * Main loop of the "vnlru" kernel process: trims the free list, and when
 * numvnodes exceeds 90% of desiredvnodes walks the mount list calling
 * vlrureclaim() on every mount point it can busy.
 */
static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	int done;
	struct proc *p = vnlruproc;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
	    SHUTDOWN_PRI_FIRST);

	mtx_lock(&Giant);

	for (;;) {
		kproc_suspend_check(p);
		mtx_lock(&vnode_free_list_mtx);
		if (freevnodes > wantfreevnodes)
			vnlru_free(freevnodes - wantfreevnodes);
		if (numvnodes <= desiredvnodes * 9 / 10) {
			/*
			 * Nothing to reclaim: wake anybody sleeping on
			 * vnlruproc_sig and sleep ourselves; PDROP means
			 * the free list mutex is not held after msleep().
			 */
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_free_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		mtx_unlock(&vnode_free_list_mtx);
		done = 0;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			int vfsunlocked;
			/* On success vfs_busy() has dropped mountlist_mtx. */
			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			/* Giant is only kept for mounts that need it. */
			if (!VFS_NEEDSGIANT(mp)) {
				mtx_unlock(&Giant);
				vfsunlocked = 1;
			} else
				vfsunlocked = 0;
			done += vlrureclaim(mp);
			if (vfsunlocked)
				mtx_lock(&Giant);
			/* Fetch the successor before dropping our busy ref. */
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp);
		}
		mtx_unlock(&mountlist_mtx);
		if (done == 0) {
			EVENTHANDLER_INVOKE(vfs_lowvnodes, desiredvnodes / 10);
#if 0
			/* These messages are temporary debugging aids */
			if (vnlru_nowhere < 5)
				printf("vnlru process getting nowhere..\n");
			else if (vnlru_nowhere == 5)
				printf("vnlru process messages stopped.\n");
#endif
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		} else
			uio_yield();
	}
}

/* Descriptor used by kproc_start() to create the "vnlru" process. */
static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
    &vnlru_kp);

/*
 * Routines having to do with the management of the vnode table.
*/

/*
 * Tear down and free a vnode: asserts that it is fully cleaned, destroys
 * its locks and returns the memory to vnode_zone.  The vnode interlock is
 * released here (VI_UNLOCK) before the locks are destroyed.
 */
void
vdestroy(struct vnode *vp)
{
	struct bufobj *bo;

	CTR1(KTR_VFS, "vdestroy vp %p", vp);
	mtx_lock(&vnode_free_list_mtx);
	numvnodes--;
	mtx_unlock(&vnode_free_list_mtx);
	bo = &vp->v_bufobj;
	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
	    ("cleaned vnode still on the free list."));
	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
	VI_UNLOCK(vp);
#ifdef MAC
	mac_vnode_destroy(vp);
#endif
	if (vp->v_pollinfo != NULL)
		destroy_vpollinfo(vp->v_pollinfo);
#ifdef INVARIANTS
	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
	vp->v_op = NULL;
#endif
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	mtx_destroy(BO_MTX(bo));
	uma_zfree(vnode_zone, vp);
}

/*
 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 * before we actually vgone().  This function must be called with the vnode
 * held to prevent the vnode from being returned to the free list midway
 * through vgone().
 *
 * Returns 0 on success, EWOULDBLOCK if the vnode lock could not be taken
 * without sleeping, or EBUSY if the filesystem is being suspended or the
 * vnode gained a use reference.
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct mount *vnmp;

	CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp);
	VNASSERT(vp->v_holdcnt, vp,
	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list, if so we
	 * can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
		return (EWOULDBLOCK);
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp, 0);
		return (EBUSY);
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with DOOMED via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (vp->v_usecount) {
		VOP_UNLOCK(vp, LK_INTERLOCK);
		vn_finished_write(vnmp);
		return (EBUSY);
	}
	if ((vp->v_iflag & VI_DOOMED) == 0)
		vgonel(vp);
	VOP_UNLOCK(vp, LK_INTERLOCK);
	vn_finished_write(vnmp);
	CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp);
	return (0);
}

/*
 * Allocate and initialize a new vnode.
 *
 * tag  - stored in v_tag and used as the name of the vnode lock.
 * mp   - mount point the vnode is created for; may be NULL.
 * vops - vnode operations vector stored in v_op.
 * vpp  - receives the new vnode, which carries one use reference
 *        (v_incr_usecount()).
 *
 * Always returns 0.  May sleep waiting for vnlru when numvnodes exceeds
 * desiredvnodes.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp = NULL;
	struct bufobj *bo;

	mtx_lock(&vnode_free_list_mtx);
	/*
	 * Lend our context to reclaim vnodes if they've exceeded the max.
	 */
	if (freevnodes > wantfreevnodes)
		vnlru_free(1);
	/*
	 * Wait for available vnodes.
	 */
	if (numvnodes > desiredvnodes) {
		if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
			/*
			 * File system is being suspended, we cannot risk a
			 * deadlock here, so allocate new vnode anyway.
			 */
			if (freevnodes > wantfreevnodes)
				vnlru_free(freevnodes - wantfreevnodes);
			goto alloc;
		}
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
			wakeup(vnlruproc);
		}
		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
		    "vlruwk", hz);
#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
		if (numvnodes > desiredvnodes) {
			mtx_unlock(&vnode_free_list_mtx);
			return (ENFILE);
		}
#endif
	}
alloc:
	numvnodes++;
	mtx_unlock(&vnode_free_list_mtx);
	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems
	 * opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
	/*
	 * Initialize bufobj.
	 */
	bo = &vp->v_bufobj;
	bo->__bo_vnode = vp;
	mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
	bo->bo_ops = &buf_ops_bio;
	bo->bo_private = vp;
	TAILQ_INIT(&bo->bo_clean.bv_hd);
	TAILQ_INIT(&bo->bo_dirty.bv_hd);
	/*
	 * Initialize namecache.
	 */
	LIST_INIT(&vp->v_cache_src);
	TAILQ_INIT(&vp->v_cache_dst);
	/*
	 * Finalize various vnode identity bits.
	 */
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	v_incr_usecount(vp);
	vp->v_data = 0;
#ifdef MAC
	mac_vnode_init(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_vnode_associate_singlelabel(mp, vp);
	else if (mp == NULL && vops != &dead_vnodeops)
		printf("NULL mp in getnewvnode()\n");
#endif
	if (mp != NULL) {
		bo->bo_bsize = mp->mnt_stat.f_iosize;
		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
			vp->v_vflag |= VV_NOKNOTE;
	}

	CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp);
	*vpp = vp;
	return (0);
}

/*
 * Delete from old mount point vnode list, if on one.
 * Also drops the mount reference (MNT_REL) that goes with list membership.
 */
static void
delmntque(struct vnode *vp)
{
	struct mount *mp;

	mp = vp->v_mount;
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	vp->v_mount = NULL;
	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
		("bad mount point vnode list size"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize--;
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
}

/*
 * Default destructor passed by insmntque() to insmntque1(): detaches the
 * vnode from its filesystem data, switches it to dead_vnodeops and
 * disposes of it with vgone()/vput().  dtr_arg is unused.
 */
static void
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
{

	vp->v_data = NULL;
	vp->v_op = &dead_vnodeops;
	/* XXX non mp-safe fs may still call insmntque with vnode
	   unlocked */
	if (!VOP_ISLOCKED(vp))
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	vgone(vp);
	vput(vp);
}

/*
 * Insert into list of vnodes for the new mount point, if available.
*/
/*
 * vp      - vnode to insert; must not already be on a mount vnode list.
 * mp      - destination mount point; must not be NULL.
 * dtr     - optional destructor, called as dtr(vp, dtr_arg) when the
 *           insertion is refused.
 * dtr_arg - opaque argument for dtr.
 *
 * Returns 0 on success (a mount reference is taken for the vnode) or
 * EBUSY when the mount point refuses new vnodes.
 */
int
insmntque1(struct vnode *vp, struct mount *mp,
	void (*dtr)(struct vnode *, void *), void *dtr_arg)
{
	int locked;

	KASSERT(vp->v_mount == NULL,
		("insmntque: vnode already on per mount vnode list"));
	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
#ifdef DEBUG_VFS_LOCKS
	if (!VFS_NEEDSGIANT(mp))
		ASSERT_VOP_ELOCKED(vp,
		    "insmntque: mp-safe fs and non-locked vp");
#endif
	MNT_ILOCK(mp);
	/*
	 * MNTK_NOINSMNTQ together with a forced unmount or an already empty
	 * vnode list refuses the insertion, unless the vnode is exclusively
	 * locked and marked VV_FORCEINSMQ.
	 */
	if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
	     mp->mnt_nvnodelistsize == 0)) {
		locked = VOP_ISLOCKED(vp);
		if (!locked || (locked == LK_EXCLUSIVE &&
		     (vp->v_vflag & VV_FORCEINSMQ) == 0)) {
			MNT_IUNLOCK(mp);
			if (dtr != NULL)
				dtr(vp, dtr_arg);
			return (EBUSY);
		}
	}
	vp->v_mount = mp;
	MNT_REF(mp);
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
		("neg mount point vnode list size"));
	mp->mnt_nvnodelistsize++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Same as insmntque1() with insmntque_stddtr() as the destructor.
 */
int
insmntque(struct vnode *vp, struct mount *mp)
{

	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
}

/*
 * Flush out and invalidate all buffers associated with a bufobj
 * Called with the underlying object locked.
 *
 * flags    - V_SAVE writes dirty data out first (BO_SYNC); V_ALT and
 *            V_NORMAL are passed through to flushbuflist().
 * slpflag, slptimeo - sleep priority flags and timeout handed to the
 *            waiting primitives.
 *
 * Returns 0 on success or the first error other than EAGAIN reported by
 * bufobj_wwait(), BO_SYNC() or flushbuflist().
 */
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
{
	int error;

	BO_LOCK(bo);
	if (flags & V_SAVE) {
		error = bufobj_wwait(bo, slpflag, slptimeo);
		if (error) {
			BO_UNLOCK(bo);
			return (error);
		}
		if (bo->bo_dirty.bv_cnt > 0) {
			BO_UNLOCK(bo);
			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
				return (error);
			/*
			 * XXX We could save a lock/unlock if this was only
			 * enabled under INVARIANTS
			 */
			BO_LOCK(bo);
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("vinvalbuf: dirty bufs");
		}
	}
	/*
	 * If you alter this loop please notice that interlock is dropped and
	 * reacquired in flushbuflist.  Special care is needed to ensure that
	 * no race conditions occur from this.
	 */
	do {
		error = flushbuflist(&bo->bo_clean,
		    flags, bo, slpflag, slptimeo);
		if (error == 0)
			error = flushbuflist(&bo->bo_dirty,
			    flags, bo, slpflag, slptimeo);
		if (error != 0 && error != EAGAIN) {
			BO_UNLOCK(bo);
			return (error);
		}
	} while (error != 0);

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		bufobj_wwait(bo, 0, 0);
		BO_UNLOCK(bo);
		if (bo->bo_object != NULL) {
			VM_OBJECT_LOCK(bo->bo_object);
			vm_object_pip_wait(bo->bo_object, "bovlbx");
			VM_OBJECT_UNLOCK(bo->bo_object);
		}
		BO_LOCK(bo);
	} while (bo->bo_numoutput > 0);
	BO_UNLOCK(bo);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if (bo->bo_object != NULL) {
		VM_OBJECT_LOCK(bo->bo_object);
		vm_object_page_remove(bo->bo_object, 0, 0,
			(flags & V_SAVE) ? TRUE : FALSE);
		VM_OBJECT_UNLOCK(bo->bo_object);
	}

#ifdef INVARIANTS
	BO_LOCK(bo);
	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
		panic("vinvalbuf: flush failed");
	BO_UNLOCK(bo);
#endif
	return (0);
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 *
 * Thin wrapper: asserts the vnode lock and forwards to
 * bufobj_invalbuf() on the vnode's embedded bufobj.
 */
int
vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
{

	CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
}

/*
 * Flush out buffers on the specified list.
*
* Called with the bufobj locked (asserted) and returns with it locked.
* Returns 0 when no buffer on the list needed work, EAGAIN when the
* caller should call again, or another error from BUF_TIMELOCK().
*/
static int
flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
    int slptimeo)
{
	struct buf *bp, *nbp;
	int retval, error;
	daddr_t lblkno;
	b_xflags_t xflags;

	ASSERT_BO_LOCKED(bo);

	retval = 0;
	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
		/* V_NORMAL skips alternate-data buffers, V_ALT all others. */
		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
			continue;
		}
		/*
		 * Remember the identity of the next buffer so that we can
		 * tell, after the bufobj lock was dropped, whether it is
		 * still the same buffer.
		 */
		lblkno = 0;
		xflags = 0;
		if (nbp != NULL) {
			lblkno = nbp->b_lblkno;
			xflags = nbp->b_xflags &
				(BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
		}
		retval = EAGAIN;
		error = BUF_TIMELOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
		    "flushbuf", slpflag, slptimeo);
		if (error) {
			BO_LOCK(bo);
			return (error != ENOLCK ? error : EAGAIN);
		}
		KASSERT(bp->b_bufobj == bo,
		    ("bp %p wrong b_bufobj %p should be %p",
		    bp, bp->b_bufobj, bo));
		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
			BUF_UNLOCK(bp);
			BO_LOCK(bo);
			return (EAGAIN);
		}
		/*
		 * XXX Since there are no node locks for NFS, I
		 * believe there is a slight chance that a delayed
		 * write will occur while sleeping just above, so
		 * check for it.
		 */
		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
		    (flags & V_SAVE)) {
			bremfree(bp);
			bp->b_flags |= B_ASYNC;
			bwrite(bp);
			BO_LOCK(bo);
			return (EAGAIN);	/* XXX: why not loop ? */
		}
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
		BO_LOCK(bo);
		if (nbp != NULL &&
		    (nbp->b_bufobj != bo ||
		     nbp->b_lblkno != lblkno ||
		     (nbp->b_xflags &
		      (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
			break;			/* nbp invalid */
	}
	return (retval);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * The vnode must be locked (asserted).  Always returns 0.
 */
int
vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
    off_t length, int blksize)
{
	struct buf *bp, *nbp;
	int anyfreed;
	int trunclbn;
	struct bufobj *bo;

	CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length);
	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
restart:
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	anyfreed = 1;
	/* Repeat until a full pass over both lists frees nothing. */
	for (;anyfreed;) {
		anyfreed = 0;
		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < trunclbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_MTX(bo)) == ENOLCK)
				goto restart;

			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = 1;

			/* The successor changed under us: start over. */
			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI))) {
				goto restart;
			}
			BO_LOCK(bo);
		}

		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < trunclbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_MTX(bo)) == ENOLCK)
				goto restart;
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = 1;
			/* The successor changed under us: start over. */
			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI) == 0)) {
				goto restart;
			}
			BO_LOCK(bo);
		}
	}

	if (length > 0) {
restartsync:
		/* Start writes for dirty buffers with b_lblkno <= 0. */
		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno > 0)
				continue;
			/*
			 * Since we hold the vnode lock this should only
			 * fail if we're racing with the buf daemon.
			 */
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_MTX(bo)) == ENOLCK) {
				goto restart;
			}
			VNASSERT((bp->b_flags & B_DELWRI), vp,
			    ("buf(%p) on dirty queue without DELWRI", bp));

			bremfree(bp);
			bawrite(bp);
			BO_LOCK(bo);
			goto restartsync;
		}
	}

	bufobj_wwait(bo, 0, 0);
	BO_UNLOCK(bo);
	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * buf_splay() - splay tree core for the clean/dirty list of buffers in
 * 		 a vnode.
 *
 *	NOTE: We have to deal with the special case of a background bitmap
 *	buffer, a situation where two buffers will have the same logical
 *	block offset.
We want (1) only the foreground buffer to be accessed
 *	in a lookup and (2) must differentiate between the foreground and
 *	background buffer in the splay tree algorithm because the splay
 *	tree cannot normally handle multiple entities with the same 'index'.
 *	We accomplish this by adding differentiating flags to the splay tree's
 *	numerical domain.
 *
 *	Keys are ordered by lblkno first and by the BX_BKGRDMARKER bit of
 *	xflags second.  Returns the new root of the splayed tree, or NULL
 *	when the tree is empty.
 */
static
struct buf *
buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
{
	struct buf dummy;
	struct buf *lefttreemax, *righttreemin, *y;

	if (root == NULL)
		return (NULL);
	/*
	 * dummy collects the two side trees while descending: its b_right
	 * ends up heading the left tree and its b_left the right tree.
	 */
	lefttreemax = righttreemin = &dummy;
	for (;;) {
		if (lblkno < root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_left) == NULL)
				break;
			if (lblkno < y->b_lblkno) {
				/* Rotate right. */
				root->b_left = y->b_right;
				y->b_right = root;
				root = y;
				if ((y = root->b_left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->b_left = root;
			righttreemin = root;
		} else if (lblkno > root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_right) == NULL)
				break;
			if (lblkno > y->b_lblkno) {
				/* Rotate left. */
				root->b_right = y->b_left;
				y->b_left = root;
				root = y;
				if ((y = root->b_right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->b_right = root;
			lefttreemax = root;
		} else {
			break;
		}
		root = y;
	}
	/* Assemble the new root. */
	lefttreemax->b_right = root->b_left;
	righttreemin->b_left = root->b_right;
	root->b_left = dummy.b_right;
	root->b_right = dummy.b_left;
	return (root);
}

/*
 * Remove bp from the clean or dirty list (and splay tree) of its bufobj,
 * selected by the BX_VNDIRTY / BX_VNCLEAN bit in b_xflags, and clear both
 * bits.  The bufobj must be locked (asserted).
 */
static void
buf_vlist_remove(struct buf *bp)
{
	struct buf *root;
	struct bufv *bv;

	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	ASSERT_BO_LOCKED(bp->b_bufobj);
	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
	    (BX_VNDIRTY|BX_VNCLEAN),
	    ("buf_vlist_remove: Buf %p is on two lists", bp));
	if (bp->b_xflags & BX_VNDIRTY)
		bv = &bp->b_bufobj->bo_dirty;
	else
		bv = &bp->b_bufobj->bo_clean;
	/* Bring bp to the root of the tree if it is not there already. */
	if (bp != bv->bv_root) {
		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
		KASSERT(root == bp, ("splay lookup failed in remove"));
	}
	/* Join the two subtrees of bp into the new tree. */
	if (bp->b_left == NULL) {
		root = bp->b_right;
	} else {
		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
		root->b_right = bp->b_right;
	}
	bv->bv_root = root;
	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
	bv->bv_cnt--;
	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}

/*
 * Add the buffer to the sorted clean or dirty block list using a
 * splay tree algorithm.
 *
 * NOTE: xflags is passed as a constant, optimizing this inline function!
*/
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
{
	struct bufv *list;
	struct buf *top;
	int goes_before;

	ASSERT_BO_LOCKED(bo);
	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
	bp->b_xflags |= xflags;
	/* BX_VNDIRTY selects the dirty list, anything else the clean one. */
	list = (xflags & BX_VNDIRTY) ? &bo->bo_dirty : &bo->bo_clean;

	/*
	 * Splay the tree around the new key; the resulting root is the
	 * immediate neighbour of bp in sort order (or NULL if the tree
	 * was empty).
	 */
	top = buf_splay(bp->b_lblkno, bp->b_xflags, list->bv_root);
	if (top == NULL) {
		bp->b_left = NULL;
		bp->b_right = NULL;
		TAILQ_INSERT_TAIL(&list->bv_hd, bp, b_bobufs);
	} else {
		goes_before = bp->b_lblkno < top->b_lblkno ||
		    (bp->b_lblkno == top->b_lblkno &&
		    (bp->b_xflags & BX_BKGRDMARKER) <
		    (top->b_xflags & BX_BKGRDMARKER));
		if (goes_before) {
			/* bp sorts before the old root. */
			bp->b_left = top->b_left;
			bp->b_right = top;
			top->b_left = NULL;
			TAILQ_INSERT_BEFORE(top, bp, b_bobufs);
		} else {
			/* bp sorts after the old root. */
			bp->b_right = top->b_right;
			bp->b_left = top;
			top->b_right = NULL;
			TAILQ_INSERT_AFTER(&list->bv_hd, top, bp, b_bobufs);
		}
	}
	/* The new buffer always becomes the root of the tree. */
	list->bv_cnt++;
	list->bv_root = bp;
}

/*
 * Lookup a buffer using the splay tree.  Note that we specifically avoid
 * shadow buffers used in background bitmap writes.
 *
 * This code isn't quite efficient as it could be because we are maintaining
 * two sorted lists and do not know which list the block resides in.
 *
 * During a "make buildworld" the desired buffer is found at one of
 * the roots more than 60% of the time.  Thus, checking both roots
 * before performing either splay eliminates unnecessary splays on the
 * first tree splayed.
*/
struct buf *
gbincore(struct bufobj *bo, daddr_t lblkno)
{
	struct buf *bp;

	ASSERT_BO_LOCKED(bo);
	/* Fast path: the buffer is already at one of the two roots. */
	if ((bp = bo->bo_clean.bv_root) != NULL &&
	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
		return (bp);
	if ((bp = bo->bo_dirty.bv_root) != NULL &&
	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
		return (bp);
	/* Slow path: splay each tree in turn and check the new root. */
	if ((bp = bo->bo_clean.bv_root) != NULL) {
		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
			return (bp);
	}
	if ((bp = bo->bo_dirty.bv_root) != NULL) {
		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
			return (bp);
	}
	return (NULL);
}

/*
 * Associate a buffer with a vnode.
 *
 * The vnode's bufobj must be locked (asserted).  The vnode is held
 * (vhold()) and the buffer is put on the clean list.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{
	struct bufobj *bo;

	bo = &vp->v_bufobj;
	ASSERT_BO_LOCKED(bo);
	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));

	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
	    ("bgetvp: bp already attached! %p", bp));

	vhold(vp);
	if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT)
		bp->b_flags |= B_NEEDSGIANT;
	bp->b_vp = vp;
	bp->b_bufobj = bo;
	/*
	 * Insert onto list for new vnode.
	 */
	buf_vlist_add(bp, bo, BX_VNCLEAN);
}

/*
 * Disassociate a buffer from a vnode.
 *
 * Takes and releases the bufobj lock itself, removes the bufobj from the
 * syncer worklist when no dirty buffers remain, and drops the hold on
 * the vnode.
 */
void
brelvp(struct buf *bp)
{
	struct bufobj *bo;
	struct vnode *vp;

	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;		/* XXX */
	bo = bp->b_bufobj;
	BO_LOCK(bo);
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
		buf_vlist_remove(bp);
	else
		panic("brelvp: Buffer %p not on queue.", bp);
	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
		bo->bo_flag &= ~BO_ONWORKLST;
		mtx_lock(&sync_mtx);
		LIST_REMOVE(bo, bo_synclist);
		syncer_worklist_len--;
		mtx_unlock(&sync_mtx);
	}
	bp->b_flags &= ~B_NEEDSGIANT;
	bp->b_vp = NULL;
	bp->b_bufobj = NULL;
	BO_UNLOCK(bo);
	vdrop(vp);
}

/*
 * Add an item to the syncer work queue.
 *
 * The bufobj must be locked (asserted).  'delay' is the number of slots
 * ahead of the current one; it is clamped to syncer_maxdelay - 2.  An
 * item that is already queued is moved to the new slot.
 */
static void
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
{
	int queue, slot;

	ASSERT_BO_LOCKED(bo);

	mtx_lock(&sync_mtx);
	if (bo->bo_flag & BO_ONWORKLST)
		LIST_REMOVE(bo, bo_synclist);
	else {
		bo->bo_flag |= BO_ONWORKLST;
		syncer_worklist_len++;
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	/* Giant-requiring filesystems are kept on a separate queue. */
	queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ :
	    WI_MPSAFEQ;
	LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo,
	    bo_synclist);
	mtx_unlock(&sync_mtx);
}

/*
 * Sysctl handler for vfs.worklist_len: reports syncer_worklist_len
 * minus sync_vnode_count.
 */
static int
sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
{
	int error, len;

	mtx_lock(&sync_mtx);
	len = syncer_worklist_len - sync_vnode_count;
	mtx_unlock(&sync_mtx);
	error = SYSCTL_OUT(req, &len, sizeof(len));
	return (error);
}

SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");

static struct proc *updateproc;
static void sched_sync(void);
/* Descriptor used by kproc_start() to create the "syncer" process. */
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);

/*
 * Sync the vnode behind the first bufobj on worklist slot 'slp'.
 *
 * Called with sync_mtx held; the mutex is dropped while syncing and held
 * again on return.  *bo is set to the bufobj that was examined.  A
 * non-zero return tells the caller to move *bo to the next slot itself;
 * 0 means there is nothing for the caller to do.
 */
static int
sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
{
	struct vnode *vp;
	struct mount *mp;

	*bo = LIST_FIRST(slp);
	if (*bo == NULL)
		return (0);
	vp = (*bo)->__bo_vnode;	/* XXX */
	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
		return (1);
	/*
	 * We use vhold in case the vnode does not
	 * successfully sync.  vhold prevents the vnode from
	 * going away when we unlock the sync_mtx so that
	 * we can acquire the vnode interlock.
	 */
	vholdl(vp);
	mtx_unlock(&sync_mtx);
	VI_UNLOCK(vp);
	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
		vdrop(vp);
		mtx_lock(&sync_mtx);
		/* Only ask for a requeue if *bo is still first on the slot. */
		return (*bo == LIST_FIRST(slp));
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	(void) VOP_FSYNC(vp, MNT_LAZY, td);
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	BO_LOCK(*bo);
	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
		/*
		 * Put us back on the worklist.  The worklist
		 * routine will remove us from our current
		 * position and then add us back in at a later
		 * position.
		 */
		vn_syncer_add_to_worklist(*bo, syncdelay);
	}
	BO_UNLOCK(*bo);
	vdrop(vp);
	mtx_lock(&sync_mtx);
	return (0);
}

/*
 * System filesystem synchronizer daemon.
 */
static void
sched_sync(void)
{
	struct synclist *gnext, *next;
	struct synclist *gslp, *slp;
	struct bufobj *bo;
	long starttime;
	struct thread *td = curthread;
	int last_work_seen;
	int net_worklist_len;
	int syncer_final_iter;
	int first_printf;
	int error;

	last_work_seen = 0;
	syncer_final_iter = 0;
	first_printf = 1;
	syncer_state = SYNCER_RUNNING;
	starttime = time_uptime;
	td->td_pflags |= TDP_NORUNNINGBUF;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	mtx_lock(&sync_mtx);
	for (;;) {
		if (syncer_state == SYNCER_FINAL_DELAY &&
		    syncer_final_iter == 0) {
			mtx_unlock(&sync_mtx);
			kproc_suspend_check(td->td_proc);
			mtx_lock(&sync_mtx);
		}
		net_worklist_len = syncer_worklist_len - sync_vnode_count;
		if (syncer_state != SYNCER_RUNNING &&
		    starttime != time_uptime) {
			if (first_printf) {
				printf("\nSyncing disks, vnodes remaining...");
				first_printf = 0;
			}
			printf("%d ", net_worklist_len);
		}
		starttime = time_uptime;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 *
		 * Skip over empty worklist slots when shutting down.
		 */
		do {
			slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
			gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
			syncer_delayno += 1;
			if (syncer_delayno == syncer_maxdelay)
				syncer_delayno = 0;
			next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
			gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
			/*
			 * If the worklist has wrapped since it
			 * was emptied of all but syncer vnodes,
			 * switch to the FINAL_DELAY state and run
			 * for one more second.
			 */
			if (syncer_state == SYNCER_SHUTTING_DOWN &&
			    net_worklist_len == 0 &&
			    last_work_seen == syncer_delayno) {
				syncer_state = SYNCER_FINAL_DELAY;
				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
			}
		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
		    LIST_EMPTY(gslp) && syncer_worklist_len > 0);

		/*
		 * Keep track of the last time there was anything
		 * on the worklist other than syncer vnodes.
		 * Return to the SHUTTING_DOWN state if any
		 * new work appears.
		 */
		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
			last_work_seen = syncer_delayno;
		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
			syncer_state = SYNCER_SHUTTING_DOWN;
		/* Drain the current slot of the MP-safe queue. */
		while (!LIST_EMPTY(slp)) {
			error = sync_vnode(slp, &bo, td);
			if (error == 1) {
				LIST_REMOVE(bo, bo_synclist);
				LIST_INSERT_HEAD(next, bo, bo_synclist);
				continue;
			}
		}
		/*
		 * Drain the current slot of the Giant queue.  sync_mtx is
		 * dropped first so that Giant is taken before sync_mtx.
		 */
		if (!LIST_EMPTY(gslp)) {
			mtx_unlock(&sync_mtx);
			mtx_lock(&Giant);
			mtx_lock(&sync_mtx);
			while (!LIST_EMPTY(gslp)) {
				error = sync_vnode(gslp, &bo, td);
				if (error == 1) {
					LIST_REMOVE(bo, bo_synclist);
					LIST_INSERT_HEAD(gnext, bo,
					    bo_synclist);
					continue;
				}
			}
			mtx_unlock(&Giant);
		}
		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
			syncer_final_iter--;
		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * Just sleep for a short period of time between
		 * iterations when shutting down to allow some I/O
		 * to happen.
		 *
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (syncer_state != SYNCER_RUNNING)
			cv_timedwait(&sync_wakeup, &sync_mtx,
			    hz / SYNCER_SHUTDOWN_SPEEDUP);
		else if (time_uptime == starttime)
			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 *
 * Returns 1 if rushjob was incremented, 0 if it was already at the
 * syncdelay / 2 limit.  The syncer is woken up either way.
 */
int
speedup_syncer(void)
{
	int ret = 0;

	mtx_lock(&sync_mtx);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		ret = 1;
	}
	mtx_unlock(&sync_mtx);
	cv_broadcast(&sync_wakeup);
	return (ret);
}

/*
 * Tell the syncer to speed up its work and run through its work
 * list several times, then tell it to shut down.
 *
 * Does nothing when RB_NOSYNC is set in howto.
 */
static void
syncer_shutdown(void *arg, int howto)
{

	if (howto & RB_NOSYNC)
		return;
	mtx_lock(&sync_mtx);
	syncer_state = SYNCER_SHUTTING_DOWN;
	rushjob = 0;
	mtx_unlock(&sync_mtx);
	cv_broadcast(&sync_wakeup);
	kproc_shutdown(arg, howto);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
*/ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from the old vnode list. A buffer that is on neither the * dirty nor the clean list is treated as a fatal error. */ BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { /* Not yet on the syncer worklist: choose the delay by vnode type (dirdelay for VDIR, metadelay for VCHR, filedelay otherwise). */ switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); /* No dirty buffers remain: take the bufobj off the syncer worklist. */ if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } /* Under INVARIANTS, check that the first and last buffers on both the clean and the dirty list point back at this bufobj. */ #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } /* * Increment the use and hold counts on the vnode, taking care to reference * the driver's usecount if this is a chardev.
The vholdl() will remove * the vnode from the free list if it is presently free. Requires the * vnode interlock and returns with it held. */ static void v_incr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } vholdl(vp); } /* * Turn a holdcnt into a use+holdcnt such that only one call to * v_decr_usecount is needed. */ static void v_upgrade_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_upgrade_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Decrement the vnode use and hold count along with the driver's usecount * if this is a chardev. The vdropl() below releases the vnode interlock * as it may free the vnode. */ static void v_decr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_usecount: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } vdropl(vp); } /* * Decrement only the use count and driver use count. This is intended to * be paired with a follow on vdropl() to release the remaining hold count. * In this way we may vgone() a vnode with a 0 usecount without risk of * having it end up on a free list because the hold count is kept above 0. 
*/ static void v_decr_useonly(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_useonly: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VI_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * The vnode interlock is taken here unless the caller passes * LK_INTERLOCK in flags. Returns 0 on success, or the vn_lock() error * after dropping the temporary hold. */ int vget(struct vnode *vp, int flags, struct thread *td) { int error; error = 0; VFS_ASSERT_GIANT(vp->v_mount); VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vget: invalid lock operation")); if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); /* Take a hold across vn_lock(); it is upgraded to a use reference below or dropped on failure. */ vholdl(vp); if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) { vdrop(vp); return (error); } if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); VI_LOCK(vp); /* Upgrade our holdcnt to a usecount. */ v_upgrade_usecount(vp); /* * We don't guarantee that any particular close will * trigger inactive processing so just make a best effort * here at preventing a reference to a removed file. If * we don't succeed no harm is done. */ if (vp->v_iflag & VI_OWEINACT) { if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && (flags & LK_NOWAIT) == 0) vinactive(vp, td); vp->v_iflag &= ~VI_OWEINACT; } VI_UNLOCK(vp); return (0); } /* * Increase the reference count of a vnode. Takes and releases the * vnode interlock around v_incr_usecount(). */ void vref(struct vnode *vp) { VI_LOCK(vp); v_incr_usecount(vp); VI_UNLOCK(vp); } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism other * than the VI lock is used to stop other processes from gaining references * to the vnode. This may be the case if the caller holds the only reference.
* This is also useful when stale data is acceptable as race conditions may * be accounted for by some other means. */ int vrefcnt(struct vnode *vp) { int usecnt; VI_LOCK(vp); usecnt = vp->v_usecount; VI_UNLOCK(vp); return (usecnt); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(struct vnode *vp) { struct thread *td = curthread; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); VFS_ASSERT_GIANT(vp->v_mount); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vrele: missed vn_close")); /* Not the last use, or inactive processing is already running: just drop the use and hold counts. */ if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); #endif VI_UNLOCK(vp); panic("vrele: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); /* * We must call VOP_INACTIVE with the node locked. Set * VI_OWEINACT first; vinactive() marks the vnode * VI_DOINGINACT to avoid recursion. VI_OWEINACT is cleared * again if the vnode gained a new user in the meantime. */ vp->v_iflag |= VI_OWEINACT; if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) { VI_LOCK(vp); if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0); } else { VI_LOCK(vp); if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; } vdropl(vp); } /* * Release an already locked vnode. This gives the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-acquiring the lock (as vrele() acquires the lock internally.) */ void vput(struct vnode *vp) { struct thread *td = curthread; /* XXX */ int error; KASSERT(vp != NULL, ("vput: null vp")); ASSERT_VOP_LOCKED(vp, "vput"); VFS_ASSERT_GIANT(vp->v_mount); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below.
*/ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vput: missed vn_close")); error = 0; /* Not the last use, or inactive processing is already running: unlock and drop the use and hold counts. */ if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { VOP_UNLOCK(vp, 0); v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); vp->v_iflag |= VI_OWEINACT; /* * vinactive() asserts an exclusive lock, so try a non-blocking * upgrade. If that fails the inactive call is skipped and * VI_OWEINACT stays set unless the vnode gained a new user. */ if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE|LK_INTERLOCK|LK_NOWAIT); VI_LOCK(vp); if (error) { if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; goto done; } } if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0); done: vdropl(vp); } /* * Somebody doesn't want the vnode recycled. */ void vhold(struct vnode *vp) { VI_LOCK(vp); vholdl(vp); VI_UNLOCK(vp); } /* * vhold() for callers that already hold the vnode interlock. Calls * vbusy() when VSHOULDBUSY() is true for the vnode. */ void vholdl(struct vnode *vp) { vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). The interlock is taken here and handed to * vdropl(). */ void vdrop(struct vnode *vp) { VI_LOCK(vp); vdropl(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we will free it if it has been vgone'd otherwise it is * placed on the free list. Panics if the hold count is not positive. */ void vdropl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, "vdropl"); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); vp->v_holdcnt--; if (vp->v_holdcnt == 0) { if (vp->v_iflag & VI_DOOMED) { vdestroy(vp); return; } else vfree(vp); } VI_UNLOCK(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade.
*/ static void vinactive(struct vnode *vp, struct thread *td) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; /* The interlock is dropped across VOP_INACTIVE() and retaken afterwards. */ VI_UNLOCK(vp); VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush() * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. * * Returns 0 on success, EBUSY if busy vnodes remain, or the error * from VFS_ROOT(). */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush( struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR1(KTR_VFS, "vflush: mp %p", mp); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away.
*/ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0) return (error); vput(rootvp); } MNT_ILOCK(mp); loop: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); vholdl(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); /* Could not lock the vnode: drop the hold, abort the iteration and rescan the mount's vnode list from the start. */ if (error) { vdrop(vp); MNT_ILOCK(mp); MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } /* * Skip over vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0); vdrop(vp); MNT_ILOCK(mp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount == 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0); vdropl(vp); MNT_ILOCK(mp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { VNASSERT(vp->v_usecount == 0 || (vp->v_type != VCHR && vp->v_type != VBLK), vp, ("device VNODE %p is FORCECLOSED", vp)); vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif } VOP_UNLOCK(vp, 0); vdropl(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it.
*/ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp, 0); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) return (EBUSY); /* Success: release the caller's rootrefs references on the root vnode. */ for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. * The vnode must be exclusively locked by the caller. Returns 1 if the * vnode had a zero use count and was passed to vgonel(), 0 otherwise. */ int vrecycle(struct vnode *vp, struct thread *td) { int recycled; ASSERT_VOP_ELOCKED(vp, "vrecycle"); recycled = 0; VI_LOCK(vp); if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } VI_UNLOCK(vp); return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * vgone, with the vp interlock held. The vnode must also be * exclusively locked and held. The interlock is dropped while the * vnode is cleaned out and is held again on return. */ void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; struct mount *mp; CTR1(KTR_VFS, "vgonel: vp %p", vp); ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) vinvalbuf(vp, 0, 0, 0); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed.
*/ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Clear the advisory locks and wake up waiting threads. */ lf_purgelocks(vp, &(vp->v_lockf)); /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. The interlock taken here is still held on return. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. * vp->v_rdev is dereferenced without a NULL check. */ int vcount(struct vnode *vp) { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev * as argument. */ int count_dev(struct cdev *dev) { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. typename[] below is indexed * by v_type. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...)
{ va_list ap; char buf[256], buf2[16]; u_long flags; /* Print the caller's prefix first, then the vnode address and counters. */ va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); /* NOTE(review): typename[] is indexed by v_type with no range check. */ printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); /* * buf collects '|'-prefixed flag names and is printed from buf + 1 * to skip the leading '|'. buf[1] is cleared too so that buf + 1 is * an empty string when no flag is set. */ buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_TEXT) strlcat(buf, "|VV_TEXT", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); /* Any v_vflag bits not named above are appended in hex. */ flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_AGE) strlcat(buf, "|VI_AGE", sizeof(buf)); if (vp->v_iflag & VI_DOOMED) strlcat(buf, "|VI_DOOMED", sizeof(buf)); if (vp->v_iflag & VI_FREE) strlcat(buf, "|VI_FREE", sizeof(buf)); if (vp->v_iflag & VI_OBJDIRTY) strlcat(buf, "|VI_OBJDIRTY", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); /* Any v_iflag bits not named above are appended in hex. */ flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE | VI_OBJDIRTY | VI_DOINGINACT |
VI_OWEINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count); printf(" "); lockmgr_printinfo(vp->v_vnlock); printf("\n"); /* Let the filesystem print its private data, if there is any. */ if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp, *nmp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vprint("", vp); } /* NOTE(review): nmp was already set at the top of the loop body; this second assignment looks redundant. */ nmp = TAILQ_NEXT(mp, mnt_list); } } /* * Show details about the given vnode. Does nothing when no address * is supplied. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct statfs *sp; struct vnode *vp; char buf[512]; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points.
*/ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; flags = mp->mnt_flag; /* * MNT_FLAG appends the name of a set flag to buf, comma separated, * and clears the bit from flags. (#flag) + 4 skips the four * character "MNT_" prefix of the stringified name. */ #define MNT_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG /* Bits left in flags have no name above; print them in hex. */ if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; /* Same as MNT_FLAG, but skips the five character "MNTK_" prefix. */ #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_NOINSMNTQ); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND);
MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_MPSAFE); MNT_KERN_FLAG(MNTK_NOKNOTE); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (mp->mnt_cred->cr_prison != NULL) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d\n", mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); db_printf(" mnt_noasync = %u\n", mp->mnt_noasync); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ?
mp->mnt_gjprovider : "NULL"); db_printf("\n"); /* Finish with every vnode on the mount's list, skipping VMARKER entries. */ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. * * NOTE(review): vfc_name is copied with an unbounded strcpy(); this * relies on the destination array being at least as large as the * source -- confirm against the structure definitions. */ static void vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp) { strcpy(xvfsp->vfc_name, vfsp->vfc_name); xvfsp->vfc_typenum = vfsp->vfc_typenum; xvfsp->vfc_refcount = vfsp->vfc_refcount; xvfsp->vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp->vfc_vfsops = NULL; xvfsp->vfc_next = NULL; } /* * Top level filesystem related information gathering. * * Emits one zeroed-then-filled struct xvfsconf per entry on the vfsconf * list via SYSCTL_OUT and stops at the first error, which is returned. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; struct xvfsconf xvfsp; int error; error = 0; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp); if (error) break; } return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; struct xvfsconf xvfsp; printf("WARNING: userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC.
*/ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. 
*/ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp; struct thread *td; int error; KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread")); td = curthread; /* * Since this only runs when rebooting, it is not interlocked. 
*/ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, td); if (error) { /* The unmount failed: take mp off the list here so that the loop still makes progress. */ TAILQ_REMOVE(&mountlist, mp, mnt_list); /* * XXX: Due to the way in which we mount the root * file system off of devfs, devfs will generate a * "busy" warning when we try to unmount it before * the root. Don't print a warning as a result in * order to avoid false positive errors that may * cause needless upset. */ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } else { /* The unmount has removed mp from the mountlist */ } } } /* * Perform msync on all vnodes under a mount point. * The mount point must be locked. * * Only vnodes flagged VI_OBJDIRTY are visited. Unless flags is * MNT_WAIT, vnodes that are currently locked are skipped. Pages are * cleaned with OBJPC_SYNC for MNT_WAIT and OBJPC_NOSYNC otherwise. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OBJDIRTY) && (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { MNT_IUNLOCK(mp); if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); MNT_ILOCK(mp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_UNLOCK(obj); } vput(vp); } MNT_ILOCK(mp); } else VI_UNLOCK(vp); } MNT_IUNLOCK(mp); } /* * Mark a vnode as free, putting it up for recycling.
*/ static void vfree(struct vnode *vp) { CTR1(KTR_VFS, "vfree vp %p", vp); ASSERT_VI_LOCKED(vp, "vfree"); mtx_lock(&vnode_free_list_mtx); VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't")); VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp, ("vfree: Freeing doomed vnode")); /* Vnodes flagged VI_AGE go to the head of the free list, all others to the tail. */ if (vp->v_iflag & VI_AGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); } /* * Opposite of vfree() - mark a vnode as in use. Removes the vnode * from the free list and clears VI_FREE and VI_AGE. */ static void vbusy(struct vnode *vp) { CTR1(KTR_VFS, "vbusy vp %p", vp); ASSERT_VI_LOCKED(vp, "vbusy"); VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed.")); mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; vp->v_iflag &= ~(VI_FREE|VI_AGE); mtx_unlock(&vnode_free_list_mtx); } /* * Tear down a vpollinfo: destroy its knlist and mutex, then return it * to vnodepoll_zone. */ static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); } /* * Initialize per-vnode helper structure to hold poll-related state. * Does nothing if the vnode already has one. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = uma_zalloc(vnodepoll_zone, M_WAITOK); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knllocked); VI_LOCK(vp); /* Recheck under the interlock: if v_pollinfo was set while we were allocating, discard our copy. */ if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode.
Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. 
*/ int vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp, 0); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; mtx_unlock(&sync_mtx); BO_UNLOCK(bo); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. 
*/ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } MNT_ILOCK(mp); mp->mnt_noasync++; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY, ap->a_td); MNT_ILOCK(mp); mp->mnt_noasync--; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; MNT_IUNLOCK(mp); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } BO_UNLOCK(bo); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; error = 0; dev_lock(); if (vp->v_type != VCHR) error = ENOTBLK; else if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. 
* * The ifdef'd CAPABILITIES version is here for reference, but is not * actually used. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused) { accmode_t dac_granted; accmode_t priv_granted; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. 
*/ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to supress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, ""); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. 
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, ""); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, ""); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, ""); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == 0) vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER) vfs_badlock("is not exclusive locked by another thread", str, vp); } void assert_vop_slocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED) vfs_badlock("is not locked shared but should be", str, vp); } #endif /* 0 */ #endif /* DEBUG_VFS_LOCKS */ void 
vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp != a->a_fdvp && a->a_tvp != a->a_fdvp) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp != a->a_fvp) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } void vop_strategy_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. */ if ((bp->b_flags & B_CLUSTER) != 0) return; if (!BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } #endif } void vop_lookup_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); #endif } void vop_lookup_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; struct vnode *vp; a = ap; dvp = a->a_dvp; vp = *(a->a_vpp); ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); if (!rc) ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)"); #endif } void vop_lock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_lock_post(void *ap, 
int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_unlock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_unlock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; if (!rc) { VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void 
vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init(&fs_knlist, NULL, NULL, NULL, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? 
*/ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { 1, NULL, filt_vfsdetach, filt_vfsread }; static struct filterops vfswrite_filtops = { 1, NULL, filt_vfsdetach, filt_vfswrite }; static struct filterops vfsvnode_filtops = { 1, NULL, filt_vfsdetach, filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0); } static int vfs_knllocked(void *arg) { struct vnode *vp = arg; return (VOP_ISLOCKED(vp) == LK_EXCLUSIVE); } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. 
*/ if (hint == NOTE_REVOKE) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); kn->kn_data = va.va_size - kn->kn_fp->f_offset; return (kn->kn_data != 0); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; return (1); } return (kn->kn_fflags != 0); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VA_MARK_ATIME. This functionality is used by execve * and mmap, so we want to avoid the synchronous I/O implied by * directly setting va_atime for the sake of efficiency. 
*/ void vfs_mark_atime(struct vnode *vp, struct ucred *cred) { struct vattr atimeattr; if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) { VATTR_NULL(&atimeattr); atimeattr.va_vaflags |= VA_MARK_ATIME; (void)VOP_SETATTR(vp, &atimeattr, cred); } } Index: head/sys/sys/mount.h =================================================================== --- head/sys/sys/mount.h (revision 186196) +++ head/sys/sys/mount.h (revision 186197) @@ -1,780 +1,781 @@ /*- * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)mount.h	8.21 (Berkeley) 5/20/95
 * $FreeBSD$
 */

#ifndef _SYS_MOUNT_H_
#define _SYS_MOUNT_H_

/*
 * NOTE(review): the include directives below carry no header names in
 * this copy of the file; the targets appear to have been lost and need
 * to be restored from the repository before this header can be used.
 */
#include
#include
#ifdef _KERNEL
#include
#include
#include
#endif

/*
 * NOTE: When changing statfs structure, mount structure, MNT_* flags or
 * MNTK_* flags also update DDB show mount command in vfs_subr.c.
 */

typedef struct fsid { int32_t val[2]; } fsid_t;	/* filesystem id type */

/*
 * File identifier.
 * These are unique per filesystem on a single machine.
 */
#define	MAXFIDSZ	16

struct fid {
	u_short		fid_len;		/* length of data in bytes */
	u_short		fid_reserved;		/* force longword alignment */
	char		fid_data[MAXFIDSZ];	/* data (variable length) */
};

/*
 * filesystem statistics
 *
 * f_version carries the structure version (see STATFS_VERSION); the
 * name arrays are fixed size, MFSNAMELEN and MNAMELEN bytes.
 */
#define	MFSNAMELEN	16		/* length of type name including null */
#define	MNAMELEN	88		/* size of on/from name bufs */
#define	STATFS_VERSION	0x20030518	/* current version number */
struct statfs {
	uint32_t f_version;		/* structure version number */
	uint32_t f_type;		/* type of filesystem */
	uint64_t f_flags;		/* copy of mount exported flags */
	uint64_t f_bsize;		/* filesystem fragment size */
	uint64_t f_iosize;		/* optimal transfer block size */
	uint64_t f_blocks;		/* total data blocks in filesystem */
	uint64_t f_bfree;		/* free blocks in filesystem */
	int64_t	 f_bavail;		/* free blocks avail to non-superuser */
	uint64_t f_files;		/* total file nodes in filesystem */
	int64_t	 f_ffree;		/* free nodes avail to non-superuser */
	uint64_t f_syncwrites;		/* count of sync writes since mount */
	uint64_t f_asyncwrites;		/* count of async writes since mount */
	uint64_t f_syncreads;		/* count of sync reads since mount */
	uint64_t f_asyncreads;		/* count of async reads since mount */
	uint64_t f_spare[10];		/* unused spare */
	uint32_t f_namemax;		/* maximum filename length */
	uid_t	  f_owner;		/* user that mounted the filesystem */
	fsid_t	  f_fsid;		/* filesystem id */
	char	  f_charspare[80];	    /* spare string space */
	char	  f_fstypename[MFSNAMELEN]; /* filesystem type name */
	char	  f_mntfromname[MNAMELEN];  /* mounted filesystem */
	char	  f_mntonname[MNAMELEN];    /* directory on which mounted */
};

#ifdef _KERNEL
#define	OMFSNAMELEN	16	/* length of fs type name, including null */
#define	OMNAMELEN	(88 - 2 * sizeof(long))	/* size of on/from name bufs */

/* XXX getfsstat.2 is out of date with write and read counter changes here. */
/* XXX statfs.2 is out of date with read counter changes here. */
/*
 * Older statfs layout built from long-sized counters.  Presumably kept
 * for binary compatibility with old consumers -- confirm against the
 * compat syscall code.  Field order and sizes must not change.
 */
struct ostatfs {
	long	f_spare2;		/* placeholder */
	long	f_bsize;		/* fundamental filesystem block size */
	long	f_iosize;		/* optimal transfer block size */
	long	f_blocks;		/* total data blocks in filesystem */
	long	f_bfree;		/* free blocks in fs */
	long	f_bavail;		/* free blocks avail to non-superuser */
	long	f_files;		/* total file nodes in filesystem */
	long	f_ffree;		/* free file nodes in fs */
	fsid_t	f_fsid;			/* filesystem id */
	uid_t	f_owner;		/* user that mounted the filesystem */
	int	f_type;			/* type of filesystem */
	int	f_flags;		/* copy of mount exported flags */
	long	f_syncwrites;		/* count of sync writes since mount */
	long	f_asyncwrites;		/* count of async writes since mount */
	char	f_fstypename[OMFSNAMELEN]; /* fs type name */
	char	f_mntonname[OMNAMELEN];	/* directory on which mounted */
	long	f_syncreads;		/* count of sync reads since mount */
	long	f_asyncreads;		/* count of async reads since mount */
	short	f_spares1;		/* unused spare */
	char	f_mntfromname[OMNAMELEN];/* mounted filesystem */
	short	f_spares2;		/* unused spare */
	/*
	 * XXX on machines where longs are aligned to 8-byte boundaries, there
	 * is an unnamed int32_t here.  This spare was after the apparent end
	 * of the struct until we bit off the read counters from f_mntonname.
	 */
	long	f_spare[2];		/* unused spare */
};

#define	MMAXOPTIONLEN	65536		/* maximum length of a mount option */

TAILQ_HEAD(vnodelst, vnode);

/* Opaque here; only pointers to these are stored in struct mount. */
struct vfsoptlist;
struct vfsopt;

/*
 * Structure per mounted filesystem.  Each mounted filesystem has an
 * array of operations and an instance record.  The filesystems are
 * put on a doubly linked list.
 *
 * Lock reference:
 *	m - mountlist_mtx
 *	i - interlock
 *
 * Unmarked fields are considered stable as long as a ref is held.
 *
 * mnt_startzero and mnt_endzero name the first field of, and the first
 * field after, a contiguous range; presumably that range is zeroed when
 * a mount structure is (re)initialized -- confirm against the allocator.
 */
struct mount {
	struct mtx	mnt_mtx;		/* mount structure interlock */
	int		mnt_gen;		/* struct mount generation */
#define	mnt_startzero	mnt_list
	TAILQ_ENTRY(mount) mnt_list;		/* (m) mount list */
	struct vfsops	*mnt_op;		/* operations on fs */
	struct vfsconf	*mnt_vfc;		/* configuration info */
	struct vnode	*mnt_vnodecovered;	/* vnode we mounted on */
	struct vnode	*mnt_syncer;		/* syncer vnode */
	int		mnt_ref;		/* (i) Reference count */
	struct vnodelst	mnt_nvnodelist;		/* (i) list of vnodes */
	int		mnt_nvnodelistsize;	/* (i) # of vnodes */
	int		mnt_writeopcount;	/* (i) write syscalls pending */
	int		mnt_kern_flag;		/* (i) kernel only flags */
	u_int		mnt_flag;		/* (i) flags shared with user */
	u_int		mnt_noasync;		/* (i) # noasync overrides */
	struct vfsoptlist *mnt_opt;		/* current mount options */
	struct vfsoptlist *mnt_optnew;		/* new options passed to fs */
	int		mnt_maxsymlinklen;	/* max size of short symlink */
	struct statfs	mnt_stat;		/* cache of filesystem stats */
	struct ucred	*mnt_cred;		/* credentials of mounter */
	void *		mnt_data;		/* private data */
	time_t		mnt_time;		/* last time written */
	int		mnt_iosize_max;		/* max size for clusters, etc */
	struct netexport *mnt_export;		/* export list */
	struct label	*mnt_label;		/* MAC label for the fs */
	u_int		mnt_hashseed;		/* Random seed for vfs_hash */
	int		mnt_lockref;		/* (i) Lock reference count */
	int		mnt_secondary_writes;   /* (i) # of secondary writes */
	int		mnt_secondary_accwrites;/* (i) secondary wr. starts */
	struct thread	*mnt_susp_owner;	/* (i) thread owning suspension */
#define	mnt_endzero	mnt_gjprovider
	char		*mnt_gjprovider;	/* gjournal provider name */
	struct lock	mnt_explock;		/* vfs_export walkers lock */
};

struct vnode *__mnt_vnode_next(struct vnode **mvp, struct mount *mp);
struct vnode *__mnt_vnode_first(struct vnode **mvp, struct mount *mp);
void          __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp);

/*
 * Iterate vp over the vnodes of mp; mvp is iterator state that is passed
 * by address to the __mnt_vnode_* helpers.  The loop ends when a helper
 * returns NULL.
 */
#define MNT_VNODE_FOREACH(vp, mp, mvp) \
	for (vp = __mnt_vnode_first(&(mvp), (mp)); \
		(vp) != NULL; vp = __mnt_vnode_next(&(mvp), (mp)))

/*
 * Release the iterator state when leaving MNT_VNODE_FOREACH early.
 * The _ILOCKED form calls the helper directly; the plain form wraps the
 * same call in MNT_ILOCK()/MNT_IUNLOCK().
 */
#define MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp) \
	__mnt_vnode_markerfree(&(mvp), (mp))

#define MNT_VNODE_FOREACH_ABORT(mp, mvp)				\
        do {								\
	  MNT_ILOCK(mp);						\
          MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);			\
	  MNT_IUNLOCK(mp);						\
	} while (0)

/* Mount interlock (mnt_mtx) accessors. */
#define	MNT_ILOCK(mp)	mtx_lock(&(mp)->mnt_mtx)
#define	MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx)
#define	MNT_IUNLOCK(mp)	mtx_unlock(&(mp)->mnt_mtx)
#define	MNT_MTX(mp)	(&(mp)->mnt_mtx)
/*
 * Reference counting on mnt_ref.  Neither macro takes a lock itself;
 * mnt_ref is annotated (i) above, so callers presumably hold the mount
 * interlock -- confirm at call sites.  MNT_REL() calls wakeup() on the
 * mount when the count reaches zero.
 */
#define	MNT_REF(mp)	(mp)->mnt_ref++
#define	MNT_REL(mp)	do {						\
	(mp)->mnt_ref--;						\
	if ((mp)->mnt_ref == 0)						\
		wakeup((mp));						\
} while (0)

#endif /* _KERNEL */

/*
 * User specifiable flags.
*/ #define MNT_RDONLY 0x00000001 /* read only filesystem */ #define MNT_SYNCHRONOUS 0x00000002 /* filesystem written synchronously */ #define MNT_NOEXEC 0x00000004 /* can't exec from filesystem */ #define MNT_NOSUID 0x00000008 /* don't honor setuid bits on fs */ #define MNT_UNION 0x00000020 /* union with underlying filesystem */ #define MNT_ASYNC 0x00000040 /* filesystem written asynchronously */ #define MNT_SUIDDIR 0x00100000 /* special handling of SUID on dirs */ #define MNT_SOFTDEP 0x00200000 /* soft updates being done */ #define MNT_NOSYMFOLLOW 0x00400000 /* do not follow symlinks */ #define MNT_GJOURNAL 0x02000000 /* GEOM journal support enabled */ #define MNT_MULTILABEL 0x04000000 /* MAC support for individual objects */ #define MNT_ACLS 0x08000000 /* ACL support enabled */ #define MNT_NOATIME 0x10000000 /* disable update of file access time */ #define MNT_NOCLUSTERR 0x40000000 /* disable cluster read */ #define MNT_NOCLUSTERW 0x80000000 /* disable cluster write */ /* * NFS export related mount flags. */ #define MNT_EXRDONLY 0x00000080 /* exported read only */ #define MNT_EXPORTED 0x00000100 /* filesystem is exported */ #define MNT_DEFEXPORTED 0x00000200 /* exported to the world */ #define MNT_EXPORTANON 0x00000400 /* use anon uid mapping for everyone */ #define MNT_EXKERB 0x00000800 /* exported with Kerberos uid mapping */ #define MNT_EXPUBLIC 0x20000000 /* public export (WebNFS) */ /* * Flags set by internal operations, * but visible to the user. * XXX some of these are not quite right.. (I've never seen the root flag set) */ #define MNT_LOCAL 0x00001000 /* filesystem is stored locally */ #define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */ #define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */ #define MNT_USER 0x00008000 /* mounted by a user */ #define MNT_IGNORE 0x00800000 /* do not show entry in df */ /* * Mask of flags that are visible to statfs(). 
* XXX I think that this could now become (~(MNT_CMDFLAGS)) * but the 'mount' program may need changing to handle this. */ #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ MNT_NOSUID | MNT_UNION | \ MNT_ASYNC | MNT_EXRDONLY | MNT_EXPORTED | \ MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ MNT_LOCAL | MNT_USER | MNT_QUOTA | \ MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | \ MNT_NOATIME | \ MNT_NOSYMFOLLOW | MNT_IGNORE | \ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR | \ MNT_ACLS | MNT_USER) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. * XXX: These are not STATES and really should be somewhere else. * XXX: MNT_BYFSID collides with MNT_ACLS, but because MNT_ACLS is only used for * mount(2) and MNT_BYFSID is only used for unmount(2) it's harmless. */ #define MNT_UPDATE 0x00010000 /* not a real mount, just an update */ #define MNT_DELEXPORT 0x00020000 /* delete export host lists */ #define MNT_RELOAD 0x00040000 /* reload filesystem data */ #define MNT_FORCE 0x00080000 /* force unmount or readonly change */ #define MNT_SNAPSHOT 0x01000000 /* snapshot the filesystem */ #define MNT_BYFSID 0x08000000 /* specify filesystem by ID. */ #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_BYFSID) /* * Still available. */ #define MNT_SPARE_0x00000010 0x00000010 /* * Internal filesystem control flags stored in mnt_kern_flag. * * MNTK_UNMOUNT locks the mount entry so that name lookup cannot proceed * past the mount point. This keeps the subtree stable during mounts * and unmounts. * * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while * dounmount() is still waiting to lock the mountpoint. 
This allows * the filesystem to cancel operations that might otherwise deadlock * with the unmount attempt (used by NFS). * * MNTK_NOINSMNTQ is strict subset of MNTK_UNMOUNT. They are separated * to allow for failed unmount attempt to restore the syncer vnode for * the mount. */ #define MNTK_UNMOUNTF 0x00000001 /* forced unmount in progress */ #define MNTK_ASYNC 0x00000002 /* filtered async flag */ #define MNTK_SOFTDEP 0x00000004 /* async disabled by softdep */ #define MNTK_NOINSMNTQ 0x00000008 /* insmntque is not allowed */ #define MNTK_DRAINING 0x00000010 /* lock draining is happening */ +#define MNTK_REFEXPIRE 0x00000020 /* refcount expiring is happening */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ #define MNTK_SUSPEND2 0x04000000 /* block secondary writes */ #define MNTK_SUSPENDED 0x10000000 /* write operations are suspended */ #define MNTK_MPSAFE 0x20000000 /* Filesystem is MPSAFE. */ #define MNTK_NOKNOTE 0x80000000 /* Don't send KNOTEs from VOP hooks */ #define MNTK_LOOKUP_SHARED 0x40000000 /* FS supports shared lock lookups */ /* * Sysctl CTL_VFS definitions. * * Second level identifier specifies which filesystem. Second level * identifier VFS_VFSCONF returns information about all filesystems. * Second level identifier VFS_GENERIC is non-terminal. */ #define VFS_VFSCONF 0 /* get configured filesystems */ #define VFS_GENERIC 0 /* generic filesystem information */ /* * Third level identifiers for VFS_GENERIC are given below; third * level identifiers for specific filesystems are given in their * mount specific header files. */ #define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */ #define VFS_CONF 2 /* struct: vfsconf for filesystem given as next argument */ /* * Flags for various system call interfaces. 
* * waitfor flags to vfs_sync() and getfsstat() */ #define MNT_WAIT 1 /* synchronously wait for I/O to complete */ #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ #define MNT_LAZY 3 /* push data not written by filesystem syncer */ #define MNT_SUSPEND 4 /* Suspend file system after sync */ /* * Generic file handle */ struct fhandle { fsid_t fh_fsid; /* Filesystem id of mount point */ struct fid fh_fid; /* Filesys specific id */ }; typedef struct fhandle fhandle_t; /* * Old export arguments without security flavor list */ struct oexport_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ }; /* * Export arguments for local filesystem mount calls. */ #define MAXSECFLAVORS 5 struct export_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ int ex_numsecflavors; /* security flavor count */ int ex_secflavors[MAXSECFLAVORS]; /* list of security flavors */ }; /* * Structure holding information for a publicly exported filesystem * (WebNFS). Currently the specs allow just for one such filesystem. 
*/ struct nfs_public { int np_valid; /* Do we hold valid information */ fhandle_t np_handle; /* Filehandle for pub fs (internal) */ struct mount *np_mount; /* Mountpoint of exported fs */ char *np_index; /* Index file */ }; /* * Filesystem configuration information. One of these exists for each * type of filesystem supported by the kernel. These are searched at * mount time to identify the requested filesystem. * * XXX: Never change the first two arguments! */ struct vfsconf { u_int vfc_version; /* ABI version number */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ struct vfsops *vfc_vfsops; /* filesystem operations vector */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsoptdecl *vfc_opts; /* mount options */ TAILQ_ENTRY(vfsconf) vfc_list; /* list of vfscons */ }; /* Userland version of the struct vfsconf. */ struct xvfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsconf *vfc_next; /* next in list */ }; #ifndef BURN_BRIDGES struct ovfsconf { void *vfc_vfsops; char vfc_name[32]; int vfc_index; int vfc_refcount; int vfc_flags; }; #endif /* * NB: these flags refer to IMPLEMENTATION properties, not properties of * any actual mounts; i.e., it does not make sense to change the flags. 
*/ #define VFCF_STATIC 0x00010000 /* statically compiled into kernel */ #define VFCF_NETWORK 0x00020000 /* may get data over the network */ #define VFCF_READONLY 0x00040000 /* writes are not implemented */ #define VFCF_SYNTHETIC 0x00080000 /* data does not represent real files */ #define VFCF_LOOPBACK 0x00100000 /* aliases some other mounted FS */ #define VFCF_UNICODE 0x00200000 /* stores file names as Unicode */ #define VFCF_JAIL 0x00400000 /* can be mounted from within a jail */ #define VFCF_DELEGADMIN 0x00800000 /* supports delegated administration */ typedef uint32_t fsctlop_t; struct vfsidctl { int vc_vers; /* should be VFSIDCTL_VERS1 (below) */ fsid_t vc_fsid; /* fsid to operate on */ char vc_fstypename[MFSNAMELEN]; /* type of fs 'nfs' or '*' */ fsctlop_t vc_op; /* operation VFS_CTL_* (below) */ void *vc_ptr; /* pointer to data structure */ size_t vc_len; /* sizeof said structure */ u_int32_t vc_spare[12]; /* spare (must be zero) */ }; /* vfsidctl API version. */ #define VFS_CTL_VERS1 0x01 /* * New style VFS sysctls, do not reuse/conflict with the namespace for * private sysctls. * All "global" sysctl ops have the 33rd bit set: * 0x...1.... * Private sysctl ops should have the 33rd bit unset. */ #define VFS_CTL_QUERY 0x00010001 /* anything wrong? 
(vfsquery) */ #define VFS_CTL_TIMEO 0x00010002 /* set timeout for vfs notification */ #define VFS_CTL_NOLOCKS 0x00010003 /* disable file locking */ struct vfsquery { u_int32_t vq_flags; u_int32_t vq_spare[31]; }; /* vfsquery flags */ #define VQ_NOTRESP 0x0001 /* server down */ #define VQ_NEEDAUTH 0x0002 /* server bad auth */ #define VQ_LOWDISK 0x0004 /* we're low on space */ #define VQ_MOUNT 0x0008 /* new filesystem arrived */ #define VQ_UNMOUNT 0x0010 /* filesystem has left */ #define VQ_DEAD 0x0020 /* filesystem is dead, needs force unmount */ #define VQ_ASSIST 0x0040 /* filesystem needs assistance from external program */ #define VQ_NOTRESPLOCK 0x0080 /* server lockd down */ #define VQ_FLAG0100 0x0100 /* placeholder */ #define VQ_FLAG0200 0x0200 /* placeholder */ #define VQ_FLAG0400 0x0400 /* placeholder */ #define VQ_FLAG0800 0x0800 /* placeholder */ #define VQ_FLAG1000 0x1000 /* placeholder */ #define VQ_FLAG2000 0x2000 /* placeholder */ #define VQ_FLAG4000 0x4000 /* placeholder */ #define VQ_FLAG8000 0x8000 /* placeholder */ #ifdef _KERNEL /* Point a sysctl request at a vfsidctl's data. */ #define VCTLTOREQ(vc, req) \ do { \ (req)->newptr = (vc)->vc_ptr; \ (req)->newlen = (vc)->vc_len; \ (req)->newidx = 0; \ } while (0) #endif struct iovec; struct uio; #ifdef _KERNEL /* * vfs_busy specific flags and mask. */ #define MBF_NOWAIT 0x01 #define MBF_MNTLSTLOCK 0x02 #define MBF_MASK (MBF_NOWAIT | MBF_MNTLSTLOCK) #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_MOUNT); #endif extern int maxvfsconf; /* highest defined filesystem type */ extern int nfs_mount_type; /* vfc_typenum for nfs, or -1 */ TAILQ_HEAD(vfsconfhead, vfsconf); extern struct vfsconfhead vfsconf; /* * Operations supported on mounted filesystem. 
*/ struct mount_args; struct nameidata; struct sysctl_req; struct mntarg; typedef int vfs_cmount_t(struct mntarg *ma, void *data, int flags, struct thread *td); typedef int vfs_unmount_t(struct mount *mp, int mntflags, struct thread *td); typedef int vfs_root_t(struct mount *mp, int flags, struct vnode **vpp, struct thread *td); typedef int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid, void *arg, struct thread *td); typedef int vfs_statfs_t(struct mount *mp, struct statfs *sbp, struct thread *td); typedef int vfs_sync_t(struct mount *mp, int waitfor, struct thread *td); typedef int vfs_vget_t(struct mount *mp, ino_t ino, int flags, struct vnode **vpp); typedef int vfs_fhtovp_t(struct mount *mp, struct fid *fhp, struct vnode **vpp); typedef int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); typedef int vfs_init_t(struct vfsconf *); typedef int vfs_uninit_t(struct vfsconf *); typedef int vfs_extattrctl_t(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname, struct thread *td); typedef int vfs_mount_t(struct mount *mp, struct thread *td); typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op, struct sysctl_req *req); typedef void vfs_susp_clean_t(struct mount *mp); struct vfsops { vfs_mount_t *vfs_mount; vfs_cmount_t *vfs_cmount; vfs_unmount_t *vfs_unmount; vfs_root_t *vfs_root; vfs_quotactl_t *vfs_quotactl; vfs_statfs_t *vfs_statfs; vfs_sync_t *vfs_sync; vfs_vget_t *vfs_vget; vfs_fhtovp_t *vfs_fhtovp; vfs_checkexp_t *vfs_checkexp; vfs_init_t *vfs_init; vfs_uninit_t *vfs_uninit; vfs_extattrctl_t *vfs_extattrctl; vfs_sysctl_t *vfs_sysctl; vfs_susp_clean_t *vfs_susp_clean; }; vfs_statfs_t __vfs_statfs; #define VFS_MOUNT(MP, P) (*(MP)->mnt_op->vfs_mount)(MP, P) #define VFS_UNMOUNT(MP, FORCE, P) (*(MP)->mnt_op->vfs_unmount)(MP, FORCE, P) #define VFS_ROOT(MP, FLAGS, VPP, P) \ (*(MP)->mnt_op->vfs_root)(MP, FLAGS, VPP, P) #define 
VFS_QUOTACTL(MP,C,U,A,P) (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A, P) #define VFS_STATFS(MP, SBP, P) __vfs_statfs((MP), (SBP), (P)) #define VFS_SYNC(MP, WAIT, P) (*(MP)->mnt_op->vfs_sync)(MP, WAIT, P) #define VFS_VGET(MP, INO, FLAGS, VPP) \ (*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP) #define VFS_FHTOVP(MP, FIDP, VPP) \ (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, VPP) #define VFS_CHECKEXP(MP, NAM, EXFLG, CRED, NUMSEC, SEC) \ (*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED, NUMSEC, SEC) #define VFS_EXTATTRCTL(MP, C, FN, NS, N, P) \ (*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N, P) #define VFS_SYSCTL(MP, OP, REQ) \ (*(MP)->mnt_op->vfs_sysctl)(MP, OP, REQ) #define VFS_SUSP_CLEAN(MP) \ ({if (*(MP)->mnt_op->vfs_susp_clean != NULL) \ (*(MP)->mnt_op->vfs_susp_clean)(MP); }) extern int mpsafe_vfs; #define VFS_NEEDSGIANT_(MP) \ (!mpsafe_vfs || ((MP) != NULL && ((MP)->mnt_kern_flag & MNTK_MPSAFE) == 0)) #define VFS_NEEDSGIANT(MP) __extension__ \ ({ \ struct mount *_mp; \ _mp = (MP); \ VFS_NEEDSGIANT_(_mp); \ }) #define VFS_LOCK_GIANT(MP) __extension__ \ ({ \ int _locked; \ struct mount *_mp; \ _mp = (MP); \ if (VFS_NEEDSGIANT_(_mp)) { \ mtx_lock(&Giant); \ _locked = 1; \ } else \ _locked = 0; \ _locked; \ }) #define VFS_UNLOCK_GIANT(locked) if ((locked)) mtx_unlock(&Giant); #define VFS_ASSERT_GIANT(MP) do \ { \ struct mount *_mp; \ _mp = (MP); \ if (VFS_NEEDSGIANT_(_mp)) \ mtx_assert(&Giant, MA_OWNED); \ } while (0) #define VFS_KNOTE_LOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 1); \ } while (0) #define VFS_KNOTE_UNLOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 0); \ } while (0) #include /* * Version numbers. 
*/ #define VFS_VERSION_00 0x19660120 #define VFS_VERSION VFS_VERSION_00 #define VFS_SET(vfsops, fsname, flags) \ static struct vfsconf fsname ## _vfsconf = { \ .vfc_version = VFS_VERSION, \ .vfc_name = #fsname, \ .vfc_vfsops = &vfsops, \ .vfc_typenum = -1, \ .vfc_flags = flags, \ }; \ static moduledata_t fsname ## _mod = { \ #fsname, \ vfs_modevent, \ & fsname ## _vfsconf \ }; \ DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE) extern char *mountrootfsname; /* * exported vnode operations */ int dounmount(struct mount *, int, struct thread *); int kernel_mount(struct mntarg *ma, int flags); int kernel_vmount(int flags, ...); struct mntarg *mount_arg(struct mntarg *ma, const char *name, const void *val, int len); struct mntarg *mount_argb(struct mntarg *ma, int flag, const char *name); struct mntarg *mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...); struct mntarg *mount_argsu(struct mntarg *ma, const char *name, const void *val, int len); void statfs_scale_blocks(struct statfs *sf, long max_size); struct vfsconf *vfs_byname(const char *); struct vfsconf *vfs_byname_kld(const char *, struct thread *td, int *); void vfs_mount_destroy(struct mount *); void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); void vfs_freeopts(struct vfsoptlist *opts); void vfs_deleteopt(struct vfsoptlist *opts, const char *name); int vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val); int vfs_getopt(struct vfsoptlist *, const char *, void **, int *); char *vfs_getopts(struct vfsoptlist *, const char *, int *error); int vfs_copyopt(struct vfsoptlist *, const char *, void *, int); int vfs_filteropt(struct vfsoptlist *, const char **legal); int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...); int vfs_setpublicfs /* set publicly exported fs */ (struct mount *, struct netexport *, struct export_args *); void vfs_msync(struct mount *, int); int vfs_busy(struct mount *, int); int vfs_export /* process 
mount export info */ (struct mount *, struct export_args *); int vfs_allocate_syncvnode(struct mount *); int vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions); void vfs_getnewfsid(struct mount *); struct cdev *vfs_getrootfsid(struct mount *); struct mount *vfs_getvfs(fsid_t *); /* return vfs given fsid */ struct mount *vfs_busyfs(fsid_t *); int vfs_modevent(module_t, int, void *); void vfs_mount_error(struct mount *, const char *, ...); void vfs_mountroot(void); /* mount our root filesystem */ void vfs_mountedfrom(struct mount *, const char *from); void vfs_ref(struct mount *); void vfs_rel(struct mount *); struct mount *vfs_mount_alloc(struct vnode *, struct vfsconf *, const char *, struct ucred *); int vfs_suser(struct mount *, struct thread *); void vfs_unbusy(struct mount *); void vfs_unmountall(void); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx mountlist_mtx; extern struct nfs_public nfs_pub; /* * Declarations for these vfs default operations are located in * kern/vfs_default.c, they should be used instead of making "dummy" * functions or casting entries in the VFS op table to "enopnotsupp()". 
*/ vfs_root_t vfs_stdroot; vfs_quotactl_t vfs_stdquotactl; vfs_statfs_t vfs_stdstatfs; vfs_sync_t vfs_stdsync; vfs_sync_t vfs_stdnosync; vfs_vget_t vfs_stdvget; vfs_fhtovp_t vfs_stdfhtovp; vfs_checkexp_t vfs_stdcheckexp; vfs_init_t vfs_stdinit; vfs_uninit_t vfs_stduninit; vfs_extattrctl_t vfs_stdextattrctl; vfs_sysctl_t vfs_stdsysctl; #else /* !_KERNEL */ #include struct stat; __BEGIN_DECLS int fhopen(const struct fhandle *, int); int fhstat(const struct fhandle *, struct stat *); int fhstatfs(const struct fhandle *, struct statfs *); int fstatfs(int, struct statfs *); int getfh(const char *, fhandle_t *); int getfsstat(struct statfs *, long, int); int getmntinfo(struct statfs **, int); int lgetfh(const char *, fhandle_t *); int mount(const char *, const char *, int, void *); int nmount(struct iovec *, unsigned int, int); int statfs(const char *, struct statfs *); int unmount(const char *, int); /* C library stuff */ int getvfsbyname(const char *, struct xvfsconf *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_MOUNT_H_ */