diff --git a/sys/fs/cd9660/cd9660_lookup.c b/sys/fs/cd9660/cd9660_lookup.c index 1adb7a18bcf1..75fcdc9152cd 100644 --- a/sys/fs/cd9660/cd9660_lookup.c +++ b/sys/fs/cd9660/cd9660_lookup.c @@ -1,463 +1,463 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include struct cd9660_ino_alloc_arg { ino_t ino; ino_t i_ino; struct iso_directory_record *ep; }; static int cd9660_ino_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { struct cd9660_ino_alloc_arg *dd_arg; dd_arg = arg; return (cd9660_vget_internal(mp, dd_arg->i_ino, lkflags, vpp, dd_arg->i_ino != dd_arg->ino, dd_arg->ep)); } /* * Convert a component of a pathname into a pointer to a locked inode. * This is a very central and rather complicated routine. * If the filesystem is not maintained in a strict tree hierarchy, * this can result in a deadlock situation (see comments in code below). * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it and the target of the pathname * exists, lookup returns both the target and its parent directory locked. * When creating or renaming and LOCKPARENT is specified, the target may * not be ".". When deleting and LOCKPARENT is specified, the target may * be "."., but the caller must check to ensure it does an vrele and iput * instead of two iputs. * * Overall outline of ufs_lookup: * * search for name in directory, to found or notfound * notfound: * if creating, return locked directory, leaving info on available slots * else return error * found: * if at end of path and deleting, return information to allow delete * if at end of path and rewriting (RENAME and LOCKPARENT), lock target * inode and return info to allow rewrite * if not at end, add name to cache; if at end and neither creating * nor deleting, add name to cache * * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode unlocked. */ int cd9660_lookup(struct vop_cachedlookup_args *ap) { struct vnode *vdp; /* vnode for directory being searched */ struct iso_node *dp; /* inode for directory being searched */ struct iso_mnt *imp; /* filesystem that directory is in */ struct buf *bp; /* a buffer of directory entries */ struct iso_directory_record *ep;/* the current directory entry */ struct iso_directory_record *ep2;/* copy of current directory entry */ int entryoffsetinblock; /* offset of ep in bp's buffer */ int saveoffset = 0; /* offset of last directory entry in dir */ doff_t i_diroff; /* cached i_diroff value. */ doff_t i_offset; /* cached i_offset value. */ int numdirpasses; /* strategy for directory search */ doff_t endsearch; /* offset to end directory search */ struct vnode *pdp; /* saved dp during symlink work */ struct vnode *tdp; /* returned by cd9660_vget_internal */ struct cd9660_ino_alloc_arg dd_arg; u_long bmask; /* block offset mask */ int error; ino_t ino, i_ino; int ltype, reclen; u_short namelen; int isoflags; char altname[NAME_MAX]; int res; int assoc, len; char *name; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; - int flags = cnp->cn_flags; + uint64_t flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; ep2 = ep = NULL; bp = NULL; *vpp = NULL; vdp = ap->a_dvp; dp = VTOI(vdp); imp = dp->i_mnt; /* * We now have a segment name to search for, and a directory to search. */ ino = reclen = 0; i_diroff = dp->i_diroff; len = cnp->cn_namelen; name = cnp->cn_nameptr; /* * A leading `=' means, we are looking for an associated file */ if ((assoc = (imp->iso_ftype != ISO_FTYPE_RRIP && *name == ASSOCCHAR))) { len--; name++; } /* * If there is cached information on a previous search of * this directory, pick up where we last left off. * We cache only lookups as these are the most common * and have the greatest payoff. Caching CREATE has little * benefit as it usually must search the entire directory * to determine that the entry does not exist. Caching the * location of the last DELETE or RENAME has not reduced * profiling time and hence has been removed in the interest * of simplicity. */ bmask = imp->im_bmask; if (nameiop != LOOKUP || i_diroff == 0 || i_diroff > dp->i_size) { entryoffsetinblock = 0; i_offset = 0; numdirpasses = 1; } else { i_offset = i_diroff; if ((entryoffsetinblock = i_offset & bmask) && (error = cd9660_blkatoff(vdp, (off_t)i_offset, NULL, &bp))) return (error); numdirpasses = 2; nchstats.ncs_2passes++; } endsearch = dp->i_size; searchloop: while (i_offset < endsearch) { /* * If offset is on a block boundary, * read the next directory block. * Release previous if it exists. */ if ((i_offset & bmask) == 0) { if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vdp, (off_t)i_offset, NULL, &bp)) != 0) return (error); entryoffsetinblock = 0; } /* * Get pointer to next entry. */ ep = (struct iso_directory_record *) ((char *)bp->b_data + entryoffsetinblock); reclen = isonum_711(ep->length); if (reclen == 0) { /* skip to next block, if any */ i_offset = (i_offset & ~bmask) + imp->logical_block_size; continue; } if (reclen < ISO_DIRECTORY_RECORD_SIZE) /* illegal entry, stop */ break; if (entryoffsetinblock + reclen > imp->logical_block_size) /* entries are not allowed to cross boundaries */ break; namelen = isonum_711(ep->name_len); isoflags = isonum_711(imp->iso_ftype == ISO_FTYPE_HIGH_SIERRA? &ep->date[6]: ep->flags); if (reclen < ISO_DIRECTORY_RECORD_SIZE + namelen) /* illegal entry, stop */ break; /* * Check for a name match. */ switch (imp->iso_ftype) { default: if (!(isoflags & 4) == !assoc) { if ((len == 1 && *name == '.') || (flags & ISDOTDOT)) { if (namelen == 1 && ep->name[0] == ((flags & ISDOTDOT) ? 1 : 0)) { /* * Save directory entry's inode number and * release directory buffer. */ i_ino = isodirino(ep, imp); goto found; } if (namelen != 1 || ep->name[0] != 0) goto notfound; } else if (!(res = isofncmp(name, len, ep->name, namelen, imp->joliet_level, imp->im_flags, imp->im_d2l, imp->im_l2d))) { if (isoflags & 2) ino = isodirino(ep, imp); else ino = dbtob(bp->b_blkno) + entryoffsetinblock; saveoffset = i_offset; } else if (ino) goto foundino; #ifdef NOSORTBUG /* On some CDs directory entries are not sorted correctly */ else if (res < 0) goto notfound; else if (res > 0 && numdirpasses == 2) numdirpasses++; #endif } break; case ISO_FTYPE_RRIP: if (isonum_711(ep->flags)&2) ino = isodirino(ep, imp); else ino = dbtob(bp->b_blkno) + entryoffsetinblock; i_ino = ino; cd9660_rrip_getname(ep, altname, &namelen, &i_ino, imp); if (namelen == cnp->cn_namelen && !bcmp(name,altname,namelen)) goto found; ino = 0; break; } i_offset += reclen; entryoffsetinblock += reclen; } if (ino) { foundino: i_ino = ino; if (saveoffset != i_offset) { if (lblkno(imp, i_offset) != lblkno(imp, saveoffset)) { if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vdp, (off_t)saveoffset, NULL, &bp)) != 0) return (error); } entryoffsetinblock = saveoffset & bmask; ep = (struct iso_directory_record *) ((char *)bp->b_data + entryoffsetinblock); reclen = isonum_711(ep->length); i_offset = saveoffset; } goto found; } notfound: /* * If we started in the middle of the directory and failed * to find our target, we must check the beginning as well. */ if (numdirpasses == 2) { numdirpasses--; i_offset = 0; endsearch = i_diroff; goto searchloop; } if (bp != NULL) brelse(bp); /* * Insert name into cache (as non-existent) if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); if (nameiop == CREATE || nameiop == RENAME) return (EROFS); return (ENOENT); found: if (numdirpasses == 2) nchstats.ncs_pass2++; /* * Found component in pathname. * If the final component of path name, save information * in the cache as to where the entry was found. */ if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = i_offset; /* * Step through the translation in the name. We do not `vput' the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "pdp". We must get the target inode before unlocking * the directory to insure that the inode will not be removed * before we get it. We prevent deadlock by always fetching * inodes from the root, moving down the directory tree. Thus * when following backward pointers ".." we must unlock the * parent directory before getting the requested directory. * There is a potential race condition here if both the current * and parent directories are removed before the `vget' for the * inode associated with ".." returns. We hope that this occurs * infrequently since we cannot avoid this race condition without * implementing a sophisticated deadlock detection algorithm. * Note also that this simple deadlock detection scheme will not * work if the filesystem has any hard links other than ".." * that point backwards in the directory structure. */ pdp = vdp; /* * Make a copy of the directory entry for non "." lookups so * we can drop the buffer before calling vget() to avoid a * lock order reversal between the vnode lock and the buffer * lock. */ if (dp->i_number != i_ino) { ep2 = malloc(reclen, M_TEMP, M_WAITOK); memcpy(ep2, ep, reclen); ep = ep2; } brelse(bp); /* * If ino is different from i_ino, * it's a relocated directory. */ if (flags & ISDOTDOT) { dd_arg.ino = ino; dd_arg.i_ino = i_ino; dd_arg.ep = ep; error = vn_vget_ino_gen(pdp, cd9660_ino_alloc, &dd_arg, cnp->cn_lkflags, &tdp); free(ep2, M_TEMP); if (error != 0) return (error); *vpp = tdp; } else if (dp->i_number == i_ino) { VREF(vdp); /* we want ourself, ie "." */ /* * When we lookup "." we still can be asked to lock it * differently. */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(vdp)) { if (ltype == LK_EXCLUSIVE) vn_lock(vdp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(vdp, LK_DOWNGRADE | LK_RETRY); } *vpp = vdp; } else { error = cd9660_vget_internal(vdp->v_mount, i_ino, cnp->cn_lkflags, &tdp, i_ino != ino, ep); free(ep2, M_TEMP); if (error) return (error); *vpp = tdp; } /* * Insert name into cache if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); return (0); } /* * Return buffer with the contents of block "offset" from the beginning of * directory "ip". If "res" is non-zero, fill it in with a pointer to the * remaining space in the directory. */ int cd9660_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp) { struct iso_node *ip; struct iso_mnt *imp; struct buf *bp; daddr_t lbn; int bsize, bshift, error; ip = VTOI(vp); imp = ip->i_mnt; lbn = lblkno(imp, offset); bsize = blksize(imp, ip, lbn); bshift = imp->im_bshift; if ((error = bread(vp, lbn, bsize, NOCRED, &bp)) != 0) { brelse(bp); *bpp = NULL; return (error); } /* * We must BMAP the buffer because the directory code may use b_blkno * to calculate the inode for certain types of directory entries. * We could get away with not doing it before we VMIO-backed the * directories because the buffers would get freed atomically with * the invalidation of their data. But with VMIO-backed buffers * the buffers may be freed and then later reconstituted - and the * reconstituted buffer will have no knowledge of b_blkno. */ if (bp->b_blkno == bp->b_lblkno) { bp->b_blkno = (ip->iso_start + bp->b_lblkno) << (bshift - DEV_BSHIFT); } if (res) *res = (char *)bp->b_data + blkoff(imp, offset); *bpp = bp; return (0); } diff --git a/sys/fs/fuse/fuse_vnops.c b/sys/fs/fuse/fuse_vnops.c index 016b634c4fa0..83f527ec8ec0 100644 --- a/sys/fs/fuse/fuse_vnops.c +++ b/sys/fs/fuse/fuse_vnops.c @@ -1,3245 +1,3244 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Copyright (c) 2019 The FreeBSD Foundation * * Portions of this software were developed by BFF Storage Systems, LLC under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_io.h" #include /* Maximum number of hardlinks to a single FUSE file */ #define FUSE_LINK_MAX UINT32_MAX SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*"); /* vnode ops */ static vop_access_t fuse_vnop_access; static vop_advlock_t fuse_vnop_advlock; static vop_allocate_t fuse_vnop_allocate; static vop_bmap_t fuse_vnop_bmap; static vop_close_t fuse_fifo_close; static vop_close_t fuse_vnop_close; static vop_copy_file_range_t fuse_vnop_copy_file_range; static vop_create_t fuse_vnop_create; static vop_deallocate_t fuse_vnop_deallocate; static vop_deleteextattr_t fuse_vnop_deleteextattr; static vop_fdatasync_t fuse_vnop_fdatasync; static vop_fsync_t fuse_vnop_fsync; static vop_getattr_t fuse_vnop_getattr; static vop_getextattr_t fuse_vnop_getextattr; static vop_inactive_t fuse_vnop_inactive; static vop_ioctl_t fuse_vnop_ioctl; static vop_link_t fuse_vnop_link; static vop_listextattr_t fuse_vnop_listextattr; static vop_lookup_t fuse_vnop_lookup; static vop_mkdir_t fuse_vnop_mkdir; static vop_mknod_t fuse_vnop_mknod; static vop_open_t fuse_vnop_open; static vop_pathconf_t fuse_vnop_pathconf; static vop_read_t fuse_vnop_read; static vop_readdir_t fuse_vnop_readdir; static vop_readlink_t fuse_vnop_readlink; static vop_reclaim_t fuse_vnop_reclaim; static vop_remove_t fuse_vnop_remove; static vop_rename_t fuse_vnop_rename; static vop_rmdir_t fuse_vnop_rmdir; static vop_setattr_t fuse_vnop_setattr; static vop_setextattr_t fuse_vnop_setextattr; static vop_strategy_t fuse_vnop_strategy; static vop_symlink_t fuse_vnop_symlink; static vop_write_t fuse_vnop_write; static vop_getpages_t fuse_vnop_getpages; static vop_print_t fuse_vnop_print; static vop_vptofh_t fuse_vnop_vptofh; struct vop_vector fuse_fifoops = { .vop_default = &fifo_specops, .vop_access = fuse_vnop_access, .vop_close = fuse_fifo_close, .vop_fsync = fuse_vnop_fsync, .vop_getattr = fuse_vnop_getattr, .vop_inactive = fuse_vnop_inactive, .vop_pathconf = fuse_vnop_pathconf, .vop_print = fuse_vnop_print, .vop_read = VOP_PANIC, .vop_reclaim = fuse_vnop_reclaim, .vop_setattr = fuse_vnop_setattr, .vop_write = VOP_PANIC, .vop_vptofh = fuse_vnop_vptofh, }; VFS_VOP_VECTOR_REGISTER(fuse_fifoops); struct vop_vector fuse_vnops = { .vop_allocate = fuse_vnop_allocate, .vop_default = &default_vnodeops, .vop_access = fuse_vnop_access, .vop_advlock = fuse_vnop_advlock, .vop_bmap = fuse_vnop_bmap, .vop_close = fuse_vnop_close, .vop_copy_file_range = fuse_vnop_copy_file_range, .vop_create = fuse_vnop_create, .vop_deallocate = fuse_vnop_deallocate, .vop_deleteextattr = fuse_vnop_deleteextattr, .vop_fsync = fuse_vnop_fsync, .vop_fdatasync = fuse_vnop_fdatasync, .vop_getattr = fuse_vnop_getattr, .vop_getextattr = fuse_vnop_getextattr, .vop_inactive = fuse_vnop_inactive, .vop_ioctl = fuse_vnop_ioctl, .vop_link = fuse_vnop_link, .vop_listextattr = fuse_vnop_listextattr, .vop_lookup = fuse_vnop_lookup, .vop_mkdir = fuse_vnop_mkdir, .vop_mknod = fuse_vnop_mknod, .vop_open = fuse_vnop_open, .vop_pathconf = fuse_vnop_pathconf, /* * TODO: implement vop_poll after upgrading to protocol 7.21. * FUSE_POLL was added in protocol 7.11, but it's kind of broken until * 7.21, which adds the ability for the client to choose which poll * events it wants, and for a client to deregister a file handle */ .vop_read = fuse_vnop_read, .vop_readdir = fuse_vnop_readdir, .vop_readlink = fuse_vnop_readlink, .vop_reclaim = fuse_vnop_reclaim, .vop_remove = fuse_vnop_remove, .vop_rename = fuse_vnop_rename, .vop_rmdir = fuse_vnop_rmdir, .vop_setattr = fuse_vnop_setattr, .vop_setextattr = fuse_vnop_setextattr, .vop_strategy = fuse_vnop_strategy, .vop_symlink = fuse_vnop_symlink, .vop_write = fuse_vnop_write, .vop_getpages = fuse_vnop_getpages, .vop_print = fuse_vnop_print, .vop_vptofh = fuse_vnop_vptofh, }; VFS_VOP_VECTOR_REGISTER(fuse_vnops); /* Check permission for extattr operations, much like extattr_check_cred */ static int fuse_extattr_check_cred(struct vnode *vp, int ns, struct ucred *cred, struct thread *td, accmode_t accmode) { struct mount *mp = vnode_mount(vp); struct fuse_data *data = fuse_get_mpdata(mp); int default_permissions = data->dataflags & FSESS_DEFAULT_PERMISSIONS; /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (ns) { case EXTATTR_NAMESPACE_SYSTEM: if (default_permissions) { return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); } return (0); case EXTATTR_NAMESPACE_USER: if (default_permissions) { return (fuse_internal_access(vp, accmode, td, cred)); } return (0); default: return (EPERM); } } /* Get a filehandle for a directory */ static int fuse_filehandle_get_dir(struct vnode *vp, struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid) { if (fuse_filehandle_get(vp, FREAD, fufhp, cred, pid) == 0) return 0; return fuse_filehandle_get(vp, FEXEC, fufhp, cred, pid); } /* Send FUSE_FLUSH for this vnode */ static int fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) { struct fuse_flush_in *ffi; struct fuse_filehandle *fufh; struct fuse_dispatcher fdi; struct thread *td = curthread; struct mount *mp = vnode_mount(vp); int err; if (fsess_not_impl(vnode_mount(vp), FUSE_FLUSH)) return 0; err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); if (err) return err; fdisp_init(&fdi, sizeof(*ffi)); fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred); ffi = fdi.indata; ffi->fh = fufh->fh_id; /* * If the file has a POSIX lock then we're supposed to set lock_owner. * If not, then lock_owner is undefined. So we may as well always set * it. */ ffi->lock_owner = td->td_proc->p_pid; err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_FLUSH); err = 0; } fdisp_destroy(&fdi); return err; } /* Close wrapper for fifos. */ static int fuse_fifo_close(struct vop_close_args *ap) { return (fifo_specops.vop_close(ap)); } /* Invalidate a range of cached data, whether dirty of not */ static int fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end) { struct buf *bp; daddr_t left_lbn, end_lbn, right_lbn; off_t new_filesize; int iosize, left_on, right_on, right_blksize; iosize = fuse_iosize(vp); left_lbn = start / iosize; end_lbn = howmany(end, iosize); left_on = start & (iosize - 1); if (left_on != 0) { bp = getblk(vp, left_lbn, iosize, PCATCH, 0, 0); if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyend >= left_on) { /* * Flush the dirty buffer, because we don't have a * byte-granular way to record which parts of the * buffer are valid. */ bwrite(bp); if (bp->b_error) return (bp->b_error); } else { brelse(bp); } } right_on = end & (iosize - 1); if (right_on != 0) { right_lbn = end / iosize; new_filesize = MAX(filesize, end); right_blksize = MIN(iosize, new_filesize - iosize * right_lbn); bp = getblk(vp, right_lbn, right_blksize, PCATCH, 0, 0); if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyoff < right_on) { /* * Flush the dirty buffer, because we don't have a * byte-granular way to record which parts of the * buffer are valid. */ bwrite(bp); if (bp->b_error) return (bp->b_error); } else { brelse(bp); } } v_inval_buf_range(vp, left_lbn, end_lbn, iosize); return (0); } /* Send FUSE_LSEEK for this node */ static int fuse_vnop_do_lseek(struct vnode *vp, struct thread *td, struct ucred *cred, pid_t pid, off_t *offp, int whence) { struct fuse_dispatcher fdi; struct fuse_filehandle *fufh; struct fuse_lseek_in *flsi; struct fuse_lseek_out *flso; struct mount *mp = vnode_mount(vp); int err; ASSERT_VOP_LOCKED(vp, __func__); err = fuse_filehandle_getrw(vp, FREAD, &fufh, cred, pid); if (err) return (err); fdisp_init(&fdi, sizeof(*flsi)); fdisp_make_vp(&fdi, FUSE_LSEEK, vp, td, cred); flsi = fdi.indata; flsi->fh = fufh->fh_id; flsi->offset = *offp; flsi->whence = whence; err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_LSEEK); } else if (err == ENXIO) { /* Note: ENXIO means "no more hole/data regions until EOF" */ fsess_set_impl(mp, FUSE_LSEEK); } else if (err == 0) { fsess_set_impl(mp, FUSE_LSEEK); flso = fdi.answ; *offp = flso->offset; } fdisp_destroy(&fdi); return (err); } /* struct vnop_access_args { struct vnode *a_vp; #if VOP_ACCESS_TAKES_ACCMODE_T accmode_t a_accmode; #else int a_mode; #endif struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int accmode = ap->a_accmode; struct ucred *cred = ap->a_cred; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); int err; if (fuse_isdeadfs(vp)) { if (vnode_isvroot(vp)) { return 0; } return ENXIO; } if (!(data->dataflags & FSESS_INITED)) { if (vnode_isvroot(vp)) { if (priv_check_cred(cred, PRIV_VFS_ADMIN) || (fuse_match_cred(data->daemoncred, cred) == 0)) { return 0; } } return EBADF; } if (vnode_islnk(vp)) { return 0; } err = fuse_internal_access(vp, accmode, ap->a_td, ap->a_cred); return err; } /* * struct vop_advlock_args { * struct vop_generic_args a_gen; * struct vnode *a_vp; * void *a_id; * int a_op; * struct flock *a_fl; * int a_flags; * } */ static int fuse_vnop_advlock(struct vop_advlock_args *ap) { struct vnode *vp = ap->a_vp; struct flock *fl = ap->a_fl; struct thread *td = curthread; struct ucred *cred = td->td_ucred; pid_t pid = td->td_proc->p_pid; struct fuse_filehandle *fufh; struct fuse_dispatcher fdi; struct fuse_lk_in *fli; struct fuse_lk_out *flo; struct vattr vattr; enum fuse_opcode op; off_t size, start; int dataflags, err; int flags = ap->a_flags; dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; if (fuse_isdeadfs(vp)) { return ENXIO; } switch(ap->a_op) { case F_GETLK: op = FUSE_GETLK; break; case F_SETLK: if (flags & F_WAIT) op = FUSE_SETLKW; else op = FUSE_SETLK; break; case F_UNLCK: op = FUSE_SETLK; break; default: return EINVAL; } if (!(dataflags & FSESS_POSIX_LOCKS)) return vop_stdadvlock(ap); /* FUSE doesn't properly support flock until protocol 7.17 */ if (flags & F_FLOCK) return vop_stdadvlock(ap); vn_lock(vp, LK_SHARED | LK_RETRY); switch (fl->l_whence) { case SEEK_SET: case SEEK_CUR: /* * Caller is responsible for adding any necessary offset * when SEEK_CUR is used. */ start = fl->l_start; break; case SEEK_END: err = fuse_internal_getattr(vp, &vattr, cred, td); if (err) goto out; size = vattr.va_size; if (size > OFF_MAX || (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) { err = EOVERFLOW; goto out; } start = size + fl->l_start; break; default: return (EINVAL); } err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid); if (err) goto out; fdisp_init(&fdi, sizeof(*fli)); fdisp_make_vp(&fdi, op, vp, td, cred); fli = fdi.indata; fli->fh = fufh->fh_id; fli->owner = td->td_proc->p_pid; fli->lk.start = start; if (fl->l_len != 0) fli->lk.end = start + fl->l_len - 1; else fli->lk.end = INT64_MAX; fli->lk.type = fl->l_type; fli->lk.pid = td->td_proc->p_pid; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); if (err == 0 && op == FUSE_GETLK) { flo = fdi.answ; fl->l_type = flo->lk.type; fl->l_whence = SEEK_SET; if (flo->lk.type != F_UNLCK) { fl->l_pid = flo->lk.pid; fl->l_start = flo->lk.start; if (flo->lk.end == INT64_MAX) fl->l_len = 0; else fl->l_len = flo->lk.end - flo->lk.start + 1; fl->l_start = flo->lk.start; } } out: VOP_UNLOCK(vp); return err; } static int fuse_vnop_allocate(struct vop_allocate_args *ap) { struct vnode *vp = ap->a_vp; off_t *len = ap->a_len; off_t *offset = ap->a_offset; struct ucred *cred = ap->a_cred; struct fuse_filehandle *fufh; struct mount *mp = vnode_mount(vp); struct fuse_dispatcher fdi; struct fuse_fallocate_in *ffi; struct uio io; pid_t pid = curthread->td_proc->p_pid; struct fuse_vnode_data *fvdat = VTOFUD(vp); off_t filesize; int err; if (fuse_isdeadfs(vp)) return (ENXIO); switch (vp->v_type) { case VFIFO: return (ESPIPE); case VLNK: case VREG: if (vfs_isrdonly(mp)) return (EROFS); break; default: return (ENODEV); } if (vfs_isrdonly(mp)) return (EROFS); if (fsess_not_impl(mp, FUSE_FALLOCATE)) return (EINVAL); io.uio_offset = *offset; io.uio_resid = *len; err = vn_rlimit_fsize(vp, &io, curthread); if (err) return (err); err = fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); if (err) return (err); fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); err = fuse_vnode_size(vp, &filesize, cred, curthread); if (err) return (err); fuse_inval_buf_range(vp, filesize, *offset, *offset + *len); fdisp_init(&fdi, sizeof(*ffi)); fdisp_make_vp(&fdi, FUSE_FALLOCATE, vp, curthread, cred); ffi = fdi.indata; ffi->fh = fufh->fh_id; ffi->offset = *offset; ffi->length = *len; ffi->mode = 0; err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_FALLOCATE); err = EINVAL; } else if (err == EOPNOTSUPP) { /* * The file system server does not support FUSE_FALLOCATE with * the supplied mode for this particular file. */ err = EINVAL; } else if (!err) { *offset += *len; *len = 0; fuse_vnode_undirty_cached_timestamps(vp, false); fuse_internal_clear_suid_on_write(vp, cred, curthread); if (*offset > fvdat->cached_attrs.va_size) { fuse_vnode_setsize(vp, *offset, false); getnanouptime(&fvdat->last_local_modify); } } fdisp_destroy(&fdi); return (err); } /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ static int fuse_vnop_bmap(struct vop_bmap_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj **bo = ap->a_bop; struct thread *td = curthread; struct mount *mp; struct fuse_dispatcher fdi; struct fuse_bmap_in *fbi; struct fuse_bmap_out *fbo; struct fuse_data *data; struct fuse_vnode_data *fvdat = VTOFUD(vp); uint64_t biosize; off_t fsize; daddr_t lbn = ap->a_bn; daddr_t *pbn = ap->a_bnp; int *runp = ap->a_runp; int *runb = ap->a_runb; int error = 0; int maxrun; if (fuse_isdeadfs(vp)) { return ENXIO; } mp = vnode_mount(vp); data = fuse_get_mpdata(mp); biosize = fuse_iosize(vp); maxrun = MIN(vp->v_mount->mnt_iosize_max / biosize - 1, data->max_readahead_blocks); if (bo != NULL) *bo = &vp->v_bufobj; /* * The FUSE_BMAP operation does not include the runp and runb * variables, so we must guess. Report nonzero contiguous runs so * cluster_read will combine adjacent reads. It's worthwhile to reduce * upcalls even if we don't know the true physical layout of the file. * * FUSE file systems may opt out of read clustering in two ways: * * mounting with -onoclusterr * * Setting max_readahead <= maxbcachebuf during FUSE_INIT */ if (runb != NULL) *runb = MIN(lbn, maxrun); if (runp != NULL && maxrun == 0) *runp = 0; else if (runp != NULL) { /* * If the file's size is cached, use that value to calculate * runp, even if the cache is expired. runp is only advisory, * and the risk of getting it wrong is not worth the cost of * another upcall. */ if (fvdat->cached_attrs.va_size != VNOVAL) fsize = fvdat->cached_attrs.va_size; else error = fuse_vnode_size(vp, &fsize, td->td_ucred, td); if (error == 0) *runp = MIN(MAX(0, fsize / (off_t)biosize - lbn - 1), maxrun); else *runp = 0; } if (fsess_maybe_impl(mp, FUSE_BMAP)) { fdisp_init(&fdi, sizeof(*fbi)); fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred); fbi = fdi.indata; fbi->block = lbn; fbi->blocksize = biosize; error = fdisp_wait_answ(&fdi); if (error == ENOSYS) { fdisp_destroy(&fdi); fsess_set_notimpl(mp, FUSE_BMAP); error = 0; } else { fbo = fdi.answ; if (error == 0 && pbn != NULL) *pbn = fbo->block; fdisp_destroy(&fdi); return error; } } /* If the daemon doesn't support BMAP, make up a sensible default */ if (pbn != NULL) *pbn = lbn * btodb(biosize); return (error); } /* struct vop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct mount *mp = vnode_mount(vp); struct ucred *cred = ap->a_cred; int fflag = ap->a_fflag; struct thread *td = ap->a_td; pid_t pid = td->td_proc->p_pid; struct fuse_vnode_data *fvdat = VTOFUD(vp); int err = 0; if (fuse_isdeadfs(vp)) return 0; if (vnode_isdir(vp)) return 0; if (fflag & IO_NDELAY) return 0; err = fuse_flush(vp, cred, pid, fflag); if (err == 0 && (fvdat->flag & FN_ATIMECHANGE) && !vfs_isrdonly(mp)) { struct vattr vap; struct fuse_data *data; int dataflags; int access_e = 0; data = fuse_get_mpdata(mp); dataflags = data->dataflags; if (dataflags & FSESS_DEFAULT_PERMISSIONS) { struct vattr va; fuse_internal_getattr(vp, &va, cred, td); access_e = vaccess(vp->v_type, va.va_mode, va.va_uid, va.va_gid, VWRITE, cred); } if (access_e == 0) { VATTR_NULL(&vap); vap.va_atime = fvdat->cached_attrs.va_atime; /* * Ignore errors setting when setting atime. That * should not cause close(2) to fail. */ fuse_internal_setattr(vp, &vap, td, NULL); } } /* TODO: close the file handle, if we're sure it's no longer used */ if ((fvdat->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, cred, td->td_proc->p_pid); } return err; } /* struct vop_copy_file_range_args { struct vop_generic_args a_gen; struct vnode *a_invp; off_t *a_inoffp; struct vnode *a_outvp; off_t *a_outoffp; size_t *a_lenp; unsigned int a_flags; struct ucred *a_incred; struct ucred *a_outcred; struct thread *a_fsizetd; } */ static int fuse_vnop_copy_file_range(struct vop_copy_file_range_args *ap) { struct vnode *invp = ap->a_invp; struct vnode *outvp = ap->a_outvp; struct mount *mp = vnode_mount(invp); struct fuse_vnode_data *outfvdat = VTOFUD(outvp); struct fuse_dispatcher fdi; struct fuse_filehandle *infufh, *outfufh; struct fuse_copy_file_range_in *fcfri; struct ucred *incred = ap->a_incred; struct ucred *outcred = ap->a_outcred; struct fuse_write_out *fwo; struct thread *td; struct uio io; off_t outfilesize; ssize_t r = 0; pid_t pid; int err; err = ENOSYS; if (mp == NULL || mp != vnode_mount(outvp)) goto fallback; if (incred->cr_uid != outcred->cr_uid) goto fallback; if (incred->cr_groups[0] != outcred->cr_groups[0]) goto fallback; /* Caller busied mp, mnt_data can be safely accessed. */ if (fsess_not_impl(mp, FUSE_COPY_FILE_RANGE)) goto fallback; if (ap->a_fsizetd == NULL) td = curthread; else td = ap->a_fsizetd; pid = td->td_proc->p_pid; vn_lock_pair(invp, false, LK_SHARED, outvp, false, LK_EXCLUSIVE); if (invp->v_data == NULL || outvp->v_data == NULL) { err = EBADF; goto unlock; } err = fuse_filehandle_getrw(invp, FREAD, &infufh, incred, pid); if (err) goto unlock; err = fuse_filehandle_getrw(outvp, FWRITE, &outfufh, outcred, pid); if (err) goto unlock; io.uio_resid = *ap->a_lenp; if (ap->a_fsizetd) { io.uio_offset = *ap->a_outoffp; err = vn_rlimit_fsizex(outvp, &io, 0, &r, ap->a_fsizetd); if (err != 0) goto unlock; } err = fuse_vnode_size(outvp, &outfilesize, outcred, curthread); if (err) goto unlock; vnode_pager_clean_sync(invp); err = fuse_inval_buf_range(outvp, outfilesize, *ap->a_outoffp, *ap->a_outoffp + io.uio_resid); if (err) goto unlock; fdisp_init(&fdi, sizeof(*fcfri)); fdisp_make_vp(&fdi, FUSE_COPY_FILE_RANGE, invp, td, incred); fcfri = fdi.indata; fcfri->fh_in = infufh->fh_id; fcfri->off_in = *ap->a_inoffp; fcfri->nodeid_out = VTOI(outvp); fcfri->fh_out = outfufh->fh_id; fcfri->off_out = *ap->a_outoffp; fcfri->len = io.uio_resid; fcfri->flags = 0; err = fdisp_wait_answ(&fdi); if (err == 0) { fwo = fdi.answ; *ap->a_lenp = fwo->size; *ap->a_inoffp += fwo->size; *ap->a_outoffp += fwo->size; fuse_internal_clear_suid_on_write(outvp, outcred, td); if (*ap->a_outoffp > outfvdat->cached_attrs.va_size) { fuse_vnode_setsize(outvp, *ap->a_outoffp, false); getnanouptime(&outfvdat->last_local_modify); } fuse_vnode_update(invp, FN_ATIMECHANGE); fuse_vnode_update(outvp, FN_MTIMECHANGE | FN_CTIMECHANGE); } fdisp_destroy(&fdi); unlock: if (invp != outvp) VOP_UNLOCK(invp); VOP_UNLOCK(outvp); if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_COPY_FILE_RANGE); fallback: /* * No need to call vn_rlimit_fsizex_res before return, since the uio is * local. */ return (err); } static void fdisp_make_mknod_for_fallback( struct fuse_dispatcher *fdip, struct componentname *cnp, struct vnode *dvp, uint64_t parentnid, struct thread *td, struct ucred *cred, mode_t mode, enum fuse_opcode *op) { struct fuse_mknod_in *fmni; fdisp_init(fdip, sizeof(*fmni) + cnp->cn_namelen + 1); *op = FUSE_MKNOD; fdisp_make(fdip, *op, vnode_mount(dvp), parentnid, td, cred); fmni = fdip->indata; fmni->mode = mode; fmni->rdev = 0; memcpy((char *)fdip->indata + sizeof(*fmni), cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[sizeof(*fmni) + cnp->cn_namelen] = '\0'; } /* struct vnop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct thread *td = curthread; struct ucred *cred = cnp->cn_cred; struct fuse_data *data; struct fuse_create_in *fci; struct fuse_entry_out *feo; struct fuse_open_out *foo; struct fuse_dispatcher fdi, fdi2; struct fuse_dispatcher *fdip = &fdi; struct fuse_dispatcher *fdip2 = NULL; int err; struct mount *mp = vnode_mount(dvp); data = fuse_get_mpdata(mp); uint64_t parentnid = VTOFUD(dvp)->nid; mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode); enum fuse_opcode op; int flags; if (fuse_isdeadfs(dvp)) return ENXIO; /* FUSE expects sockets to be created with FUSE_MKNOD */ if (vap->va_type == VSOCK) return fuse_internal_mknod(dvp, vpp, cnp, vap); /* * VOP_CREATE doesn't tell us the open(2) flags, so we guess. Only a * writable mode makes sense, and we might as well include readability * too. */ flags = O_RDWR; bzero(&fdi, sizeof(fdi)); if (vap->va_type != VREG) return (EINVAL); if (fsess_not_impl(mp, FUSE_CREATE) || vap->va_type == VSOCK) { /* Fallback to FUSE_MKNOD/FUSE_OPEN */ fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td, cred, mode, &op); } else { /* Use FUSE_CREATE */ size_t insize; op = FUSE_CREATE; fdisp_init(fdip, sizeof(*fci) + cnp->cn_namelen + 1); fdisp_make(fdip, op, vnode_mount(dvp), parentnid, td, cred); fci = fdip->indata; fci->mode = mode; fci->flags = O_CREAT | flags; if (fuse_libabi_geq(data, 7, 12)) { insize = sizeof(*fci); fci->umask = td->td_proc->p_pd->pd_cmask; } else { insize = sizeof(struct fuse_open_in); } memcpy((char *)fdip->indata + insize, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[insize + cnp->cn_namelen] = '\0'; } err = fdisp_wait_answ(fdip); if (err) { if (err == ENOSYS && op == FUSE_CREATE) { fsess_set_notimpl(mp, FUSE_CREATE); fdisp_destroy(fdip); fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td, cred, mode, &op); err = fdisp_wait_answ(fdip); } if (err) goto out; } feo = fdip->answ; if ((err = fuse_internal_checkentry(feo, vap->va_type))) { goto out; } if (op == FUSE_CREATE) { if (fuse_libabi_geq(data, 7, 9)) foo = (struct fuse_open_out*)(feo + 1); else foo = (struct fuse_open_out*)((char*)feo + FUSE_COMPAT_ENTRY_OUT_SIZE); } else { /* Issue a separate FUSE_OPEN */ struct fuse_open_in *foi; fdip2 = &fdi2; fdisp_init(fdip2, sizeof(*foi)); fdisp_make(fdip2, FUSE_OPEN, vnode_mount(dvp), feo->nodeid, td, cred); foi = fdip2->indata; foi->flags = flags; err = fdisp_wait_answ(fdip2); if (err) goto out; foo = fdip2->answ; } err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vap->va_type); if (err) { struct fuse_release_in *fri; uint64_t nodeid = feo->nodeid; uint64_t fh_id = foo->fh; fdisp_destroy(fdip); fdisp_init(fdip, sizeof(*fri)); fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred); fri = fdip->indata; fri->fh = fh_id; fri->flags = flags; fuse_insert_callback(fdip->tick, fuse_internal_forget_callback); fuse_insert_message(fdip->tick, false); goto out; } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create"); fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL, true); fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, td, cred, foo); fuse_vnode_open(*vpp, foo->open_flags, td); /* * Purge the parent's attribute cache because the daemon should've * updated its mtime and ctime */ fuse_vnode_clear_attr_cache(dvp); cache_purge_negative(dvp); out: if (fdip2) fdisp_destroy(fdip2); fdisp_destroy(fdip); return err; } /* struct vnop_fdatasync_args { struct vop_generic_args a_gen; struct vnode * a_vp; struct thread * a_td; }; */ static int fuse_vnop_fdatasync(struct vop_fdatasync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; int waitfor = MNT_WAIT; int err = 0; if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfdatasync_buf(ap))) return err; return fuse_internal_fsync(vp, td, waitfor, true); } /* struct vnop_fsync_args { struct vop_generic_args a_gen; struct vnode * a_vp; int a_waitfor; struct thread * a_td; }; */ static int fuse_vnop_fsync(struct vop_fsync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; int waitfor = ap->a_waitfor; int err = 0; if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfsync(ap))) return err; return fuse_internal_fsync(vp, td, waitfor, false); } /* struct vnop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; int err = 0; int dataflags; dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; /* Note that we are not bailing out on a dead file system just yet. */ if (!(dataflags & FSESS_INITED)) { if (!vnode_isvroot(vp)) { fdata_set_dead(fuse_get_mpdata(vnode_mount(vp))); err = ENOTCONN; return err; } else { goto fake; } } err = fuse_internal_getattr(vp, vap, cred, td); if (err == ENOTCONN && vnode_isvroot(vp)) { /* see comment in fuse_vfsop_statfs() */ goto fake; } else { return err; } fake: bzero(vap, sizeof(*vap)); vap->va_type = vnode_vtype(vp); return 0; } /* struct vnop_inactive_args { struct vnode *a_vp; }; */ static int fuse_vnop_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = curthread; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh, *fufh_tmp; int need_flush = 1; LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) { if (need_flush && vp->v_type == VREG) { if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, NULL, 0); } if ((fvdat->flag & FN_REVOKED) != 0) fuse_io_invalbuf(vp, td); else fuse_io_flushbuf(vp, MNT_WAIT, td); need_flush = 0; } fuse_filehandle_close(vp, fufh, td, NULL); } if ((fvdat->flag & FN_REVOKED) != 0) vrecycle(vp); return 0; } /* struct vnop_ioctl_args { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_ioctl(struct vop_ioctl_args *ap) { struct vnode *vp = ap->a_vp; struct mount *mp = vnode_mount(vp); struct ucred *cred = ap->a_cred; off_t *offp; pid_t pid = ap->a_td->td_proc->p_pid; int err; switch (ap->a_command) { case FIOSEEKDATA: case FIOSEEKHOLE: /* Call FUSE_LSEEK, if we can, or fall back to vop_stdioctl */ if (fsess_maybe_impl(mp, FUSE_LSEEK)) { int whence; offp = ap->a_data; if (ap->a_command == FIOSEEKDATA) whence = SEEK_DATA; else whence = SEEK_HOLE; vn_lock(vp, LK_SHARED | LK_RETRY); err = fuse_vnop_do_lseek(vp, ap->a_td, cred, pid, offp, whence); VOP_UNLOCK(vp); } if (fsess_not_impl(mp, FUSE_LSEEK)) err = vop_stdioctl(ap); break; default: /* TODO: implement FUSE_IOCTL */ err = ENOTTY; break; } return (err); } /* struct vnop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = VTOVA(vp); struct fuse_dispatcher fdi; struct fuse_entry_out *feo; struct fuse_link_in fli; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_mount(tdvp) != vnode_mount(vp)) { return EXDEV; } /* * This is a seatbelt check to protect naive userspace filesystems from * themselves and the limitations of the FUSE IPC protocol. If a * filesystem does not allow attribute caching, assume it is capable of * validating that nlink does not overflow. */ if (vap != NULL && vap->va_nlink >= FUSE_LINK_MAX) return EMLINK; fli.oldnodeid = VTOI(vp); fdisp_init(&fdi, 0); fuse_internal_newentry_makerequest(vnode_mount(tdvp), VTOI(tdvp), cnp, FUSE_LINK, &fli, sizeof(fli), &fdi); if ((err = fdisp_wait_answ(&fdi))) { goto out; } feo = fdi.answ; if (fli.oldnodeid != feo->nodeid) { struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); fuse_warn(data, FSESS_WARN_ILLEGAL_INODE, "Assigned wrong inode for a hard link."); fuse_vnode_clear_attr_cache(vp); fuse_vnode_clear_attr_cache(tdvp); err = EIO; goto out; } err = fuse_internal_checkentry(feo, vnode_vtype(vp)); if (!err) { /* * Purge the parent's attribute cache because the daemon * should've updated its mtime and ctime */ fuse_vnode_clear_attr_cache(tdvp); fuse_internal_cache_attrs(vp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL, true); } out: fdisp_destroy(&fdi); return err; } struct fuse_lookup_alloc_arg { struct fuse_entry_out *feo; struct componentname *cnp; uint64_t nid; __enum_uint8(vtype) vtyp; }; /* Callback for vn_get_ino */ static int fuse_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { struct fuse_lookup_alloc_arg *flaa = arg; return fuse_vnode_get(mp, flaa->feo, flaa->nid, NULL, vpp, flaa->cnp, flaa->vtyp); } SDT_PROBE_DEFINE3(fusefs, , vnops, cache_lookup, "int", "struct timespec*", "struct timespec*"); /* struct vnop_lookup_args { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; */ int fuse_vnop_lookup(struct vop_lookup_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct thread *td = curthread; struct ucred *cred = cnp->cn_cred; struct timespec now; int nameiop = cnp->cn_nameiop; - int flags = cnp->cn_flags; - int islastcn = flags & ISLASTCN; + bool isdotdot = cnp->cn_flags & ISDOTDOT; + bool islastcn = cnp->cn_flags & ISLASTCN; struct mount *mp = vnode_mount(dvp); struct fuse_data *data = fuse_get_mpdata(mp); int default_permissions = data->dataflags & FSESS_DEFAULT_PERMISSIONS; bool is_dot; int err = 0; int lookup_err = 0; struct vnode *vp = NULL; struct fuse_dispatcher fdi; bool did_lookup = false; struct fuse_entry_out *feo = NULL; __enum_uint8(vtype) vtyp; /* vnode type of target */ uint64_t nid; if (fuse_isdeadfs(dvp)) { *vpp = NULL; return ENXIO; } if (!vnode_isdir(dvp)) return ENOTDIR; if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) return EROFS; if ((cnp->cn_flags & NOEXECCHECK) != 0) cnp->cn_flags &= ~NOEXECCHECK; else if ((err = fuse_internal_access(dvp, VEXEC, td, cred))) return err; is_dot = cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.'; - if ((flags & ISDOTDOT) && !(data->dataflags & FSESS_EXPORT_SUPPORT)) - { + if (isdotdot && !(data->dataflags & FSESS_EXPORT_SUPPORT)) { if (!(VTOFUD(dvp)->flag & FN_PARENT_NID)) { /* * Since the file system doesn't support ".." lookups, * we have no way to find this entry. */ return ESTALE; } nid = VTOFUD(dvp)->parent_nid; if (nid == 0) return ENOENT; /* .. is obviously a directory */ vtyp = VDIR; } else if (is_dot) { nid = VTOI(dvp); /* . is obviously a directory */ vtyp = VDIR; } else { struct timespec timeout; int ncpticks; /* here to accommodate for API contract */ err = cache_lookup(dvp, vpp, cnp, &timeout, &ncpticks); getnanouptime(&now); SDT_PROBE3(fusefs, , vnops, cache_lookup, err, &timeout, &now); switch (err) { case -1: /* positive match */ if (timespeccmp(&timeout, &now, >)) { counter_u64_add(fuse_lookup_cache_hits, 1); } else { /* Cache timeout */ counter_u64_add(fuse_lookup_cache_misses, 1); bintime_clear( &VTOFUD(*vpp)->entry_cache_timeout); cache_purge(*vpp); if (dvp != *vpp) vput(*vpp); else vrele(*vpp); *vpp = NULL; break; } return 0; case 0: /* no match in cache */ counter_u64_add(fuse_lookup_cache_misses, 1); break; case ENOENT: /* negative match */ if (timespeccmp(&timeout, &now, <=)) { /* Cache timeout */ cache_purge_negative(dvp); break; } /* fall through */ default: return err; } fdisp_init(&fdi, cnp->cn_namelen + 1); fdisp_make(&fdi, FUSE_LOOKUP, mp, VTOI(dvp), td, cred); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; lookup_err = fdisp_wait_answ(&fdi); did_lookup = true; if (!lookup_err) { /* lookup call succeeded */ feo = (struct fuse_entry_out *)fdi.answ; nid = feo->nodeid; if (nid == 0) { /* zero nodeid means ENOENT and cache it */ struct timespec timeout; fdi.answ_stat = ENOENT; lookup_err = ENOENT; if (cnp->cn_flags & MAKEENTRY) { fuse_validity_2_timespec(feo, &timeout); /* Use the same entry_time for .. as for * the file itself. That doesn't honor * exactly what the fuse server tells * us, but to do otherwise would require * another cache lookup at this point. */ struct timespec *dtsp = NULL; cache_enter_time(dvp, *vpp, cnp, &timeout, dtsp); } } vtyp = IFTOVT(feo->attr.mode); } if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT)) { fdisp_destroy(&fdi); return lookup_err; } } /* lookup_err, if non-zero, must be ENOENT at this point */ if (lookup_err) { /* Entry not found */ if ((nameiop == CREATE || nameiop == RENAME) && islastcn) { if (default_permissions) err = fuse_internal_access(dvp, VWRITE, td, cred); else err = 0; if (!err) { err = EJUSTRETURN; } } else { err = ENOENT; } } else { /* Entry was found */ - if (flags & ISDOTDOT) { + if (isdotdot) { struct fuse_lookup_alloc_arg flaa; flaa.nid = nid; flaa.feo = feo; flaa.cnp = cnp; flaa.vtyp = vtyp; err = vn_vget_ino_gen(dvp, fuse_lookup_alloc, &flaa, 0, &vp); *vpp = vp; } else if (nid == VTOI(dvp)) { if (is_dot) { vref(dvp); *vpp = dvp; } else { fuse_warn(fuse_get_mpdata(mp), FSESS_WARN_ILLEGAL_INODE, "Assigned same inode to both parent and " "child."); err = EIO; } } else { struct fuse_vnode_data *fvdat; err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp, &vp, cnp, vtyp); if (err) goto out; *vpp = vp; fvdat = VTOFUD(vp); MPASS(feo != NULL); if (timespeccmp(&now, &fvdat->last_local_modify, >)) { /* * Attributes from the server are definitely * newer than the last attributes we sent to * the server, so cache them. */ fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL, true); } fuse_validity_2_bintime(feo->entry_valid, feo->entry_valid_nsec, &fvdat->entry_cache_timeout); if ((nameiop == DELETE || nameiop == RENAME) && islastcn && default_permissions) { struct vattr dvattr; err = fuse_internal_access(dvp, VWRITE, td, cred); if (err != 0) goto out; /* * if the parent's sticky bit is set, check * whether we're allowed to remove the file. * Need to figure out the vnode locking to make * this work. */ fuse_internal_getattr(dvp, &dvattr, cred, td); if ((dvattr.va_mode & S_ISTXT) && fuse_internal_access(dvp, VADMIN, td, cred) && fuse_internal_access(*vpp, VADMIN, td, cred)) { err = EPERM; goto out; } } } } out: if (err) { if (vp != NULL && dvp != vp) vput(vp); else if (vp != NULL) vrele(vp); *vpp = NULL; } if (did_lookup) fdisp_destroy(&fdi); return err; } /* struct vnop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct fuse_mkdir_in fmdi; if (fuse_isdeadfs(dvp)) { return ENXIO; } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); fmdi.umask = curthread->td_proc->p_pd->pd_cmask; return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi, sizeof(fmdi), VDIR)); } /* struct vnop_mknod_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mknod(struct vop_mknod_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; if (fuse_isdeadfs(dvp)) return ENXIO; return fuse_internal_mknod(dvp, vpp, cnp, vap); } /* struct vop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; / struct file *a_fp; }; */ static int fuse_vnop_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; int a_mode = ap->a_mode; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; pid_t pid = td->td_proc->p_pid; if (fuse_isdeadfs(vp)) return ENXIO; if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO) return (EOPNOTSUPP); if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0) return EINVAL; if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) { fuse_vnode_open(vp, 0, td); return 0; } return fuse_filehandle_open(vp, a_mode, NULL, td, cred); } static int fuse_vnop_pathconf(struct vop_pathconf_args *ap) { struct vnode *vp = ap->a_vp; struct mount *mp; struct fuse_filehandle *fufh; int err; bool closefufh = false; switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = MIN(LONG_MAX, FUSE_LINK_MAX); return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); case _PC_MIN_HOLE_SIZE: /* * The FUSE protocol provides no mechanism for a server to * report _PC_MIN_HOLE_SIZE. It's a protocol bug. Instead, * return EINVAL if the server does not support FUSE_LSEEK, or * 1 if it does. */ mp = vnode_mount(vp); if (!fsess_is_impl(mp, FUSE_LSEEK) && !fsess_not_impl(mp, FUSE_LSEEK)) { off_t offset = 0; /* * Issue a FUSE_LSEEK to find out if it's supported. * Use SEEK_DATA instead of SEEK_HOLE, because the * latter generally requires sequential scans of file * metadata, which can be slow. */ err = fuse_vnop_do_lseek(vp, curthread, curthread->td_ucred, curthread->td_proc->p_pid, &offset, SEEK_DATA); if (err == EBADF) { /* * pathconf() doesn't necessarily open the * file. So we may need to do it here. */ err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, curthread->td_ucred); if (err == 0) { closefufh = true; err = fuse_vnop_do_lseek(vp, curthread, curthread->td_ucred, curthread->td_proc->p_pid, &offset, SEEK_DATA); } if (closefufh) fuse_filehandle_close(vp, fufh, curthread, curthread->td_ucred); } } if (fsess_is_impl(mp, FUSE_LSEEK)) { *ap->a_retval = 1; return (0); } else if (fsess_not_impl(mp, FUSE_LSEEK)) { /* FUSE_LSEEK is not implemented */ return (EINVAL); } else { return (err); } default: return (vop_stdpathconf(ap)); } } SDT_PROBE_DEFINE3(fusefs, , vnops, filehandles_closed, "struct vnode*", "struct uio*", "struct ucred*"); /* struct vnop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; struct fuse_filehandle *fufh; int err; bool closefufh = false, directio; MPASS(vp->v_type == VREG || vp->v_type == VDIR); if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } err = fuse_filehandle_getrw(vp, FREAD, &fufh, cred, pid); if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { /* * nfsd will do I/O without first doing VOP_OPEN. We * must implicitly open the file here */ err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred); closefufh = true; } if (err) { SDT_PROBE3(fusefs, , vnops, filehandles_closed, vp, uio, cred); return err; } /* * Ideally, when the daemon asks for direct io at open time, the * standard file flag should be set according to this, so that would * just change the default mode, which later on could be changed via * fcntl(2). * But this doesn't work, the O_DIRECT flag gets cleared at some point * (don't know where). So to make any use of the Fuse direct_io option, * we hardwire it into the file's private data (similarly to Linux, * btw.). */ directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); fuse_vnode_update(vp, FN_ATIMECHANGE); if (directio) { SDT_PROBE2(fusefs, , vnops, trace, 1, "direct read of vnode"); err = fuse_read_directbackend(vp, uio, cred, fufh); } else { SDT_PROBE2(fusefs, , vnops, trace, 1, "buffered read of vnode"); err = fuse_read_biobackend(vp, uio, ioflag, cred, fufh, pid); } if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); return (err); } /* struct vnop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; uint64_t **a_cookies; }; */ static int fuse_vnop_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_filehandle *fufh = NULL; struct mount *mp = vnode_mount(vp); struct fuse_iov cookediov; int err = 0; uint64_t *cookies; ssize_t tresid; int ncookies; bool closefufh = false; pid_t pid = curthread->td_proc->p_pid; if (ap->a_eofflag) *ap->a_eofflag = 0; if (fuse_isdeadfs(vp)) { return ENXIO; } if ( /* XXXIP ((uio_iovcnt(uio) > 1)) || */ (uio_resid(uio) < sizeof(struct dirent))) { return EINVAL; } tresid = uio->uio_resid; err = fuse_filehandle_get_dir(vp, &fufh, cred, pid); if (err == EBADF && mp->mnt_flag & MNT_EXPORTED) { KASSERT(!fsess_is_impl(mp, FUSE_OPENDIR), ("FUSE file systems that implement " "FUSE_OPENDIR should not be exported")); /* * nfsd will do VOP_READDIR without first doing VOP_OPEN. We * must implicitly open the directory here. */ err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred); closefufh = true; } if (err) return (err); if (ap->a_ncookies != NULL) { ncookies = uio->uio_resid / (offsetof(struct dirent, d_name) + 4) + 1; cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } else { ncookies = 0; cookies = NULL; } #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1) fiov_init(&cookediov, DIRCOOKEDSIZE); err = fuse_internal_readdir(vp, uio, fufh, &cookediov, &ncookies, cookies); fiov_teardown(&cookediov); if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); if (ap->a_ncookies != NULL) { if (err == 0) { *ap->a_ncookies -= ncookies; } else { free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } } if (err == 0 && tresid == uio->uio_resid) *ap->a_eofflag = 1; return err; } /* struct vnop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; }; */ static int fuse_vnop_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_dispatcher fdi; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (!vnode_islnk(vp)) { return EINVAL; } fdisp_init(&fdi, 0); err = fdisp_simple_putget_vp(&fdi, FUSE_READLINK, vp, curthread, cred); if (err) { goto out; } if (strnlen(fdi.answ, fdi.iosize) + 1 < fdi.iosize) { struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); fuse_warn(data, FSESS_WARN_READLINK_EMBEDDED_NUL, "Returned an embedded NUL from FUSE_READLINK."); err = EIO; goto out; } if (((char *)fdi.answ)[0] == '/' && fuse_get_mpdata(vnode_mount(vp))->dataflags & FSESS_PUSH_SYMLINKS_IN) { char *mpth = vnode_mount(vp)->mnt_stat.f_mntonname; err = uiomove(mpth, strlen(mpth), uio); } if (!err) { err = uiomove(fdi.answ, fdi.iosize, uio); } out: fdisp_destroy(&fdi); return err; } /* struct vnop_reclaim_args { struct vnode *a_vp; }; */ static int fuse_vnop_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = curthread; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh, *fufh_tmp; if (!fvdat) { panic("FUSE: no vnode data during recycling"); } LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) { printf("FUSE: vnode being reclaimed with open fufh " "(type=%#x)", fufh->fufh_type); fuse_filehandle_close(vp, fufh, td, NULL); } if (VTOI(vp) == 1) { /* * Don't send FUSE_FORGET for the root inode, because * we never send FUSE_LOOKUP for it (see * fuse_vfsop_root) and we don't want the server to see * mismatched lookup counts. */ struct fuse_data *data; struct vnode *vroot; data = fuse_get_mpdata(vnode_mount(vp)); FUSE_LOCK(); vroot = data->vroot; data->vroot = NULL; FUSE_UNLOCK(); if (vroot) vrele(vroot); } else if (!fuse_isdeadfs(vp) && fvdat->nlookup > 0) { fuse_internal_forget_send(vnode_mount(vp), td, NULL, VTOI(vp), fvdat->nlookup); } cache_purge(vp); vfs_hash_remove(vp); fuse_vnode_destroy(vp); return 0; } /* struct vnop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_isdir(vp)) { return EPERM; } err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK); return err; } /* struct vnop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; }; */ static int fuse_vnop_rename(struct vop_rename_args *ap) { struct vnode *fdvp = ap->a_fdvp; struct vnode *fvp = ap->a_fvp; struct componentname *fcnp = ap->a_fcnp; struct vnode *tdvp = ap->a_tdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct fuse_data *data; bool newparent = fdvp != tdvp; bool isdir = fvp->v_type == VDIR; int err = 0; if (fuse_isdeadfs(fdvp)) { return ENXIO; } if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename"); err = EXDEV; goto out; } cache_purge(fvp); /* * FUSE library is expected to check if target directory is not * under the source directory in the file system tree. * Linux performs this check at VFS level. */ /* * If source is a directory, and it will get a new parent, user must * have write permission to it, so ".." can be modified. */ data = fuse_get_mpdata(vnode_mount(tdvp)); if (data->dataflags & FSESS_DEFAULT_PERMISSIONS && isdir && newparent) { err = fuse_internal_access(fvp, VWRITE, curthread, tcnp->cn_cred); if (err) goto out; } sx_xlock(&data->rename_lock); err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp); if (err == 0) { if (tdvp != fdvp) fuse_vnode_setparent(fvp, tdvp); if (tvp != NULL) fuse_vnode_setparent(tvp, NULL); } sx_unlock(&data->rename_lock); if (tvp != NULL && tvp != fvp) { cache_purge(tvp); } if (vnode_isdir(fvp)) { if (((tvp != NULL) && vnode_isdir(tvp)) || vnode_isdir(fvp)) { cache_purge(tdvp); } cache_purge(fdvp); } out: if (tdvp == tvp) { vrele(tdvp); } else { vput(tdvp); } if (tvp != NULL) { vput(tvp); } vrele(fdvp); vrele(fvp); return err; } /* struct vnop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } *ap; */ static int fuse_vnop_rmdir(struct vop_rmdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp) == VTOFUD(dvp)) { return EINVAL; } err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR); return err; } /* struct vnop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; struct mount *mp; struct fuse_data *data; struct vattr old_va; int dataflags; int err = 0, err2; accmode_t accmode = 0; bool checkperm; bool drop_suid = false; mp = vnode_mount(vp); data = fuse_get_mpdata(mp); dataflags = data->dataflags; checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vap->va_uid != (uid_t)VNOVAL) { if (checkperm) { /* Only root may change a file's owner */ err = priv_check_cred(cred, PRIV_VFS_CHOWN); if (err) { /* As a special case, allow the null chown */ err2 = fuse_internal_getattr(vp, &old_va, cred, td); if (err2) return (err2); if (vap->va_uid != old_va.va_uid) return err; drop_suid = true; } } accmode |= VADMIN; } if (vap->va_gid != (gid_t)VNOVAL) { if (checkperm && priv_check_cred(cred, PRIV_VFS_CHOWN)) drop_suid = true; if (checkperm && !groupmember(vap->va_gid, cred)) { /* * Non-root users may only chgrp to one of their own * groups */ err = priv_check_cred(cred, PRIV_VFS_CHOWN); if (err) { /* As a special case, allow the null chgrp */ err2 = fuse_internal_getattr(vp, &old_va, cred, td); if (err2) return (err2); if (vap->va_gid != old_va.va_gid) return err; } } accmode |= VADMIN; } if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vfs_isrdonly(mp)) return (EROFS); err = vn_rlimit_trunc(vap->va_size, td); if (err) return (err); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support shared memory objects in the file * system, and have dubious support for truncating * symlinks. Just ignore the request in other cases. */ return (0); } /* Don't set accmode. Permission to trunc is checked upstack */ } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vap->va_vaflags & VA_UTIMES_NULL) accmode |= VWRITE; else accmode |= VADMIN; } if (drop_suid) { if (vap->va_mode != (mode_t)VNOVAL) vap->va_mode &= ~(S_ISUID | S_ISGID); else { err = fuse_internal_getattr(vp, &old_va, cred, td); if (err) return (err); vap->va_mode = old_va.va_mode & ~(S_ISUID | S_ISGID); } } if (vap->va_mode != (mode_t)VNOVAL) { /* Only root may set the sticky bit on non-directories */ if (checkperm && vp->v_type != VDIR && (vap->va_mode & S_ISTXT) && priv_check_cred(cred, PRIV_VFS_STICKYFILE)) return EFTYPE; if (checkperm && (vap->va_mode & S_ISGID)) { err = fuse_internal_getattr(vp, &old_va, cred, td); if (err) return (err); if (!groupmember(old_va.va_gid, cred)) { err = priv_check_cred(cred, PRIV_VFS_SETGID); if (err) return (err); } } accmode |= VADMIN; } if (vfs_isrdonly(mp)) return EROFS; if (checkperm) { err = fuse_internal_access(vp, accmode, td, cred); } else { err = 0; } if (err) return err; else return fuse_internal_setattr(vp, vap, td, cred); } /* struct vnop_strategy_args { struct vnode *a_vp; struct buf *a_bp; }; */ static int fuse_vnop_strategy(struct vop_strategy_args *ap) { struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; if (!vp || fuse_isdeadfs(vp)) { bp->b_ioflags |= BIO_ERROR; bp->b_error = ENXIO; bufdone(bp); return 0; } /* * VOP_STRATEGY always returns zero and signals error via bp->b_ioflags. * fuse_io_strategy sets bp's error fields */ (void)fuse_io_strategy(vp, bp); return 0; } /* struct vnop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; */ static int fuse_vnop_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; const char *target = ap->a_target; struct fuse_dispatcher fdi; int err; size_t len; if (fuse_isdeadfs(dvp)) { return ENXIO; } /* * Unlike the other creator type calls, here we have to create a message * where the name of the new entry comes first, and the data describing * the entry comes second. * Hence we can't rely on our handy fuse_internal_newentry() routine, * but put together the message manually and just call the core part. */ len = strlen(target) + 1; fdisp_init(&fdi, len + cnp->cn_namelen + 1); fdisp_make_vp(&fdi, FUSE_SYMLINK, dvp, curthread, NULL); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; memcpy((char *)fdi.indata + cnp->cn_namelen + 1, target, len); err = fuse_internal_newentry_core(dvp, vpp, cnp, VLNK, &fdi); fdisp_destroy(&fdi); return err; } /* struct vnop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_write(struct vop_write_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; struct fuse_filehandle *fufh; int err; bool closefufh = false, directio; MPASS(vp->v_type == VREG || vp->v_type == VDIR); if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } err = fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { /* * nfsd will do I/O without first doing VOP_OPEN. We * must implicitly open the file here */ err = fuse_filehandle_open(vp, FWRITE, &fufh, curthread, cred); closefufh = true; } if (err) { SDT_PROBE3(fusefs, , vnops, filehandles_closed, vp, uio, cred); return err; } /* * Ideally, when the daemon asks for direct io at open time, the * standard file flag should be set according to this, so that would * just change the default mode, which later on could be changed via * fcntl(2). * But this doesn't work, the O_DIRECT flag gets cleared at some point * (don't know where). So to make any use of the Fuse direct_io option, * we hardwire it into the file's private data (similarly to Linux, * btw.). */ directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); if (directio) { off_t start, end, filesize; bool pages = (ioflag & IO_VMIO) != 0; SDT_PROBE2(fusefs, , vnops, trace, 1, "direct write of vnode"); err = fuse_vnode_size(vp, &filesize, cred, curthread); if (err) goto out; start = uio->uio_offset; end = start + uio->uio_resid; if (!pages) { err = fuse_inval_buf_range(vp, filesize, start, end); if (err) goto out; } err = fuse_write_directbackend(vp, uio, cred, fufh, filesize, ioflag, pages); } else { SDT_PROBE2(fusefs, , vnops, trace, 1, "buffered write of vnode"); if (!fsess_opt_writeback(vnode_mount(vp))) ioflag |= IO_SYNC; err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag, pid); } fuse_internal_clear_suid_on_write(vp, cred, uio->uio_td); out: if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); return (err); } static daddr_t fuse_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { const int biosize = fuse_iosize(vp); return (off / biosize); } static int fuse_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *blksz) { off_t filesize; int err; const int biosize = fuse_iosize(vp); err = fuse_vnode_size(vp, &filesize, NULL, NULL); if (err) { /* This will turn into a SIGBUS */ return (EIO); } else if ((off_t)lbn * biosize >= filesize) { *blksz = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { *blksz = filesize - (off_t)lbn *biosize; } else { *blksz = biosize; } return (0); } /* struct vnop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; }; */ static int fuse_vnop_getpages(struct vop_getpages_args *ap) { struct vnode *vp = ap->a_vp; if (!fsess_opt_mmap(vnode_mount(vp))) { SDT_PROBE2(fusefs, , vnops, trace, 1, "called on non-cacheable vnode??\n"); return (VM_PAGER_ERROR); } return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, fuse_gbp_getblkno, fuse_gbp_getblksz)); } static const char extattr_namespace_separator = '.'; /* struct vop_getextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getextattr(struct vop_getextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_getxattr_in *get_xattr_in; struct fuse_getxattr_out *get_xattr_out; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; char *attr_str; size_t len; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (fsess_not_impl(mp, FUSE_GETXATTR)) return EOPNOTSUPP; err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); if (err) return err; /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len + sizeof(*get_xattr_in)); fdisp_make_vp(&fdi, FUSE_GETXATTR, vp, td, cred); get_xattr_in = fdi.indata; /* * Check to see whether we're querying the available size or * issuing the actual request. If we pass in 0, we get back struct * fuse_getxattr_out. If we pass in a non-zero size, we get back * that much data, without the struct fuse_getxattr_out header. */ if (uio == NULL) get_xattr_in->size = 0; else get_xattr_in->size = uio->uio_resid; attr_str = (char *)fdi.indata + sizeof(*get_xattr_in); snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_GETXATTR); err = EOPNOTSUPP; } goto out; } get_xattr_out = fdi.answ; if (ap->a_size != NULL) *ap->a_size = get_xattr_out->size; if (uio != NULL) err = uiomove(fdi.answ, fdi.iosize, uio); out: fdisp_destroy(&fdi); return (err); } /* struct vop_setextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setextattr(struct vop_setextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_setxattr_in *set_xattr_in; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; size_t struct_size = FUSE_COMPAT_SETXATTR_IN_SIZE; char *prefix; size_t len; char *attr_str; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (fsess_not_impl(mp, FUSE_SETXATTR)) return EOPNOTSUPP; if (vfs_isrdonly(mp)) return EROFS; /* Deleting xattrs must use VOP_DELETEEXTATTR instead */ if (ap->a_uio == NULL) { /* * If we got here as fallback from VOP_DELETEEXTATTR, then * return EOPNOTSUPP. */ if (fsess_not_impl(mp, FUSE_REMOVEXATTR)) return (EOPNOTSUPP); else return (EINVAL); } err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VWRITE); if (err) return err; /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; /* older FUSE servers use a smaller fuse_setxattr_in struct*/ if (fuse_libabi_geq(fuse_get_mpdata(mp), 7, 33)) struct_size = sizeof(*set_xattr_in); fdisp_init(&fdi, len + struct_size + uio->uio_resid); fdisp_make_vp(&fdi, FUSE_SETXATTR, vp, td, cred); set_xattr_in = fdi.indata; set_xattr_in->size = uio->uio_resid; if (fuse_libabi_geq(fuse_get_mpdata(mp), 7, 33)) { set_xattr_in->setxattr_flags = 0; set_xattr_in->padding = 0; } attr_str = (char *)fdi.indata + struct_size; snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = uiomove((char *)fdi.indata + struct_size + len, uio->uio_resid, uio); if (err != 0) { goto out; } err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_SETXATTR); err = EOPNOTSUPP; } if (err == ERESTART) { /* Can't restart after calling uiomove */ err = EINTR; } out: fdisp_destroy(&fdi); return (err); } /* * The Linux / FUSE extended attribute list is simply a collection of * NUL-terminated strings. The FreeBSD extended attribute list is a single * byte length followed by a non-NUL terminated string. So, this allows * conversion of the Linux / FUSE format to the FreeBSD format in place. * Linux attribute names are reported with the namespace as a prefix (e.g. * "user.attribute_name"), but in FreeBSD they are reported without the * namespace prefix (e.g. "attribute_name"). So, we're going from: * * user.attr_name1\0user.attr_name2\0 * * to: * * attr_name1attr_name2 * * Where "" is a single byte number of characters in the attribute name. * * Args: * prefix - exattr namespace prefix string * list, list_len - input list with namespace prefixes * bsd_list, bsd_list_len - output list compatible with bsd vfs */ static int fuse_xattrlist_convert(char *prefix, const char *list, int list_len, char *bsd_list, int *bsd_list_len) { int len, pos, dist_to_next, prefix_len; pos = 0; *bsd_list_len = 0; prefix_len = strlen(prefix); while (pos < list_len && list[pos] != '\0') { dist_to_next = strlen(&list[pos]) + 1; if (bcmp(&list[pos], prefix, prefix_len) == 0 && list[pos + prefix_len] == extattr_namespace_separator) { len = dist_to_next - (prefix_len + sizeof(extattr_namespace_separator)) - 1; if (len >= EXTATTR_MAXNAMELEN) return (ENAMETOOLONG); bsd_list[*bsd_list_len] = len; memcpy(&bsd_list[*bsd_list_len + 1], &list[pos + prefix_len + sizeof(extattr_namespace_separator)], len); *bsd_list_len += len + 1; } pos += dist_to_next; } return (0); } /* * List extended attributes * * The FUSE_LISTXATTR operation is based on Linux's listxattr(2) syscall, which * has a number of differences compared to its FreeBSD equivalent, * extattr_list_file: * * - FUSE_LISTXATTR returns all extended attributes across all namespaces, * whereas listxattr(2) only returns attributes for a single namespace * - FUSE_LISTXATTR prepends each attribute name with "namespace." * - If the provided buffer is not large enough to hold the result, * FUSE_LISTXATTR should return ERANGE, whereas listxattr is expected to * return as many results as will fit. */ /* struct vop_listextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_listextattr(struct vop_listextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_listxattr_in *list_xattr_in; struct fuse_listxattr_out *list_xattr_out; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; char *bsd_list = NULL; char *linux_list; int bsd_list_len; int linux_list_len; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (fsess_not_impl(mp, FUSE_LISTXATTR)) return EOPNOTSUPP; err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); if (err) return err; /* * Add space for a NUL and the period separator if enabled. * Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; fdisp_init(&fdi, sizeof(*list_xattr_in)); fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); /* * Retrieve Linux / FUSE compatible list size. */ list_xattr_in = fdi.indata; list_xattr_in->size = 0; err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_LISTXATTR); err = EOPNOTSUPP; } goto out; } list_xattr_out = fdi.answ; linux_list_len = list_xattr_out->size; if (linux_list_len == 0) { if (ap->a_size != NULL) *ap->a_size = linux_list_len; goto out; } /* * Retrieve Linux / FUSE compatible list values. */ fdisp_refresh_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); list_xattr_in = fdi.indata; list_xattr_in->size = linux_list_len; err = fdisp_wait_answ(&fdi); if (err == ERANGE) { /* * Race detected. The attribute list must've grown since the * first FUSE_LISTXATTR call. Start over. Go all the way back * to userland so we can process signals, if necessary, before * restarting. */ err = ERESTART; goto out; } else if (err != 0) goto out; linux_list = fdi.answ; /* FUSE doesn't allow the server to return more data than requested */ if (fdi.iosize > linux_list_len) { struct fuse_data *data = fuse_get_mpdata(mp); fuse_warn(data, FSESS_WARN_LSEXTATTR_LONG, "server returned " "more extended attribute data than requested; " "should've returned ERANGE instead."); } else { /* But returning less data is fine */ linux_list_len = fdi.iosize; } /* * Retrieve the BSD compatible list values. * The Linux / FUSE attribute list format isn't the same * as FreeBSD's format. So we need to transform it into * FreeBSD's format before giving it to the user. */ bsd_list = malloc(linux_list_len, M_TEMP, M_WAITOK); err = fuse_xattrlist_convert(prefix, linux_list, linux_list_len, bsd_list, &bsd_list_len); if (err != 0) goto out; if (ap->a_size != NULL) *ap->a_size = bsd_list_len; if (uio != NULL) err = uiomove(bsd_list, bsd_list_len, uio); out: free(bsd_list, M_TEMP); fdisp_destroy(&fdi); return (err); } /* struct vop_deallocate_args { struct vop_generic_args a_gen; struct vnode *a_vp; off_t *a_offset; off_t *a_len; int a_flags; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_deallocate(struct vop_deallocate_args *ap) { struct vnode *vp = ap->a_vp; struct mount *mp = vnode_mount(vp); struct fuse_filehandle *fufh; struct fuse_dispatcher fdi; struct fuse_fallocate_in *ffi; struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; off_t *len = ap->a_len; off_t *offset = ap->a_offset; int ioflag = ap->a_ioflag; off_t filesize; int err; bool closefufh = false; if (fuse_isdeadfs(vp)) return (ENXIO); if (vfs_isrdonly(mp)) return (EROFS); if (fsess_not_impl(mp, FUSE_FALLOCATE)) goto fallback; err = fuse_filehandle_getrw(vp, FWRITE, &fufh, cred, pid); if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { /* * nfsd will do I/O without first doing VOP_OPEN. We * must implicitly open the file here */ err = fuse_filehandle_open(vp, FWRITE, &fufh, curthread, cred); closefufh = true; } if (err) return (err); fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); err = fuse_vnode_size(vp, &filesize, cred, curthread); if (err) goto out; fuse_inval_buf_range(vp, filesize, *offset, *offset + *len); fdisp_init(&fdi, sizeof(*ffi)); fdisp_make_vp(&fdi, FUSE_FALLOCATE, vp, curthread, cred); ffi = fdi.indata; ffi->fh = fufh->fh_id; ffi->offset = *offset; ffi->length = *len; /* * FreeBSD's fspacectl is equivalent to Linux's fallocate with * mode == FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE */ ffi->mode = FUSE_FALLOC_FL_PUNCH_HOLE | FUSE_FALLOC_FL_KEEP_SIZE; err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fdisp_destroy(&fdi); fsess_set_notimpl(mp, FUSE_FALLOCATE); goto fallback; } else if (err == EOPNOTSUPP) { /* * The file system server does not support FUSE_FALLOCATE with * the supplied mode for this particular file. */ fdisp_destroy(&fdi); goto fallback; } else if (!err) { /* * Clip the returned offset to EoF. Do it here rather than * before FUSE_FALLOCATE just in case the kernel's cached file * size is out of date. Unfortunately, FUSE does not return * any information about filesize from that operation. */ *offset = MIN(*offset + *len, filesize); *len = 0; fuse_vnode_undirty_cached_timestamps(vp, false); fuse_internal_clear_suid_on_write(vp, cred, curthread); if (ioflag & IO_SYNC) err = fuse_internal_fsync(vp, curthread, MNT_WAIT, false); } fdisp_destroy(&fdi); out: if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); return (err); fallback: if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); return (vop_stddeallocate(ap)); } /* struct vop_deleteextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap) { struct vnode *vp = ap->a_vp; struct fuse_dispatcher fdi; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; size_t len; char *attr_str; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (fsess_not_impl(mp, FUSE_REMOVEXATTR)) return EOPNOTSUPP; if (vfs_isrdonly(mp)) return EROFS; err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VWRITE); if (err) return err; /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len); fdisp_make_vp(&fdi, FUSE_REMOVEXATTR, vp, td, cred); attr_str = fdi.indata; snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_REMOVEXATTR); err = EOPNOTSUPP; } fdisp_destroy(&fdi); return (err); } /* struct vnop_print_args { struct vnode *a_vp; }; */ static int fuse_vnop_print(struct vop_print_args *ap) { struct fuse_vnode_data *fvdat = VTOFUD(ap->a_vp); printf("nodeid: %ju, parent nodeid: %ju, nlookup: %ju, flag: %#x\n", (uintmax_t)VTOILLU(ap->a_vp), (uintmax_t)fvdat->parent_nid, (uintmax_t)fvdat->nlookup, fvdat->flag); return 0; } /* * Get an NFS filehandle for a FUSE file. * * This will only work for FUSE file systems that guarantee the uniqueness of * nodeid:generation, which most don't. */ /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ static int fuse_vnop_vptofh(struct vop_vptofh_args *ap) { struct vnode *vp = ap->a_vp; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_fid *fhp = (struct fuse_fid *)(ap->a_fhp); _Static_assert(sizeof(struct fuse_fid) <= sizeof(struct fid), "FUSE fid type is too big"); struct mount *mp = vnode_mount(vp); struct fuse_data *data = fuse_get_mpdata(mp); struct vattr va; int err; if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) { /* NFS requires lookups for "." and ".." */ SDT_PROBE2(fusefs, , vnops, trace, 1, "VOP_VPTOFH without FUSE_EXPORT_SUPPORT"); return EOPNOTSUPP; } if ((mp->mnt_flag & MNT_EXPORTED) && fsess_is_impl(mp, FUSE_OPENDIR)) { /* * NFS is stateless, so nfsd must reopen a directory on every * call to VOP_READDIR, passing in the d_off field from the * final dirent of the previous invocation. But if the server * implements FUSE_OPENDIR, the FUSE protocol does not * guarantee that d_off will be valid after a directory is * closed and reopened. So prohibit exporting FUSE file * systems that implement FUSE_OPENDIR. * * But userspace NFS servers don't have this problem. */ SDT_PROBE2(fusefs, , vnops, trace, 1, "VOP_VPTOFH with FUSE_OPENDIR"); return EOPNOTSUPP; } err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread); if (err) return err; /*ip = VTOI(ap->a_vp);*/ /*ufhp = (struct ufid *)ap->a_fhp;*/ fhp->len = sizeof(struct fuse_fid); fhp->nid = fvdat->nid; if (fvdat->generation <= UINT32_MAX) fhp->gen = fvdat->generation; else return EOVERFLOW; return (0); } diff --git a/sys/fs/p9fs/p9fs_vnops.c b/sys/fs/p9fs/p9fs_vnops.c index cfb50c704019..56bf766ef801 100644 --- a/sys/fs/p9fs/p9fs_vnops.c +++ b/sys/fs/p9fs/p9fs_vnops.c @@ -1,2230 +1,2230 @@ /* * Copyright (c) 2017-2020 Juniper Networks, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ /* This file contains VFS file ops for the 9P protocol. * This makes the upper layer of the p9fs driver. These functions interact * with the VFS layer and lower layer of p9fs driver which is 9Pnet. All * the user file operations are handled here. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* File permissions. */ #define IEXEC 0000100 /* Executable. */ #define IWRITE 0000200 /* Writeable. */ #define IREAD 0000400 /* Readable. */ #define ISVTX 0001000 /* Sticky bit. */ #define ISGID 0002000 /* Set-gid. */ #define ISUID 0004000 /* Set-uid. */ static MALLOC_DEFINE(M_P9UIOV, "uio", "UIOV structures for strategy in p9fs"); extern uma_zone_t p9fs_io_buffer_zone; extern uma_zone_t p9fs_getattr_zone; extern uma_zone_t p9fs_setattr_zone; extern uma_zone_t p9fs_pbuf_zone; /* For the root vnode's vnops. */ struct vop_vector p9fs_vnops; static uint32_t p9fs_unix2p9_mode(uint32_t mode); static void p9fs_itimes(struct vnode *vp) { struct p9fs_node *node; struct timespec ts; struct p9fs_inode *inode; node = P9FS_VTON(vp); inode = &node->inode; vfs_timestamp(&ts); inode->i_mtime = ts.tv_sec; } /* * Cleanup the p9fs node, the in memory representation of a vnode for p9fs. * The cleanup includes invalidating all cache entries for the vnode, * destroying the vobject, removing vnode from hashlist, removing p9fs node * from the list of session p9fs nodes, and disposing of the p9fs node. * Basically it is doing a reverse of what a create/vget does. */ void p9fs_cleanup(struct p9fs_node *np) { struct vnode *vp; struct p9fs_session *vses; if (np == NULL) return; vp = P9FS_NTOV(np); vses = np->p9fs_ses; /* Remove the vnode from hash list if vnode is not already deleted */ if ((np->flags & P9FS_NODE_DELETED) == 0) vfs_hash_remove(vp); P9FS_LOCK(vses); if ((np->flags & P9FS_NODE_IN_SESSION) != 0) { np->flags &= ~P9FS_NODE_IN_SESSION; STAILQ_REMOVE(&vses->virt_node_list, np, p9fs_node, p9fs_node_next); } else { P9FS_UNLOCK(vses); return; } P9FS_UNLOCK(vses); /* Invalidate all entries to a particular vnode. */ cache_purge(vp); /* Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); /* Remove all the FID */ p9fs_fid_remove_all(np, FALSE); /* Dispose all node knowledge.*/ p9fs_destroy_node(&np); } /* * Reclaim VOP is defined to be called for every vnode. This starts off * the cleanup by clunking(remove the fid on the server) and calls * p9fs_cleanup to free all the resources allocated for p9fs node. */ static int p9fs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; struct p9fs_node *np; vp = ap->a_vp; np = P9FS_VTON(vp); P9_DEBUG(VOPS, "%s: vp:%p node:%p\n", __func__, vp, np); p9fs_cleanup(np); return (0); } /* * recycle vnodes which are no longer referenced i.e, their usecount is zero */ static int p9fs_inactive(struct vop_inactive_args *ap) { struct vnode *vp; struct p9fs_node *np; vp = ap->a_vp; np = P9FS_VTON(vp); P9_DEBUG(VOPS, "%s: vp:%p node:%p file:%s\n", __func__, vp, np, np->inode.i_name); if (np->flags & P9FS_NODE_DELETED) vrecycle(vp); return (0); } struct p9fs_lookup_alloc_arg { struct componentname *cnp; struct p9fs_node *dnp; struct p9_fid *newfid; }; /* Callback for vn_get_ino */ static int p9fs_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { struct p9fs_lookup_alloc_arg *p9aa = arg; return (p9fs_vget_common(mp, NULL, p9aa->cnp->cn_lkflags, p9aa->dnp, p9aa->newfid, vpp, p9aa->cnp->cn_nameptr)); } /* * p9fs_lookup is called for every component name that is being searched for. * * I. If component is found on the server, we look for the in-memory * repesentation(vnode) of this component in namecache. * A. If the node is found in the namecache, we check is the vnode is still * valid. * 1. If it is still valid, return vnode. * 2. If it is not valid, we remove this vnode from the name cache and * create a new vnode for the component and return that vnode. * B. If the vnode is not found in the namecache, we look for it in the * hash list. * 1. If the vnode is in the hash list, we check if the vnode is still * valid. * a. If it is still valid, we add that vnode to the namecache for * future lookups and return the vnode. * b. If it is not valid, create a new vnode and p9fs node, * initialize them and return the vnode. * 2. If the vnode is not found in the hash list, we create a new vnode * and p9fs node, initialize them and return the vnode. * II. If the component is not found on the server, an error code is returned. * A. For the creation case, we return EJUSTRETURN so VFS can handle it. * B. For all other cases, ENOENT is returned. */ static int p9fs_lookup(struct vop_lookup_args *ap) { struct vnode *dvp; struct vnode **vpp, *vp; struct componentname *cnp; struct p9fs_node *dnp; /*dir p9_node */ struct p9fs_node *np; struct p9fs_session *vses; struct mount *mp; /* Get the mount point */ struct p9_fid *dvfid, *newfid; + uint64_t flags; int error; struct vattr vattr; - int flags; char tmpchr; dvp = ap->a_dvp; vpp = ap->a_vpp; cnp = ap->a_cnp; dnp = P9FS_VTON(dvp); error = 0; flags = cnp->cn_flags; *vpp = NULLVP; if (dnp == NULL) return (ENOENT); if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { vref(dvp); *vpp = dvp; return (0); } vses = dnp->p9fs_ses; mp = vses->p9fs_mount; /* Do the cache part ourselves */ if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, curthread); if (error) return (error); /* Do the directory walk on host to check if file exist */ dvfid = p9fs_get_fid(vses->clnt, dnp, cnp->cn_cred, VFID, -1, &error); if (error) return (error); /* * Save the character present at namelen in nameptr string and * null terminate the character to get the search name for p9_dir_walk * This is done to handle when lookup is for "a" and component * name contains a/b/c */ tmpchr = cnp->cn_nameptr[cnp->cn_namelen]; cnp->cn_nameptr[cnp->cn_namelen] = '\0'; /* * If the client_walk fails, it means the file looking for doesnt exist. * Create the file is the flags are set or just return the error */ newfid = p9_client_walk(dvfid, 1, &cnp->cn_nameptr, 1, &error); cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; if (error != 0 || newfid == NULL) { /* Clunk the newfid if it is not NULL */ if (newfid != NULL) p9_client_clunk(newfid); if (error != ENOENT) return (error); /* The requested file was not found. */ if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN)) { if (mp->mnt_flag & MNT_RDONLY) return (EROFS); error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, curthread); if (!error) { return (EJUSTRETURN); } } return (error); } /* Look for the entry in the component cache*/ error = cache_lookup(dvp, vpp, cnp, NULL, NULL); if (error > 0 && error != ENOENT) { P9_DEBUG(VOPS, "%s: Cache lookup error %d \n", __func__, error); goto out; } if (error == -1) { vp = *vpp; /* Check if the entry in cache is stale or not */ if ((p9fs_node_cmp(vp, &newfid->qid) == 0) && ((error = VOP_GETATTR(vp, &vattr, cnp->cn_cred)) == 0)) { goto out; } /* * This case, we have an error coming from getattr, * act accordingly. */ cache_purge(vp); if (dvp != vp) vput(vp); else vrele(vp); *vpp = NULLVP; } else if (error == ENOENT) { if (VN_IS_DOOMED(dvp)) goto out; if (VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0) { error = ENOENT; goto out; } cache_purge_negative(dvp); } /* Reset values */ error = 0; vp = NULLVP; tmpchr = cnp->cn_nameptr[cnp->cn_namelen]; cnp->cn_nameptr[cnp->cn_namelen] = '\0'; /* * Looks like we have found an entry. Now take care of all other cases. */ if (flags & ISDOTDOT) { struct p9fs_lookup_alloc_arg p9aa; p9aa.cnp = cnp; p9aa.dnp = dnp; p9aa.newfid = newfid; error = vn_vget_ino_gen(dvp, p9fs_lookup_alloc, &p9aa, 0, &vp); if (error) goto out; *vpp = vp; } else { /* * client_walk is equivalent to searching a component name in a * directory(fid) here. If new fid is returned, we have found an * entry for this component name so, go and create the rest of * the vnode infra(vget_common) for the returned newfid. */ if ((cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, curthread); if (error) goto out; error = p9fs_vget_common(mp, NULL, cnp->cn_lkflags, dnp, newfid, &vp, cnp->cn_nameptr); if (error) goto out; *vpp = vp; np = P9FS_VTON(vp); if ((dnp->inode.i_mode & ISVTX) && cnp->cn_cred->cr_uid != 0 && cnp->cn_cred->cr_uid != dnp->inode.n_uid && cnp->cn_cred->cr_uid != np->inode.n_uid) { vput(*vpp); *vpp = NULL; cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; return (EPERM); } } else { error = p9fs_vget_common(mp, NULL, cnp->cn_lkflags, dnp, newfid, &vp, cnp->cn_nameptr); if (error) goto out; *vpp = vp; } } cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; /* Store the result the cache if MAKEENTRY is specified in flags */ if ((cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, *vpp, cnp); return (error); out: cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; p9_client_clunk(newfid); return (error); } /* * Common creation function for file/directory with respective flags. We first * open the parent directory in order to create the file under it. For this, * as 9P protocol suggests, we need to call client_walk to create the open fid. * Once we have the open fid, the file_create function creates the direntry with * the name and perm specified under the parent dir. If this succeeds (an entry * is created for the new file on the server), we create our metadata for this * file (vnode, p9fs node calling vget). Once we are done, we clunk the open * fid of the parent directory. */ static int create_common(struct p9fs_node *dnp, struct componentname *cnp, char *extension, uint32_t perm, uint8_t mode, struct vnode **vpp) { char tmpchr; struct p9_fid *dvfid, *ofid, *newfid; struct p9fs_session *vses; struct mount *mp; int error; P9_DEBUG(VOPS, "%s: name %s\n", __func__, cnp->cn_nameptr); vses = dnp->p9fs_ses; mp = vses->p9fs_mount; newfid = NULL; error = 0; dvfid = p9fs_get_fid(vses->clnt, dnp, cnp->cn_cred, VFID, -1, &error); if (error != 0) return (error); /* Clone the directory fid to create the new file */ ofid = p9_client_walk(dvfid, 0, NULL, 1, &error); if (error != 0) return (error); /* * Save the character present at namelen in nameptr string and * null terminate the character to get the search name for p9_dir_walk */ tmpchr = cnp->cn_nameptr[cnp->cn_namelen]; cnp->cn_nameptr[cnp->cn_namelen] = '\0'; error = p9_client_file_create(ofid, cnp->cn_nameptr, perm, mode, extension); if (error != 0) { P9_DEBUG(ERROR, "%s: p9_client_fcreate failed %d\n", __func__, error); goto out; } /* If its not hardlink only then do the walk, else we are done. */ if (!(perm & P9PROTO_DMLINK)) { /* * Do the lookup part and add the vnode, p9fs node. Note that vpp * is filled in here. */ newfid = p9_client_walk(dvfid, 1, &cnp->cn_nameptr, 1, &error); if (newfid != NULL) { error = p9fs_vget_common(mp, NULL, cnp->cn_lkflags, dnp, newfid, vpp, cnp->cn_nameptr); if (error != 0) goto out; } else { /* Not found return NOENTRY.*/ goto out; } if ((cnp->cn_flags & MAKEENTRY) != 0) cache_enter(P9FS_NTOV(dnp), *vpp, cnp); } P9_DEBUG(VOPS, "%s: created file under vp %p node %p fid %ju\n", __func__, *vpp, dnp, (uintmax_t)dvfid->fid); /* Clunk the open ofid. */ if (ofid != NULL) (void)p9_client_clunk(ofid); cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; return (0); out: if (ofid != NULL) (void)p9_client_clunk(ofid); if (newfid != NULL) (void)p9_client_clunk(newfid); cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; return (error); } /* * This is the main file creation VOP. Make the permissions of the new * file and call the create_common common code to complete the create. */ static int p9fs_create(struct vop_create_args *ap) { struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; uint32_t mode; struct p9fs_node *dnp; struct p9fs_inode *dinode; uint32_t perm; int ret; dvp = ap->a_dvp; vpp = ap->a_vpp; cnp = ap->a_cnp; dnp = P9FS_VTON(dvp); dinode = &dnp->inode; mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode); perm = p9fs_unix2p9_mode(mode); P9_DEBUG(VOPS, "%s: dvp %p\n", __func__, dvp); ret = create_common(dnp, cnp, NULL, perm, P9PROTO_ORDWR, vpp); if (ret == 0) { P9FS_INCR_LINKS(dinode); } return (ret); } /* * p9fs_mkdir is the main directory creation vop. Make the permissions of the new dir * and call the create_common common code to complete the create. */ static int p9fs_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; uint32_t mode; struct p9fs_node *dnp; struct p9fs_inode *dinode; uint32_t perm; int ret; dvp = ap->a_dvp; vpp = ap->a_vpp; cnp = ap->a_cnp; dnp = P9FS_VTON(dvp); dinode = &dnp->inode; mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode); perm = p9fs_unix2p9_mode(mode | S_IFDIR); P9_DEBUG(VOPS, "%s: dvp %p\n", __func__, dvp); ret = create_common(dnp, cnp, NULL, perm, P9PROTO_ORDWR, vpp); if (ret == 0) P9FS_INCR_LINKS(dinode); return (ret); } /* * p9fs_mknod is the main node creation vop. Make the permissions of the new node * and call the create_common common code to complete the create. */ static int p9fs_mknod(struct vop_mknod_args *ap) { struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; uint32_t mode; struct p9fs_node *dnp; struct p9fs_inode *dinode; uint32_t perm; int ret; dvp = ap->a_dvp; vpp = ap->a_vpp; cnp = ap->a_cnp; dnp = P9FS_VTON(dvp); dinode = &dnp->inode; mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode); perm = p9fs_unix2p9_mode(mode); P9_DEBUG(VOPS, "%s: dvp %p\n", __func__, dvp); ret = create_common(dnp, cnp, NULL, perm, P9PROTO_OREAD, vpp); if (ret == 0) { P9FS_INCR_LINKS(dinode); } return (ret); } /* Convert open mode permissions to P9 */ static int p9fs_uflags_mode(int uflags, int extended) { uint32_t ret; /* Convert first to O flags.*/ uflags = OFLAGS(uflags); switch (uflags & 3) { case O_RDONLY: ret = P9PROTO_OREAD; break; case O_WRONLY: ret = P9PROTO_OWRITE; break; case O_RDWR: ret = P9PROTO_ORDWR; break; } if (extended) { if (uflags & O_EXCL) ret |= P9PROTO_OEXCL; if (uflags & O_APPEND) ret |= P9PROTO_OAPPEND; } return (ret); } /* * This is the main open VOP for every file open. If the file is already * open, then increment and return. If there is no open fid for this file, * there needs to be a client_walk which creates a new open fid for this file. * Once we have a open fid, call the open on this file with the mode creating * the vobject. */ static int p9fs_open(struct vop_open_args *ap) { int error; struct vnode *vp; struct p9fs_node *np; struct p9fs_session *vses; struct p9_fid *vofid, *vfid; size_t filesize; uint32_t mode; error = 0; vp = ap->a_vp; np = P9FS_VTON(vp); vses = np->p9fs_ses; P9_DEBUG(VOPS, "%s: vp %p\n", __func__, vp); if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) return (EOPNOTSUPP); error = p9fs_reload_stats_dotl(vp, ap->a_cred); if (error != 0) return (error); ASSERT_VOP_LOCKED(vp, __func__); /* * Invalidate the pages of the vm_object cache if the file is modified * based on the flag set in reload stats */ if (vp->v_type == VREG && (np->flags & P9FS_NODE_MODIFIED) != 0) { error = vinvalbuf(vp, 0, 0, 0); if (error != 0) return (error); np->flags &= ~P9FS_NODE_MODIFIED; } vfid = p9fs_get_fid(vses->clnt, np, ap->a_cred, VFID, -1, &error); if (error != 0) return (error); /* * Translate kernel fflags to 9p mode */ mode = p9fs_uflags_mode(ap->a_mode, 1); /* * Search the fid in vofid_list for current user. If found increase the open * count and return. If not found clone a new fid and open the file using * that cloned fid. */ vofid = p9fs_get_fid(vses->clnt, np, ap->a_cred, VOFID, mode, &error); if (vofid != NULL) { vofid->v_opens++; return (0); } else { /*vofid is the open fid for this file.*/ vofid = p9_client_walk(vfid, 0, NULL, 1, &error); if (error != 0) return (error); } error = p9_client_open(vofid, mode); if (error != 0) p9_client_clunk(vofid); else { vofid->v_opens = 1; filesize = np->inode.i_size; vnode_create_vobject(vp, filesize, ap->a_td); p9fs_fid_add(np, vofid, VOFID); } return (error); } /* * Close the open references. Just reduce the open count on vofid and return. * Let clunking of VOFID happen in p9fs_reclaim. */ static int p9fs_close(struct vop_close_args *ap) { struct vnode *vp; struct p9fs_node *np; struct p9fs_session *vses; struct p9_fid *vofid; int error; vp = ap->a_vp; np = P9FS_VTON(vp); if (np == NULL) return (0); vses = np->p9fs_ses; error = 0; P9_DEBUG(VOPS, "%s: file_name %s\n", __func__, np->inode.i_name); /* * Translate kernel fflags to 9p mode */ vofid = p9fs_get_fid(vses->clnt, np, ap->a_cred, VOFID, p9fs_uflags_mode(ap->a_fflag, 1), &error); if (vofid == NULL) return (0); vofid->v_opens--; return (0); } /* Helper routine for checking if fileops are possible on this file */ static int p9fs_check_possible(struct vnode *vp, struct vattr *vap, mode_t mode) { /* Check if we are allowed to write */ switch (vap->va_type) { case VDIR: case VLNK: case VREG: /* * Normal nodes: check if we're on a read-only mounted * file system and bail out if we're trying to write. */ if ((mode & VMODIFY_PERMS) && (vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); break; case VBLK: case VCHR: case VSOCK: case VFIFO: /* * Special nodes: even on read-only mounted file systems * these are allowed to be written to if permissions allow. */ break; default: /* No idea what this is */ return (EINVAL); } return (0); } /* Check the access permissions of the file. */ static int p9fs_access(struct vop_access_args *ap) { struct vnode *vp; accmode_t accmode; struct ucred *cred; struct vattr vap; int error; vp = ap->a_vp; accmode = ap->a_accmode; cred = ap->a_cred; P9_DEBUG(VOPS, "%s: vp %p\n", __func__, vp); /* make sure getattr is working correctly and is defined.*/ error = VOP_GETATTR(vp, &vap, cred); if (error != 0) return (error); error = p9fs_check_possible(vp, &vap, accmode); if (error != 0) return (error); /* Call the Generic Access check in VOPS*/ error = vaccess(vp->v_type, vap.va_mode, vap.va_uid, vap.va_gid, accmode, cred); return (error); } /* * Reload the file stats from the server and update the inode structure present * in p9fs node. */ int p9fs_reload_stats_dotl(struct vnode *vp, struct ucred *cred) { struct p9_stat_dotl *stat; int error; struct p9fs_node *node; struct p9fs_session *vses; struct p9_fid *vfid; error = 0; node = P9FS_VTON(vp); vses = node->p9fs_ses; vfid = p9fs_get_fid(vses->clnt, node, cred, VOFID, P9PROTO_OREAD, &error); if (vfid == NULL) { vfid = p9fs_get_fid(vses->clnt, node, cred, VFID, -1, &error); if (error) return (error); } stat = uma_zalloc(p9fs_getattr_zone, M_WAITOK | M_ZERO); error = p9_client_getattr(vfid, stat, P9PROTO_STATS_ALL); if (error != 0) { P9_DEBUG(ERROR, "%s: p9_client_getattr failed: %d\n", __func__, error); goto out; } /* Init the vnode with the disk info */ p9fs_stat_vnode_dotl(stat, vp); out: if (stat != NULL) { uma_zfree(p9fs_getattr_zone, stat); } return (error); } /* * Read the current inode values into the vap attr. We reload the stats from * the server. */ static int p9fs_getattr_dotl(struct vop_getattr_args *ap) { struct vnode *vp; struct vattr *vap; struct p9fs_node *node; struct p9fs_inode *inode; int error; vp = ap->a_vp; vap = ap->a_vap; node = P9FS_VTON(vp); if (node == NULL) return (ENOENT); inode = &node->inode; P9_DEBUG(VOPS, "%s: %u %u\n", __func__, inode->i_mode, IFTOVT(inode->i_mode)); /* Reload our stats once to get the right values.*/ error = p9fs_reload_stats_dotl(vp, ap->a_cred); if (error != 0) { P9_DEBUG(ERROR, "%s: failed: %d\n", __func__, error); return (error); } /* Basic info */ VATTR_NULL(vap); vap->va_atime.tv_sec = inode->i_atime; vap->va_mtime.tv_sec = inode->i_mtime; vap->va_ctime.tv_sec = inode->i_ctime; vap->va_atime.tv_nsec = inode->i_atime_nsec; vap->va_mtime.tv_nsec = inode->i_mtime_nsec; vap->va_ctime.tv_nsec = inode->i_ctime_nsec; vap->va_type = IFTOVT(inode->i_mode); vap->va_mode = inode->i_mode; vap->va_uid = inode->n_uid; vap->va_gid = inode->n_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_size = inode->i_size; vap->va_nlink = inode->i_links_count; vap->va_blocksize = inode->blksize; vap->va_fileid = inode->i_qid_path; vap->va_flags = inode->i_flags; vap->va_gen = inode->gen; vap->va_filerev = inode->data_version; vap->va_vaflags = 0; vap->va_bytes = inode->blocks * P9PROTO_TGETATTR_BLK; return (0); } /* Convert a standard FreeBSD permission to P9. */ static uint32_t p9fs_unix2p9_mode(uint32_t mode) { uint32_t res; res = mode & 0777; if (S_ISDIR(mode)) res |= P9PROTO_DMDIR; if (S_ISSOCK(mode)) res |= P9PROTO_DMSOCKET; if (S_ISLNK(mode)) res |= P9PROTO_DMSYMLINK; if (S_ISFIFO(mode)) res |= P9PROTO_DMNAMEDPIPE; if ((mode & S_ISUID) == S_ISUID) res |= P9PROTO_DMSETUID; if ((mode & S_ISGID) == S_ISGID) res |= P9PROTO_DMSETGID; if ((mode & S_ISVTX) == S_ISVTX) res |= P9PROTO_DMSETVTX; return (res); } /* Update inode with the stats read from server.(9P2000.L version) */ int p9fs_stat_vnode_dotl(struct p9_stat_dotl *stat, struct vnode *vp) { struct p9fs_node *np; struct p9fs_inode *inode; np = P9FS_VTON(vp); inode = &np->inode; ASSERT_VOP_LOCKED(vp, __func__); /* Update the pager size if file size changes on host */ if (inode->i_size != stat->st_size) { inode->i_size = stat->st_size; if (vp->v_type == VREG) vnode_pager_setsize(vp, inode->i_size); } inode->i_mtime = stat->st_mtime_sec; inode->i_atime = stat->st_atime_sec; inode->i_ctime = stat->st_ctime_sec; inode->i_mtime_nsec = stat->st_mtime_nsec; inode->i_atime_nsec = stat->st_atime_nsec; inode->i_ctime_nsec = stat->st_ctime_nsec; inode->n_uid = stat->st_uid; inode->n_gid = stat->st_gid; inode->i_mode = stat->st_mode; vp->v_type = IFTOVT(inode->i_mode); inode->i_links_count = stat->st_nlink; inode->blksize = stat->st_blksize; inode->blocks = stat->st_blocks; inode->gen = stat->st_gen; inode->data_version = stat->st_data_version; ASSERT_VOP_LOCKED(vp, __func__); /* Setting a flag if file changes based on qid version */ if (np->vqid.qid_version != stat->qid.version) np->flags |= P9FS_NODE_MODIFIED; memcpy(&np->vqid, &stat->qid, sizeof(stat->qid)); return (0); } /* * Write the current in memory inode stats into persistent stats structure * to write to the server(for linux version). */ static int p9fs_inode_to_iattr(struct p9fs_inode *inode, struct p9_iattr_dotl *p9attr) { p9attr->size = inode->i_size; p9attr->mode = inode->i_mode; p9attr->uid = inode->n_uid; p9attr->gid = inode->n_gid; p9attr->atime_sec = inode->i_atime; p9attr->atime_nsec = inode->i_atime_nsec; p9attr->mtime_sec = inode->i_mtime; p9attr->mtime_nsec = inode->i_mtime_nsec; return (0); } /* * Modify the ownership of a file whenever the chown is called on the * file. */ static int p9fs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct thread *td) { struct p9fs_node *np; struct p9fs_inode *inode; uid_t ouid; gid_t ogid; int error; np = P9FS_VTON(vp); inode = &np->inode; if (uid == (uid_t)VNOVAL) uid = inode->n_uid; if (gid == (gid_t)VNOVAL) gid = inode->n_gid; /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td))) return (error); /* * To change the owner of a file, or change the group of a file to a * group of which we are not a member, the caller must have * privilege. */ if (((uid != inode->n_uid && uid != cred->cr_uid) || (gid != inode->n_gid && !groupmember(gid, cred))) && (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) return (error); ogid = inode->n_gid; ouid = inode->n_uid; inode->n_gid = gid; inode->n_uid = uid; if ((inode->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) inode->i_mode &= ~(ISUID | ISGID); } P9_DEBUG(VOPS, "%s: vp %p, cred %p, td %p - ret OK\n", __func__, vp, cred, td); return (0); } /* * Update the in memory inode with all chmod new permissions/mode. Typically a * setattr is called to update it to server. */ static int p9fs_chmod(struct vnode *vp, uint32_t mode, struct ucred *cred, struct thread *td) { struct p9fs_node *np; struct p9fs_inode *inode; uint32_t nmode; int error; np = P9FS_VTON(vp); inode = &np->inode; P9_DEBUG(VOPS, "%s: vp %p, mode %x, cred %p, td %p\n", __func__, vp, mode, cred, td); /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. Both of these are allowed in * jail(8). */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) return (EFTYPE); } if (!groupmember(inode->n_gid, cred) && (mode & ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID); if (error != 0) return (error); } /* * Deny setting setuid if we are not the file owner. */ if ((mode & ISUID) && inode->n_uid != cred->cr_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN); if (error != 0) return (error); } nmode = inode->i_mode; nmode &= ~ALLPERMS; nmode |= (mode & ALLPERMS); inode->i_mode = nmode; P9_DEBUG(VOPS, "%s: to mode %x %d \n ", __func__, nmode, error); return (error); } /* * Set the attributes of a file referenced by fid. A valid bitmask is sent * in request selecting which fields to set */ static int p9fs_setattr_dotl(struct vop_setattr_args *ap) { struct vnode *vp; struct vattr *vap; struct p9fs_node *node; struct p9fs_inode *inode; struct ucred *cred; struct thread *td; struct p9_iattr_dotl *p9attr; struct p9fs_session *vses; struct p9_fid *vfid; uint64_t oldfilesize; int error; vp = ap->a_vp; vap = ap->a_vap; node = P9FS_VTON(vp); inode = &node->inode; cred = ap->a_cred; td = curthread; vses = node->p9fs_ses; error = 0; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { P9_DEBUG(ERROR, "%s: unsettable attribute\n", __func__); return (EINVAL); } /* Disallow write attempts on read only filesystem */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* Setting of flags is not supported */ if (vap->va_flags != VNOVAL) return (EOPNOTSUPP); /* Allocate p9attr struct */ p9attr = uma_zalloc(p9fs_setattr_zone, M_WAITOK | M_ZERO); if (p9attr == NULL) return (ENOMEM); /* Check if we need to change the ownership of the file*/ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { P9_DEBUG(VOPS, "%s: vp:%p td:%p uid/gid %x/%x\n", __func__, vp, td, vap->va_uid, vap->va_gid); error = p9fs_chown(vp, vap->va_uid, vap->va_gid, cred, td); p9attr->valid |= P9PROTO_SETATTR_UID | P9PROTO_SETATTR_GID | P9PROTO_SETATTR_MODE; if (error) goto out; } /* Check for mode changes */ if (vap->va_mode != (mode_t)VNOVAL) { P9_DEBUG(VOPS, "%s: vp:%p td:%p mode %x\n", __func__, vp, td, vap->va_mode); error = p9fs_chmod(vp, (int)vap->va_mode, cred, td); p9attr->valid |= P9PROTO_SETATTR_MODE; if (error) goto out; } /* Update the size of the file and update mtime */ if (vap->va_size != (uint64_t)VNOVAL) { P9_DEBUG(VOPS, "%s: vp:%p td:%p size:%jx\n", __func__, vp, td, (uintmax_t)vap->va_size); switch (vp->v_type) { case VDIR: error = EISDIR; goto out; case VLNK: case VREG: /* Invalidate cached pages of vp */ error = vinvalbuf(vp, 0, 0, 0); if (error) goto out; oldfilesize = inode->i_size; inode->i_size = vap->va_size; /* Update the p9fs_inode time */ p9fs_itimes(vp); p9attr->valid |= P9PROTO_SETATTR_SIZE | P9PROTO_SETATTR_ATIME | P9PROTO_SETATTR_MTIME | P9PROTO_SETATTR_ATIME_SET | P9PROTO_SETATTR_MTIME_SET ; break; default: goto out; } } else if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { P9_DEBUG(VOPS, "%s: vp:%p td:%p time a/m %jx/%jx/\n", __func__, vp, td, (uintmax_t)vap->va_atime.tv_sec, (uintmax_t)vap->va_mtime.tv_sec); /* Update the p9fs_inode times */ p9fs_itimes(vp); p9attr->valid |= P9PROTO_SETATTR_ATIME | P9PROTO_SETATTR_MTIME | P9PROTO_SETATTR_ATIME_SET | P9PROTO_SETATTR_MTIME_SET; } vfid = p9fs_get_fid(vses->clnt, node, cred, VOFID, P9PROTO_OWRITE, &error); if (vfid == NULL) { vfid = p9fs_get_fid(vses->clnt, node, cred, VFID, -1, &error); if (error) goto out; } /* Write the inode structure values into p9attr */ p9fs_inode_to_iattr(inode, p9attr); error = p9_client_setattr(vfid, p9attr); if (vap->va_size != (uint64_t)VNOVAL && vp->v_type == VREG) { if (error) inode->i_size = oldfilesize; else vnode_pager_setsize(vp, inode->i_size); } out: if (p9attr) { uma_zfree(p9fs_setattr_zone, p9attr); } P9_DEBUG(VOPS, "%s: error: %d\n", __func__, error); return (error); } struct open_fid_state { struct p9_fid *vofid; int fflags; int opened; }; /* * TODO: change this to take P9PROTO_* mode and avoid routing through * VOP_OPEN, factoring out implementation of p9fs_open. */ static int p9fs_get_open_fid(struct vnode *vp, int fflags, struct ucred *cr, struct open_fid_state *statep) { struct p9fs_node *np; struct p9fs_session *vses; struct p9_fid *vofid; int mode = p9fs_uflags_mode(fflags, TRUE); int error = 0; statep->opened = FALSE; np = P9FS_VTON(vp); vses = np->p9fs_ses; vofid = p9fs_get_fid(vses->clnt, np, cr, VOFID, mode, &error); if (vofid == NULL) { error = VOP_OPEN(vp, fflags, cr, curthread, NULL); if (error) { return (error); } vofid = p9fs_get_fid(vses->clnt, np, cr, VOFID, mode, &error); if (vofid == NULL) { return (EBADF); } statep->fflags = fflags; statep->opened = TRUE; } statep->vofid = vofid; return (0); } static void p9fs_release_open_fid(struct vnode *vp, struct ucred *cr, struct open_fid_state *statep) { if (statep->opened) { (void) VOP_CLOSE(vp, statep->fflags, cr, curthread); } } /* * An I/O buffer is used to to do any transfer. The uio is the vfs structure we * need to copy data into. As long as resid is greater than zero, we call * client_read to read data from offset(offset into the file) in the open fid * for the file into the I/O buffer. The data is read into the user data buffer. */ static int p9fs_read(struct vop_read_args *ap) { struct vnode *vp; struct uio *uio; struct p9fs_node *np; uint64_t offset; int64_t ret; uint64_t resid; uint32_t count; int error; char *io_buffer = NULL; uint64_t filesize; struct open_fid_state ostate; vp = ap->a_vp; uio = ap->a_uio; np = P9FS_VTON(vp); error = 0; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); if (vp->v_type != VREG) return (EISDIR); if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); error = p9fs_get_open_fid(vp, FREAD, ap->a_cred, &ostate); if (error) return (error); /* where in the file are we to start reading */ offset = uio->uio_offset; filesize = np->inode.i_size; if (uio->uio_offset >= filesize) goto out; P9_DEBUG(VOPS, "%s: called %jd at %ju\n", __func__, (intmax_t)uio->uio_resid, (uintmax_t)uio->uio_offset); /* Work with a local buffer from the pool for this vop */ io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK | M_ZERO); while ((resid = uio->uio_resid) > 0) { if (offset >= filesize) break; count = MIN(filesize - uio->uio_offset , resid); if (count == 0) break; /* Copy count bytes into the uio */ ret = p9_client_read(ostate.vofid, offset, count, io_buffer); /* * This is the only place in the entire p9fs where we check the * error for < 0 as p9_client_read/write return the number of * bytes instead of an error code. In this case if ret is < 0, * it means there is an IO error. */ if (ret < 0) { error = -ret; goto out; } error = uiomove(io_buffer, ret, uio); if (error != 0) goto out; offset += ret; } uio->uio_offset = offset; out: uma_zfree(p9fs_io_buffer_zone, io_buffer); p9fs_release_open_fid(vp, ap->a_cred, &ostate); return (error); } /* * The user buffer contains the data to be written. This data is copied first * from uio into I/O buffer. This I/O buffer is used to do the client_write to * the fid of the file starting from the offset given upto count bytes. The * number of bytes written is returned to the caller. */ static int p9fs_write(struct vop_write_args *ap) { struct vnode *vp; struct uio *uio; struct p9fs_node *np; uint64_t off, offset; int64_t ret; uint64_t resid, bytes_written; uint32_t count; int error, ioflag; uint64_t file_size; char *io_buffer = NULL; struct open_fid_state ostate; vp = ap->a_vp; uio = ap->a_uio; np = P9FS_VTON(vp); error = 0; ioflag = ap->a_ioflag; error = p9fs_get_open_fid(vp, FWRITE, ap->a_cred, &ostate); if (error) return (error); P9_DEBUG(VOPS, "%s: %#zx at %#jx\n", __func__, uio->uio_resid, (uintmax_t)uio->uio_offset); if (uio->uio_offset < 0) { error = EINVAL; goto out; } if (uio->uio_resid == 0) goto out; file_size = np->inode.i_size; switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = file_size; break; case VDIR: return (EISDIR); case VLNK: break; default: panic("%s: bad file type vp: %p", __func__, vp); } resid = uio->uio_resid; offset = uio->uio_offset; bytes_written = 0; error = 0; io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK | M_ZERO); while ((resid = uio->uio_resid) > 0) { off = 0; count = MIN(resid, P9FS_IOUNIT); error = uiomove(io_buffer, count, uio); if (error != 0) { P9_DEBUG(ERROR, "%s: uiomove failed: %d\n", __func__, error); goto out; } /* While count still exists, keep writing.*/ while (count > 0) { /* Copy count bytes from the uio */ ret = p9_client_write(ostate.vofid, offset, count, io_buffer + off); if (ret < 0) { if (bytes_written == 0) { error = -ret; goto out; } else { break; } } P9_DEBUG(VOPS, "%s: write %#zx at %#jx\n", __func__, uio->uio_resid, (uintmax_t)uio->uio_offset); off += ret; offset += ret; bytes_written += ret; count -= ret; } } /* Update the fields in the node to reflect the change*/ if (file_size < uio->uio_offset + uio->uio_resid) { np->inode.i_size = uio->uio_offset + uio->uio_resid; vnode_pager_setsize(vp, uio->uio_offset + uio->uio_resid); } out: if (io_buffer) uma_zfree(p9fs_io_buffer_zone, io_buffer); p9fs_release_open_fid(vp, ap->a_cred, &ostate); return (error); } /* * Common handler of all removal-related VOPs (e.g. rmdir, rm). Perform the * client_remove op to send messages to remove the node's fid on the server. * After that, does a node metadata cleanup on client side. */ static int remove_common(struct p9fs_node *dnp, struct p9fs_node *np, const char *name, struct ucred *cred) { int error; struct p9fs_session *vses; struct vnode *vp; struct p9_fid *vfid; error = 0; vses = np->p9fs_ses; vp = P9FS_NTOV(np); vfid = p9fs_get_fid(vses->clnt, dnp, cred, VFID, -1, &error); if (error != 0) return (error); error = p9_client_unlink(vfid, name, np->v_node->v_type == VDIR ? P9PROTO_UNLINKAT_REMOVEDIR : 0); if (error != 0) return (error); /* Remove all non-open fids associated with the vp */ if (np->inode.i_links_count == 1) p9fs_fid_remove_all(np, TRUE); /* Invalidate all entries of vnode from name cache and hash list. */ cache_purge(vp); vfs_hash_remove(vp); np->flags |= P9FS_NODE_DELETED; return (error); } /* Remove vop for all files. Call common code for remove and adjust links */ static int p9fs_remove(struct vop_remove_args *ap) { struct vnode *vp; struct p9fs_node *np; struct vnode *dvp; struct p9fs_node *dnp; struct p9fs_inode *dinode; struct componentname *cnp; int error; cnp = ap->a_cnp; vp = ap->a_vp; np = P9FS_VTON(vp); dvp = ap->a_dvp; dnp = P9FS_VTON(dvp); dinode = &dnp->inode; P9_DEBUG(VOPS, "%s: vp %p node %p \n", __func__, vp, np); if (vp->v_type == VDIR) return (EISDIR); error = remove_common(dnp, np, cnp->cn_nameptr, cnp->cn_cred); if (error == 0) P9FS_DECR_LINKS(dinode); return (error); } /* Remove vop for all directories. Call common code for remove and adjust links */ static int p9fs_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp; struct p9fs_node *np; struct vnode *dvp; struct p9fs_node *dnp; struct p9fs_inode *dinode; struct componentname *cnp; int error; cnp = ap->a_cnp; vp = ap->a_vp; np = P9FS_VTON(vp); dvp = ap->a_dvp; dnp = P9FS_VTON(dvp); dinode = &dnp->inode; P9_DEBUG(VOPS, "%s: vp %p node %p \n", __func__, vp, np); error = remove_common(dnp, np, cnp->cn_nameptr, cnp->cn_cred); if (error == 0) P9FS_DECR_LINKS(dinode); return (error); } /* * Create symlinks. Make the permissions and call create_common code * for Soft links. */ static int p9fs_symlink(struct vop_symlink_args *ap) { struct vnode *dvp; struct vnode **vpp; struct vattr *vap; struct componentname *cnp; char *symtgt; struct p9fs_node *dnp; struct p9fs_session *vses; struct mount *mp; struct p9_fid *dvfid, *newfid; int error; char tmpchr; gid_t gid; dvp = ap->a_dvp; vpp = ap->a_vpp; vap = ap->a_vap; cnp = ap->a_cnp; symtgt = (char*)(uintptr_t) ap->a_target; dnp = P9FS_VTON(dvp); vses = dnp->p9fs_ses; mp = vses->p9fs_mount; newfid = NULL; error = 0; gid = vap->va_gid; P9_DEBUG(VOPS, "%s: dvp %p\n", __func__, dvp); /* * Save the character present at namelen in nameptr string and * null terminate the character to get the search name for p9_dir_walk */ tmpchr = cnp->cn_nameptr[cnp->cn_namelen]; cnp->cn_nameptr[cnp->cn_namelen] = '\0'; dvfid = p9fs_get_fid(vses->clnt, dnp, cnp->cn_cred, VFID, -1, &error); if (error != 0) goto out; error = p9_create_symlink(dvfid, cnp->cn_nameptr, symtgt, gid); if (error != 0) goto out; /*create vnode for symtgt */ newfid = p9_client_walk(dvfid, 1, &cnp->cn_nameptr, 1, &error); if (newfid != NULL) { error = p9fs_vget_common(mp, NULL, cnp->cn_lkflags, dnp, newfid, vpp, cnp->cn_nameptr); if (error != 0) goto out; } else goto out; if ((cnp->cn_flags & MAKEENTRY) != 0) { cache_enter(P9FS_NTOV(dnp), *vpp, cnp); } P9_DEBUG(VOPS, "%s: created file under vp %p node %p fid %ju\n", __func__, *vpp, dnp, (uintmax_t)dvfid->fid); cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; return (error); out: if (newfid != NULL) p9_client_clunk(newfid); cnp->cn_nameptr[cnp->cn_namelen] = tmpchr; return (error); } /* Create hard link */ static int p9fs_link(struct vop_link_args *ap) { struct vnode *vp; struct vnode *tdvp; struct componentname *cnp; struct p9fs_node *dnp; struct p9fs_node *np; struct p9fs_inode *inode; struct p9fs_session *vses; struct p9_fid *dvfid, *oldvfid; int error; vp = ap->a_vp; tdvp = ap->a_tdvp; cnp = ap->a_cnp; dnp = P9FS_VTON(tdvp); np = P9FS_VTON(vp); inode = &np->inode; vses = np->p9fs_ses; error = 0; P9_DEBUG(VOPS, "%s: tdvp %p vp %p\n", __func__, tdvp, vp); dvfid = p9fs_get_fid(vses->clnt, dnp, cnp->cn_cred, VFID, -1, &error); if (error != 0) return (error); oldvfid = p9fs_get_fid(vses->clnt, np, cnp->cn_cred, VFID, -1, &error); if (error != 0) return (error); error = p9_create_hardlink(dvfid, oldvfid, cnp->cn_nameptr); if (error != 0) return (error); /* Increment ref count on the inode */ P9FS_INCR_LINKS(inode); return (0); } /* Read contents of the symbolic link */ static int p9fs_readlink(struct vop_readlink_args *ap) { struct vnode *vp; struct uio *uio; struct p9fs_node *dnp; struct p9fs_session *vses; struct p9_fid *dvfid; int error, len; char *target; vp = ap->a_vp; uio = ap->a_uio; dnp = P9FS_VTON(vp); vses = dnp->p9fs_ses; error = 0; P9_DEBUG(VOPS, "%s: vp %p\n", __func__, vp); dvfid = p9fs_get_fid(vses->clnt, dnp, ap->a_cred, VFID, -1, &error); if (error != 0) return (error); error = p9_readlink(dvfid, &target); if (error != 0) return (error); len = strlen(target); error = uiomove(target, len, uio); return (0); } /* * Iterate through a directory. An entire 8k data is read into the I/O buffer. * This buffer is parsed to make dir entries and fed to the user buffer to * complete it to the VFS. */ static int p9fs_readdir(struct vop_readdir_args *ap) { struct uio *uio; struct vnode *vp; struct dirent cde; int64_t offset; uint64_t diroffset; struct p9fs_node *np; int error; int32_t count; struct p9_client *clnt; struct p9_dirent dent; char *io_buffer; struct p9_fid *vofid; uio = ap->a_uio; vp = ap->a_vp; np = P9FS_VTON(ap->a_vp); offset = 0; diroffset = 0; error = 0; count = 0; clnt = np->p9fs_ses->clnt; P9_DEBUG(VOPS, "%s: vp %p, offset %jd, resid %zd\n", __func__, vp, (intmax_t) uio->uio_offset, uio->uio_resid); if (ap->a_uio->uio_iov->iov_len <= 0) return (EINVAL); if (vp->v_type != VDIR) return (ENOTDIR); vofid = p9fs_get_fid(clnt, np, ap->a_cred, VOFID, P9PROTO_OREAD, &error); if (vofid == NULL) { P9_DEBUG(ERROR, "%s: NULL FID\n", __func__); return (EBADF); } io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK); /* We haven't reached the end yet. read more. */ diroffset = uio->uio_offset; while (uio->uio_resid >= sizeof(struct dirent)) { /* * We need to read more data as what is indicated by filesize because * filesize is based on data stored in struct dirent structure but * we read data in struct p9_dirent format which has different size. * Hence we read max data(P9FS_IOUNIT) everytime from host, convert * it into struct dirent structure and send it back. */ count = P9FS_IOUNIT; bzero(io_buffer, P9FS_MTU); count = p9_client_readdir(vofid, (char *)io_buffer, diroffset, count); if (count == 0) break; if (count < 0) { error = EIO; goto out; } offset = 0; while (offset + QEMU_DIRENTRY_SZ <= count) { /* * Read and make sense out of the buffer in one dirent * This is part of 9p protocol read. This reads one p9_dirent, * appends it to dirent(FREEBSD specifc) and continues to parse the buffer. */ bzero(&dent, sizeof(dent)); offset = p9_dirent_read(clnt, io_buffer, offset, count, &dent); if (offset < 0 || offset > count) { error = EIO; goto out; } bzero(&cde, sizeof(cde)); strncpy(cde.d_name, dent.d_name, dent.len); cde.d_fileno = dent.qid.path; cde.d_type = dent.d_type; cde.d_namlen = dent.len; cde.d_reclen = GENERIC_DIRSIZ(&cde); /* * If there isn't enough space in the uio to return a * whole dirent, break off read */ if (uio->uio_resid < GENERIC_DIRSIZ(&cde)) break; /* Transfer */ error = uiomove(&cde, GENERIC_DIRSIZ(&cde), uio); if (error != 0) { error = EIO; goto out; } diroffset = dent.d_off; } } /* Pass on last transferred offset */ uio->uio_offset = diroffset; out: uma_zfree(p9fs_io_buffer_zone, io_buffer); return (error); } static void p9fs_doio(struct vnode *vp, struct buf *bp, struct p9_fid *vofid, struct ucred *cr) { struct uio *uiov; struct iovec io; int error; uint64_t off, offset; uint64_t filesize; uint64_t resid; uint32_t count; int64_t ret; struct p9fs_node *np; char *io_buffer; error = 0; np = P9FS_VTON(vp); filesize = np->inode.i_size; uiov = malloc(sizeof(struct uio), M_P9UIOV, M_WAITOK); uiov->uio_iov = &io; uiov->uio_iovcnt = 1; uiov->uio_segflg = UIO_SYSSPACE; io_buffer = uma_zalloc(p9fs_io_buffer_zone, M_WAITOK | M_ZERO); if (bp->b_iocmd == BIO_READ) { io.iov_len = uiov->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiov->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: { uiov->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; if (uiov->uio_resid) { int left = uiov->uio_resid; int nread = bp->b_bcount - left; if (left > 0) bzero((char *)bp->b_data + nread, left); } /* where in the file are we to start reading */ offset = uiov->uio_offset; if (uiov->uio_offset >= filesize) goto out; while ((resid = uiov->uio_resid) > 0) { if (offset >= filesize) break; count = min(filesize - uiov->uio_offset, resid); if (count == 0) break; P9_DEBUG(VOPS, "%s: read called %#zx at %#jx\n", __func__, uiov->uio_resid, (uintmax_t)uiov->uio_offset); /* Copy count bytes into the uio */ ret = p9_client_read(vofid, offset, count, io_buffer); error = uiomove(io_buffer, ret, uiov); if (error != 0) goto out; offset += ret; } break; } default: printf("vfs: type %x unexpected\n", vp->v_type); break; } } else { if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiov->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiov->uio_offset = ((off_t)bp->b_blkno) * PAGE_SIZE + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiov->uio_rw = UIO_WRITE; if (uiov->uio_offset < 0) { error = EINVAL; goto out; } if (uiov->uio_resid == 0) goto out; resid = uiov->uio_resid; offset = uiov->uio_offset; error = 0; while ((resid = uiov->uio_resid) > 0) { off = 0; count = MIN(resid, P9FS_IOUNIT); error = uiomove(io_buffer, count, uiov); if (error != 0) { goto out; } while (count > 0) { /* Copy count bytes from the uio */ ret = p9_client_write(vofid, offset, count, io_buffer + off); if (ret < 0) goto out; P9_DEBUG(VOPS, "%s: write called %#zx at %#jx\n", __func__, uiov->uio_resid, (uintmax_t)uiov->uio_offset); off += ret; offset += ret; count -= ret; } } /* Update the fields in the node to reflect the change */ if (filesize < uiov->uio_offset + uiov->uio_resid) { np->inode.i_size = uiov->uio_offset + uiov->uio_resid; vnode_pager_setsize(vp, uiov->uio_offset + uiov->uio_resid); /* update the modified timers. */ p9fs_itimes(vp); } } else { bp->b_resid = 0; goto out1; } } out: /* Set the error */ if (error != 0) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; } bp->b_resid = uiov->uio_resid; out1: bufdone(bp); uma_zfree(p9fs_io_buffer_zone, io_buffer); free(uiov, M_P9UIOV); } /* * The I/O buffer is mapped to a uio and a client_write/client_read is performed * the same way as p9fs_read and p9fs_write. */ static int p9fs_strategy(struct vop_strategy_args *ap) { struct vnode *vp; struct buf *bp; struct ucred *cr; int error; struct open_fid_state ostate; vp = ap->a_vp; bp = ap->a_bp; error = 0; P9_DEBUG(VOPS, "%s: vp %p, iocmd %d\n ", __func__, vp, bp->b_iocmd); if (bp->b_iocmd == BIO_READ) cr = bp->b_rcred; else cr = bp->b_wcred; error = p9fs_get_open_fid(vp, bp->b_iocmd == BIO_READ ? FREAD : FWRITE, cr, &ostate); if (error) { P9_DEBUG(ERROR, "%s: p9fs_get_open_fid failed: %d\n", __func__, error); bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } p9fs_doio(vp, bp, ostate.vofid, cr); p9fs_release_open_fid(vp, cr, &ostate); return (0); } /* Rename a file */ static int p9fs_rename(struct vop_rename_args *ap) { struct vnode *tvp; struct vnode *tdvp; struct vnode *fvp; struct vnode *fdvp; struct componentname *tcnp; struct componentname *fcnp; struct p9fs_node *tdnode; struct p9fs_node *fdnode; struct p9fs_inode *fdinode; struct p9fs_node *fnode; struct p9fs_inode *finode; struct p9fs_session *vses; struct p9fs_node *tnode; struct p9fs_inode *tinode; struct p9_fid *olddirvfid, *newdirvfid ; int error; tvp = ap->a_tvp; tdvp = ap->a_tdvp; fvp = ap->a_fvp; fdvp = ap->a_fdvp; tcnp = ap->a_tcnp; fcnp = ap->a_fcnp; tdnode = P9FS_VTON(tdvp); fdnode = P9FS_VTON(fdvp); fdinode = &fdnode->inode; fnode = P9FS_VTON(fvp); finode = &fnode->inode; vses = fnode->p9fs_ses; error = 0; P9_DEBUG(VOPS, "%s: tvp %p, tdvp %p, fvp %p, fdvp %p\n ", __func__, tvp, tdvp, fvp, fdvp); /* Check for cross mount operation */ if (fvp->v_mount != tdvp->v_mount || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } /* warning if you are renaming to the same name */ if (fvp == tvp) error = 0; olddirvfid = p9fs_get_fid(vses->clnt, fdnode, fcnp->cn_cred, VFID, -1, &error); if (error != 0) goto out; newdirvfid = p9fs_get_fid(vses->clnt, tdnode, tcnp->cn_cred, VFID, -1, &error); if (error != 0) goto out; error = p9_client_renameat(olddirvfid, fcnp->cn_nameptr, newdirvfid, tcnp->cn_nameptr); if (error != 0) goto out; /* * decrement the link count on the "from" file whose name is going * to be changed if its a directory */ if (fvp->v_type == VDIR) { if (tvp && tvp->v_type == VDIR) cache_purge(tdvp); P9FS_DECR_LINKS(fdinode); cache_purge(fdvp); } /* Taking exclusive lock on the from node before decrementing the link count */ if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto out; P9FS_DECR_LINKS(finode); VOP_UNLOCK(fvp); if (tvp) { tnode = P9FS_VTON(tvp); tinode = &tnode->inode; P9FS_DECR_LINKS(tinode); } out: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } /* * Put VM pages, synchronously. * XXX: like smbfs, cannot use vop_stdputpages due to mapping requirement */ static int p9fs_putpages(struct vop_putpages_args *ap) { struct uio uio; struct iovec iov; int i, error, npages, count; off_t offset; int *rtvals; struct vnode *vp; struct thread *td; struct ucred *cred; struct p9fs_node *np; vm_page_t *pages; vm_offset_t kva; struct buf *bp; vp = ap->a_vp; np = P9FS_VTON(vp); td = curthread; cred = curthread->td_ucred; pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); offset = IDX_TO_OFF(pages[0]->pindex); /* * When putting pages, do not extend file past EOF. */ if (offset + count > np->inode.i_size) { count = np->inode.i_size - offset; if (count < 0) count = 0; } for (i = 0; i < npages; i++) rtvals[i] = VM_PAGER_ERROR; bp = uma_zalloc(p9fs_pbuf_zone, M_WAITOK); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, count); iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; P9_DEBUG(VOPS, "of=%jd,resid=%zd\n", (intmax_t)uio.uio_offset, uio.uio_resid); error = VOP_WRITE(vp, &uio, vnode_pager_putpages_ioflags(ap->a_sync), cred); pmap_qremove(kva, npages); uma_zfree(p9fs_pbuf_zone, bp); if (error == 0) vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid, np->inode.i_size - offset, npages * PAGE_SIZE); return (rtvals[0]); } struct vop_vector p9fs_vnops = { .vop_default = &default_vnodeops, .vop_lookup = p9fs_lookup, .vop_open = p9fs_open, .vop_close = p9fs_close, .vop_access = p9fs_access, .vop_getattr = p9fs_getattr_dotl, .vop_setattr = p9fs_setattr_dotl, .vop_reclaim = p9fs_reclaim, .vop_inactive = p9fs_inactive, .vop_readdir = p9fs_readdir, .vop_create = p9fs_create, .vop_mknod = p9fs_mknod, .vop_read = p9fs_read, .vop_write = p9fs_write, .vop_remove = p9fs_remove, .vop_mkdir = p9fs_mkdir, .vop_rmdir = p9fs_rmdir, .vop_strategy = p9fs_strategy, .vop_symlink = p9fs_symlink, .vop_rename = p9fs_rename, .vop_link = p9fs_link, .vop_readlink = p9fs_readlink, .vop_putpages = p9fs_putpages, }; VFS_VOP_VECTOR_REGISTER(p9fs_vnops); diff --git a/sys/fs/smbfs/smbfs_vnops.c b/sys/fs/smbfs/smbfs_vnops.c index 1e7dcafb1121..c30995508c00 100644 --- a/sys/fs/smbfs/smbfs_vnops.c +++ b/sys/fs/smbfs/smbfs_vnops.c @@ -1,1248 +1,1248 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Prototypes for SMBFS vnode operations */ static vop_create_t smbfs_create; static vop_mknod_t smbfs_mknod; static vop_open_t smbfs_open; static vop_close_t smbfs_close; static vop_access_t smbfs_access; static vop_getattr_t smbfs_getattr; static vop_setattr_t smbfs_setattr; static vop_read_t smbfs_read; static vop_write_t smbfs_write; static vop_fsync_t smbfs_fsync; static vop_remove_t smbfs_remove; static vop_link_t smbfs_link; static vop_lookup_t smbfs_lookup; static vop_rename_t smbfs_rename; static vop_mkdir_t smbfs_mkdir; static vop_rmdir_t smbfs_rmdir; static vop_symlink_t smbfs_symlink; static vop_readdir_t smbfs_readdir; static vop_strategy_t smbfs_strategy; static vop_print_t smbfs_print; static vop_pathconf_t smbfs_pathconf; static vop_advlock_t smbfs_advlock; static vop_getextattr_t smbfs_getextattr; struct vop_vector smbfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = smbfs_access, .vop_advlock = smbfs_advlock, .vop_close = smbfs_close, .vop_create = smbfs_create, .vop_fsync = smbfs_fsync, .vop_getattr = smbfs_getattr, .vop_getextattr = smbfs_getextattr, .vop_getpages = smbfs_getpages, .vop_inactive = smbfs_inactive, .vop_ioctl = smbfs_ioctl, .vop_link = smbfs_link, .vop_lookup = smbfs_lookup, .vop_mkdir = smbfs_mkdir, .vop_mknod = smbfs_mknod, .vop_open = smbfs_open, .vop_pathconf = smbfs_pathconf, .vop_print = smbfs_print, .vop_putpages = smbfs_putpages, .vop_read = smbfs_read, .vop_readdir = smbfs_readdir, .vop_reclaim = smbfs_reclaim, .vop_remove = smbfs_remove, .vop_rename = smbfs_rename, .vop_rmdir = smbfs_rmdir, .vop_setattr = smbfs_setattr, /* .vop_setextattr = smbfs_setextattr,*/ .vop_strategy = smbfs_strategy, .vop_symlink = smbfs_symlink, .vop_write = smbfs_write, }; VFS_VOP_VECTOR_REGISTER(smbfs_vnodeops); static int smbfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; accmode_t accmode = ap->a_accmode; mode_t mpmode; struct smbmount *smp = VTOSMBFS(vp); SMBVDEBUG("\n"); if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: return EROFS; default: break; } } mpmode = vp->v_type == VREG ? smp->sm_file_mode : smp->sm_dir_mode; return (vaccess(vp->v_type, mpmode, smp->sm_uid, smp->sm_gid, ap->a_accmode, ap->a_cred)); } /* ARGSUSED */ static int smbfs_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct smb_cred *scred; struct vattr vattr; int mode = ap->a_mode; int error, accmode; SMBVDEBUG("%s,%d\n", np->n_name, (np->n_flag & NOPEN) != 0); if (vp->v_type != VREG && vp->v_type != VDIR) { SMBFSERR("open eacces vtype=%d\n", vp->v_type); return EACCES; } if (vp->v_type == VDIR) { np->n_flag |= NOPEN; return 0; } if (np->n_flag & NMODIFIED) { if ((error = smbfs_vinvalbuf(vp, ap->a_td)) == EINTR) return error; smbfs_attr_cacheremove(vp); error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return error; if (np->n_mtime.tv_sec != vattr.va_mtime.tv_sec) { error = smbfs_vinvalbuf(vp, ap->a_td); if (error == EINTR) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } } if ((np->n_flag & NOPEN) != 0) return 0; /* * Use DENYNONE to give unixy semantics of permitting * everything not forbidden by permissions. Ie denial * is up to server with clients/openers needing to use * advisory locks for further control. */ accmode = SMB_SM_DENYNONE|SMB_AM_OPENREAD; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) accmode = SMB_SM_DENYNONE|SMB_AM_OPENRW; scred = smbfs_malloc_scred(); smb_makescred(scred, ap->a_td, ap->a_cred); error = smbfs_smb_open(np, accmode, scred); if (error) { if (mode & FWRITE) return EACCES; else if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { accmode = SMB_SM_DENYNONE|SMB_AM_OPENREAD; error = smbfs_smb_open(np, accmode, scred); } } if (error == 0) { np->n_flag |= NOPEN; vnode_create_vobject(ap->a_vp, vattr.va_size, ap->a_td); } smbfs_attr_cacheremove(vp); smbfs_free_scred(scred); return error; } static int smbfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct smbnode *np = VTOSMB(vp); struct smb_cred *scred; if (vp->v_type == VDIR && (np->n_flag & NOPEN) != 0 && np->n_dirseq != NULL) { scred = smbfs_malloc_scred(); smb_makescred(scred, td, ap->a_cred); smbfs_findclose(np->n_dirseq, scred); smbfs_free_scred(scred); np->n_dirseq = NULL; } return 0; } /* * smbfs_getattr call from vfs. */ static int smbfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct vattr *va=ap->a_vap; struct smbfattr fattr; struct smb_cred *scred; u_quad_t oldsize; int error; SMBVDEBUG("%lx: '%s' %d\n", (long)vp, np->n_name, (vp->v_vflag & VV_ROOT) != 0); error = smbfs_attr_cachelookup(vp, va); if (!error) return 0; SMBVDEBUG("not in the cache\n"); scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, ap->a_cred); oldsize = np->n_size; error = smbfs_smb_lookup(np, NULL, 0, &fattr, scred); if (error) { SMBVDEBUG("error %d\n", error); smbfs_free_scred(scred); return error; } smbfs_attr_cacheenter(vp, &fattr); smbfs_attr_cachelookup(vp, va); if (np->n_flag & NOPEN) np->n_size = oldsize; smbfs_free_scred(scred); return 0; } static int smbfs_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct vattr *vap = ap->a_vap; struct timespec *mtime, *atime; struct smb_cred *scred; struct smb_share *ssp = np->n_mount->sm_share; struct smb_vc *vcp = SSTOVC(ssp); struct thread *td = curthread; u_quad_t tsize = 0; int isreadonly, doclose, error = 0; int old_n_dosattr; SMBVDEBUG("\n"); isreadonly = (vp->v_mount->mnt_flag & MNT_RDONLY); /* * Disallow write attempts if the filesystem is mounted read-only. */ if ((vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL || vap->va_flags != VNOVAL) && isreadonly) return EROFS; /* * We only support setting four flags. Don't allow setting others. * * We map UF_READONLY to SMB_FA_RDONLY, unlike the MacOS X version * of this code, which maps both UF_IMMUTABLE AND SF_IMMUTABLE to * SMB_FA_RDONLY. The immutable flags have different semantics * than readonly, which is the reason for the difference. */ if (vap->va_flags != VNOVAL) { if (vap->va_flags & ~(UF_HIDDEN|UF_SYSTEM|UF_ARCHIVE| UF_READONLY)) return EINVAL; } scred = smbfs_malloc_scred(); smb_makescred(scred, td, ap->a_cred); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: error = EISDIR; goto out; case VREG: break; default: error = EINVAL; goto out; } if (isreadonly) { error = EROFS; goto out; } doclose = 0; vnode_pager_setsize(vp, (u_long)vap->va_size); tsize = np->n_size; np->n_size = vap->va_size; if ((np->n_flag & NOPEN) == 0) { error = smbfs_smb_open(np, SMB_SM_DENYNONE|SMB_AM_OPENRW, scred); if (error == 0) doclose = 1; } if (error == 0) error = smbfs_smb_setfsize(np, (int64_t)vap->va_size, scred); if (doclose) smbfs_smb_close(ssp, np->n_fid, NULL, scred); if (error) { np->n_size = tsize; vnode_pager_setsize(vp, (u_long)tsize); goto out; } } if ((vap->va_flags != VNOVAL) || (vap->va_mode != (mode_t)VNOVAL)) { old_n_dosattr = np->n_dosattr; if (vap->va_mode != (mode_t)VNOVAL) { if (vap->va_mode & S_IWUSR) np->n_dosattr &= ~SMB_FA_RDONLY; else np->n_dosattr |= SMB_FA_RDONLY; } if (vap->va_flags != VNOVAL) { if (vap->va_flags & UF_HIDDEN) np->n_dosattr |= SMB_FA_HIDDEN; else np->n_dosattr &= ~SMB_FA_HIDDEN; if (vap->va_flags & UF_SYSTEM) np->n_dosattr |= SMB_FA_SYSTEM; else np->n_dosattr &= ~SMB_FA_SYSTEM; if (vap->va_flags & UF_ARCHIVE) np->n_dosattr |= SMB_FA_ARCHIVE; else np->n_dosattr &= ~SMB_FA_ARCHIVE; /* * We only support setting the immutable / readonly * bit for regular files. According to comments in * the MacOS X version of this code, supporting the * readonly bit on directories doesn't do the same * thing in Windows as in Unix. */ if (vp->v_type == VREG) { if (vap->va_flags & UF_READONLY) np->n_dosattr |= SMB_FA_RDONLY; else np->n_dosattr &= ~SMB_FA_RDONLY; } } if (np->n_dosattr != old_n_dosattr) { error = smbfs_smb_setpattr(np, np->n_dosattr, NULL, scred); if (error) goto out; } } mtime = atime = NULL; if (vap->va_mtime.tv_sec != VNOVAL) mtime = &vap->va_mtime; if (vap->va_atime.tv_sec != VNOVAL) atime = &vap->va_atime; if (mtime != atime) { if (vap->va_vaflags & VA_UTIMES_NULL) { error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td); if (error) error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td); } else error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td); #if 0 if (mtime == NULL) mtime = &np->n_mtime; if (atime == NULL) atime = &np->n_atime; #endif /* * If file is opened, then we can use handle based calls. * If not, use path based ones. */ if ((np->n_flag & NOPEN) == 0) { if (vcp->vc_flags & SMBV_WIN95) { error = VOP_OPEN(vp, FWRITE, ap->a_cred, td, NULL); if (!error) { /* error = smbfs_smb_setfattrNT(np, 0, mtime, atime, scred); VOP_GETATTR(vp, &vattr, ap->a_cred); */ if (mtime) np->n_mtime = *mtime; VOP_CLOSE(vp, FWRITE, ap->a_cred, td); } } else if ((vcp->vc_sopt.sv_caps & SMB_CAP_NT_SMBS)) { error = smbfs_smb_setptime2(np, mtime, atime, 0, scred); /* error = smbfs_smb_setpattrNT(np, 0, mtime, atime, scred);*/ } else if (SMB_DIALECT(vcp) >= SMB_DIALECT_LANMAN2_0) { error = smbfs_smb_setptime2(np, mtime, atime, 0, scred); } else { error = smbfs_smb_setpattr(np, 0, mtime, scred); } } else { if (vcp->vc_sopt.sv_caps & SMB_CAP_NT_SMBS) { error = smbfs_smb_setfattrNT(np, 0, mtime, atime, scred); } else if (SMB_DIALECT(vcp) >= SMB_DIALECT_LANMAN1_0) { error = smbfs_smb_setftime(np, mtime, atime, scred); } else { /* * I have no idea how to handle this for core * level servers. The possible solution is to * update mtime after file is closed. */ SMBERROR("can't update times on an opened file\n"); } } } /* * Invalidate attribute cache in case if server doesn't set * required attributes. */ smbfs_attr_cacheremove(vp); /* invalidate cache */ VOP_GETATTR(vp, vap, ap->a_cred); np->n_mtime.tv_sec = vap->va_mtime.tv_sec; out: smbfs_free_scred(scred); return error; } /* * smbfs_read call. */ static int smbfs_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; SMBVDEBUG("\n"); if (vp->v_type != VREG && vp->v_type != VDIR) return EPERM; return smbfs_readvnode(vp, uio, ap->a_cred); } static int smbfs_write(struct vop_write_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; SMBVDEBUG("%d,ofs=%jd,sz=%zd\n",vp->v_type, (intmax_t)uio->uio_offset, uio->uio_resid); if (vp->v_type != VREG) return (EPERM); return smbfs_writevnode(vp, uio, ap->a_cred,ap->a_ioflag); } /* * smbfs_create call * Create a regular file. On entry the directory to contain the file being * created is locked. We must release before we return. We must also free * the pathname buffer pointed at by cnp->cn_pnbuf, always on error. */ static int smbfs_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct vnode **vpp=ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct smbnode *dnp = VTOSMB(dvp); struct vnode *vp; struct vattr vattr; struct smbfattr fattr; struct smb_cred *scred; char *name = cnp->cn_nameptr; int nmlen = cnp->cn_namelen; int error; SMBVDEBUG("\n"); *vpp = NULL; if (vap->va_type != VREG) return EOPNOTSUPP; if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred))) return error; scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, cnp->cn_cred); error = smbfs_smb_create(dnp, name, nmlen, scred); if (error) goto out; error = smbfs_smb_lookup(dnp, name, nmlen, &fattr, scred); if (error) goto out; error = smbfs_nget(VTOVFS(dvp), dvp, name, nmlen, &fattr, &vp); if (error) goto out; *vpp = vp; if (cnp->cn_flags & MAKEENTRY) cache_enter(dvp, vp, cnp); out: smbfs_free_scred(scred); return error; } static int smbfs_remove(struct vop_remove_args *ap) { struct vnode *vp = ap->a_vp; /* struct vnode *dvp = ap->a_dvp;*/ struct componentname *cnp = ap->a_cnp; struct smbnode *np = VTOSMB(vp); struct smb_cred *scred; int error; if (vp->v_type == VDIR || (np->n_flag & NOPEN) != 0 || vrefcnt(vp) != 1) return EPERM; scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, cnp->cn_cred); error = smbfs_smb_delete(np, scred); if (error == 0) np->n_flag |= NGONE; cache_purge(vp); smbfs_free_scred(scred); return error; } /* * smbfs_file rename call */ static int smbfs_rename(struct vop_rename_args *ap) { struct vnode *fvp = ap->a_fvp; struct vnode *tvp = ap->a_tvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tdvp = ap->a_tdvp; struct componentname *tcnp = ap->a_tcnp; /* struct componentname *fcnp = ap->a_fcnp;*/ struct smb_cred *scred; #ifdef notnow u_int16_t flags = 6; #endif int error=0; scred = NULL; /* Check for cross-device rename */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; goto out; } if (tvp && vrefcnt(tvp) > 1) { error = EBUSY; goto out; } #ifdef notnow flags = 0x10; /* verify all writes */ #endif if (fvp->v_type == VDIR) { #ifdef notnow flags |= 2; #endif } else if (fvp->v_type == VREG) { #ifdef notnow flags |= 1; #endif } else { return EINVAL; } scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, tcnp->cn_cred); /* * It seems that Samba doesn't implement SMB_COM_MOVE call... */ #ifdef notnow if (SMB_DIALECT(SSTOCN(smp->sm_share)) >= SMB_DIALECT_LANMAN1_0) { error = smbfs_smb_move(VTOSMB(fvp), VTOSMB(tdvp), tcnp->cn_nameptr, tcnp->cn_namelen, flags, scred); } else #endif { /* * We have to do the work atomicaly */ if (tvp && tvp != fvp) { error = smbfs_smb_delete(VTOSMB(tvp), scred); if (error) goto out_cacherem; VTOSMB(fvp)->n_flag |= NGONE; } error = smbfs_smb_rename(VTOSMB(fvp), VTOSMB(tdvp), tcnp->cn_nameptr, tcnp->cn_namelen, scred); } if (fvp->v_type == VDIR) { if (tvp != NULL && tvp->v_type == VDIR) cache_purge(tdvp); cache_purge(fdvp); } out_cacherem: smbfs_attr_cacheremove(fdvp); smbfs_attr_cacheremove(tdvp); out: smbfs_free_scred(scred); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); #ifdef possible_mistake vgone(fvp); if (tvp) vgone(tvp); #endif return error; } /* * somtime it will come true... */ static int smbfs_link(struct vop_link_args *ap) { return EOPNOTSUPP; } /* * smbfs_symlink link create call. * Sometime it will be functional... */ static int smbfs_symlink(struct vop_symlink_args *ap) { return EOPNOTSUPP; } static int smbfs_mknod(struct vop_mknod_args *ap) { return EOPNOTSUPP; } static int smbfs_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; /* struct vattr *vap = ap->a_vap;*/ struct vnode *vp; struct componentname *cnp = ap->a_cnp; struct smbnode *dnp = VTOSMB(dvp); struct vattr vattr; struct smb_cred *scred; struct smbfattr fattr; char *name = cnp->cn_nameptr; int len = cnp->cn_namelen; int error; if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred))) { return error; } if ((name[0] == '.') && ((len == 1) || ((len == 2) && (name[1] == '.')))) return EEXIST; scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, cnp->cn_cred); error = smbfs_smb_mkdir(dnp, name, len, scred); if (error) goto out; error = smbfs_smb_lookup(dnp, name, len, &fattr, scred); if (error) goto out; error = smbfs_nget(VTOVFS(dvp), dvp, name, len, &fattr, &vp); if (error) goto out; *ap->a_vpp = vp; out: smbfs_free_scred(scred); return error; } /* * smbfs_remove directory call */ static int smbfs_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; /* struct smbmount *smp = VTOSMBFS(vp);*/ struct smbnode *dnp = VTOSMB(dvp); struct smbnode *np = VTOSMB(vp); struct smb_cred *scred; int error; if (dvp == vp) return EINVAL; scred = smbfs_malloc_scred(); smb_makescred(scred, curthread, cnp->cn_cred); error = smbfs_smb_rmdir(np, scred); if (error == 0) np->n_flag |= NGONE; dnp->n_flag |= NMODIFIED; smbfs_attr_cacheremove(dvp); /* cache_purge(dvp);*/ cache_purge(vp); smbfs_free_scred(scred); return error; } /* * smbfs_readdir call */ static int smbfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int error; if (vp->v_type != VDIR) return (EPERM); #ifdef notnow if (ap->a_ncookies) { printf("smbfs_readdir: no support for cookies now..."); return (EOPNOTSUPP); } #endif error = smbfs_readvnode(vp, uio, ap->a_cred); return error; } /* ARGSUSED */ static int smbfs_fsync(struct vop_fsync_args *ap) { /* return (smb_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_td, 1));*/ return (0); } static int smbfs_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); if (np == NULL) { printf("no smbnode data\n"); return (0); } printf("\tname = %s, parent = %p, open = %d\n", np->n_name, np->n_parent ? np->n_parent : NULL, (np->n_flag & NOPEN) != 0); return (0); } static int smbfs_pathconf(struct vop_pathconf_args *ap) { struct smbmount *smp = VFSTOSMBFS(VTOVFS(ap->a_vp)); struct smb_vc *vcp = SSTOVC(smp->sm_share); long *retval = ap->a_retval; int error = 0; switch (ap->a_name) { case _PC_FILESIZEBITS: if (vcp->vc_sopt.sv_caps & (SMB_CAP_LARGE_READX | SMB_CAP_LARGE_WRITEX)) *retval = 64; else *retval = 32; break; case _PC_NAME_MAX: *retval = (vcp->vc_hflags2 & SMB_FLAGS2_KNOWS_LONG_NAMES) ? 255 : 12; break; case _PC_PATH_MAX: *retval = 800; /* XXX: a correct one ? */ break; case _PC_NO_TRUNC: *retval = 1; break; default: error = vop_stdpathconf(ap); } return error; } static int smbfs_strategy(struct vop_strategy_args *ap) { struct buf *bp=ap->a_bp; struct ucred *cr; struct thread *td; SMBVDEBUG("\n"); if (bp->b_flags & B_ASYNC) td = (struct thread *)0; else td = curthread; /* XXX */ if (bp->b_iocmd == BIO_READ) cr = bp->b_rcred; else cr = bp->b_wcred; if ((bp->b_flags & B_ASYNC) == 0 ) (void)smbfs_doio(ap->a_vp, bp, cr, td); return (0); } int smbfs_ioctl(struct vop_ioctl_args *ap) { return ENOTTY; } static char smbfs_atl[] = "rhsvda"; static int smbfs_getextattr(struct vop_getextattr_args *ap) /* { IN struct vnode *a_vp; IN char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; struct uio *uio = ap->a_uio; const char *name = ap->a_name; struct smbnode *np = VTOSMB(vp); struct vattr vattr; char buf[10]; int i, attr, error; error = VOP_ACCESS(vp, VREAD, cred, td); if (error) return error; error = VOP_GETATTR(vp, &vattr, cred); if (error) return error; if (strcmp(name, "dosattr") == 0) { attr = np->n_dosattr; for (i = 0; i < 6; i++, attr >>= 1) buf[i] = (attr & 1) ? smbfs_atl[i] : '-'; buf[i] = 0; error = uiomove(buf, i, uio); } else error = EINVAL; return error; } /* * Since we expected to support F_GETLK (and SMB protocol has no such function), * it is necessary to use lf_advlock(). It would be nice if this function had * a callback mechanism because it will help to improve a level of consistency. */ int smbfs_advlock(struct vop_advlock_args *ap) { struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct flock *fl = ap->a_fl; caddr_t id = (caddr_t)1 /* ap->a_id */; /* int flags = ap->a_flags;*/ struct thread *td = curthread; struct smb_cred *scred; u_quad_t size; off_t start, end, oadd; int error, lkop; if (vp->v_type == VDIR) { /* * SMB protocol have no support for directory locking. * Although locks can be processed on local machine, I don't * think that this is a good idea, because some programs * can work wrong assuming directory is locked. So, we just * return 'operation not supported */ return EOPNOTSUPP; } size = np->n_size; switch (fl->l_whence) { case SEEK_SET: case SEEK_CUR: start = fl->l_start; break; case SEEK_END: if (size > OFF_MAX || (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) return EOVERFLOW; start = size + fl->l_start; break; default: return EINVAL; } if (start < 0) return EINVAL; if (fl->l_len < 0) { if (start == 0) return EINVAL; end = start - 1; start += fl->l_len; if (start < 0) return EINVAL; } else if (fl->l_len == 0) end = -1; else { oadd = fl->l_len - 1; if (oadd > OFF_MAX - start) return EOVERFLOW; end = start + oadd; } scred = smbfs_malloc_scred(); smb_makescred(scred, td, td->td_ucred); switch (ap->a_op) { case F_SETLK: switch (fl->l_type) { case F_WRLCK: lkop = SMB_LOCK_EXCL; break; case F_RDLCK: lkop = SMB_LOCK_SHARED; break; case F_UNLCK: lkop = SMB_LOCK_RELEASE; break; default: smbfs_free_scred(scred); return EINVAL; } error = lf_advlock(ap, &vp->v_lockf, size); if (error) break; lkop = SMB_LOCK_EXCL; error = smbfs_smb_lock(np, lkop, id, start, end, scred); if (error) { int oldtype = fl->l_type; fl->l_type = F_UNLCK; ap->a_op = F_UNLCK; lf_advlock(ap, &vp->v_lockf, size); fl->l_type = oldtype; } break; case F_UNLCK: lf_advlock(ap, &vp->v_lockf, size); error = smbfs_smb_lock(np, SMB_LOCK_RELEASE, id, start, end, scred); break; case F_GETLK: error = lf_advlock(ap, &vp->v_lockf, size); break; default: smbfs_free_scred(scred); return EINVAL; } smbfs_free_scred(scred); return error; } static int smbfs_pathcheck(struct smbmount *smp, const char *name, int nmlen, int nameiop) { static const char *badchars = "*/:<>?"; static const char *badchars83 = " +|,[]=;"; const char *cp; int i, error; /* * Backslash characters, being a path delimiter, are prohibited * within a path component even for LOOKUP operations. */ if (strchr(name, '\\') != NULL) return ENOENT; if (nameiop == LOOKUP) return 0; error = ENOENT; if (SMB_DIALECT(SSTOVC(smp->sm_share)) < SMB_DIALECT_LANMAN2_0) { /* * Name should conform 8.3 format */ if (nmlen > 12) return ENAMETOOLONG; cp = strchr(name, '.'); if (cp == NULL) return error; if (cp == name || (cp - name) > 8) return error; cp = strchr(cp + 1, '.'); if (cp != NULL) return error; for (cp = name, i = 0; i < nmlen; i++, cp++) if (strchr(badchars83, *cp) != NULL) return error; } for (cp = name, i = 0; i < nmlen; i++, cp++) if (strchr(badchars, *cp) != NULL) return error; return 0; } /* * Things go even weird without fixed inode numbers... */ int smbfs_lookup(struct vop_lookup_args *ap) { struct componentname *cnp = ap->a_cnp; struct thread *td = curthread; struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct vnode *vp; struct smbmount *smp; struct mount *mp = dvp->v_mount; struct smbnode *dnp; struct smbfattr fattr, *fap; struct smb_cred *scred; char *name = cnp->cn_nameptr; - int flags = cnp->cn_flags; + uint64_t flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; int nmlen = cnp->cn_namelen; int error, islastcn, isdot; int killit; SMBVDEBUG("\n"); if (dvp->v_type != VDIR) return ENOTDIR; if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) { SMBFSERR("invalid '..'\n"); return EIO; } islastcn = flags & ISLASTCN; if (islastcn && (mp->mnt_flag & MNT_RDONLY) && (nameiop != LOOKUP)) return EROFS; error = vn_dir_check_exec(dvp, cnp); if (error != 0) return error; smp = VFSTOSMBFS(mp); dnp = VTOSMB(dvp); isdot = (nmlen == 1 && name[0] == '.'); error = smbfs_pathcheck(smp, cnp->cn_nameptr, cnp->cn_namelen, nameiop); if (error) return ENOENT; error = cache_lookup(dvp, vpp, cnp, NULL, NULL); SMBVDEBUG("cache_lookup returned %d\n", error); if (error > 0) return error; if (error) { /* name was found */ struct vattr vattr; killit = 0; vp = *vpp; error = VOP_GETATTR(vp, &vattr, cnp->cn_cred); /* * If the file type on the server is inconsistent * with what it was when we created the vnode, * kill the bogus vnode now and fall through to * the code below to create a new one with the * right type. */ if (error == 0 && ((vp->v_type == VDIR && (VTOSMB(vp)->n_dosattr & SMB_FA_DIR) == 0) || (vp->v_type == VREG && (VTOSMB(vp)->n_dosattr & SMB_FA_DIR) != 0))) killit = 1; else if (error == 0 /* && vattr.va_ctime.tv_sec == VTOSMB(vp)->n_ctime*/) { SMBVDEBUG("use cached vnode\n"); return (0); } cache_purge(vp); /* * XXX This is not quite right, if '.' is * inconsistent, we really need to start the lookup * all over again. Hopefully there is some other * guarantee that prevents this case from happening. */ if (killit && vp != dvp) vgone(vp); if (vp != dvp) vput(vp); else vrele(vp); *vpp = NULLVP; } /* * entry is not in the cache or has been expired */ error = 0; *vpp = NULLVP; scred = smbfs_malloc_scred(); smb_makescred(scred, td, cnp->cn_cred); fap = &fattr; if (flags & ISDOTDOT) { /* * In the DOTDOT case, don't go over-the-wire * in order to request attributes. We already * know it's a directory and subsequent call to * smbfs_getattr() will restore consistency. * */ SMBVDEBUG("smbfs_smb_lookup: dotdot\n"); } else if (isdot) { error = smbfs_smb_lookup(dnp, NULL, 0, fap, scred); SMBVDEBUG("result of smbfs_smb_lookup: %d\n", error); } else { error = smbfs_smb_lookup(dnp, name, nmlen, fap, scred); SMBVDEBUG("result of smbfs_smb_lookup: %d\n", error); } if (error && error != ENOENT) goto out; if (error) { /* entry not found */ /* * Handle RENAME or CREATE case... */ if ((nameiop == CREATE || nameiop == RENAME) && islastcn) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) goto out; error = EJUSTRETURN; goto out; } error = ENOENT; goto out; }/* else { SMBVDEBUG("Found entry %s with id=%d\n", fap->entryName, fap->dirEntNum); }*/ /* * handle DELETE case ... */ if (nameiop == DELETE && islastcn) { /* delete last component */ error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) goto out; if (isdot) { VREF(dvp); *vpp = dvp; goto out; } error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp); if (error) goto out; *vpp = vp; goto out; } if (nameiop == RENAME && islastcn) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) goto out; if (isdot) { error = EISDIR; goto out; } error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp); if (error) goto out; *vpp = vp; goto out; } if (flags & ISDOTDOT) { mp = dvp->v_mount; error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(dvp); error = vfs_busy(mp, 0); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vfs_rel(mp); if (error) { error = ENOENT; goto out; } if (VN_IS_DOOMED(dvp)) { vfs_unbusy(mp); error = ENOENT; goto out; } } VOP_UNLOCK(dvp); error = smbfs_nget(mp, dvp, name, nmlen, NULL, &vp); vfs_unbusy(mp); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(dvp)) { if (error == 0) vput(vp); error = ENOENT; } if (error) goto out; *vpp = vp; } else if (isdot) { vref(dvp); *vpp = dvp; } else { error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp); if (error) goto out; *vpp = vp; SMBVDEBUG("lookup: getnewvp!\n"); } if ((cnp->cn_flags & MAKEENTRY)/* && !islastcn*/) { /* VTOSMB(*vpp)->n_ctime = VTOSMB(*vpp)->n_vattr.va_ctime.tv_sec;*/ cache_enter(dvp, *vpp, cnp); } out: smbfs_free_scred(scred); return (error); } diff --git a/sys/fs/unionfs/union_vnops.c b/sys/fs/unionfs/union_vnops.c index a930e3921ab3..03130f0ca949 100644 --- a/sys/fs/unionfs/union_vnops.c +++ b/sys/fs/unionfs/union_vnops.c @@ -1,3047 +1,3047 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry. * Copyright (c) 1992, 1993, 1994, 1995 * The Regents of the University of California. * Copyright (c) 2005, 2006, 2012 Masanori Ozawa , ONGS Inc. * Copyright (c) 2006, 2012 Daichi Goto * All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if 0 #define UNIONFS_INTERNAL_DEBUG(msg, args...) printf(msg, ## args) #define UNIONFS_IDBG_RENAME #else #define UNIONFS_INTERNAL_DEBUG(msg, args...) #endif #define KASSERT_UNIONFS_VNODE(vp) \ VNASSERT(((vp)->v_op == &unionfs_vnodeops), vp, \ ("%s: non-unionfs vnode", __func__)) static bool unionfs_lookup_isroot(struct componentname *cnp, struct vnode *dvp) { struct nameidata *ndp; if (dvp == NULL) return (false); if ((dvp->v_vflag & VV_ROOT) != 0) return (true); ndp = vfs_lookup_nameidata(cnp); if (ndp == NULL) return (false); return (vfs_lookup_isroot(ndp, dvp)); } static int unionfs_lookup(struct vop_cachedlookup_args *ap) { struct unionfs_node *dunp, *unp; struct vnode *dvp, *udvp, *ldvp, *vp, *uvp, *lvp, *dtmpvp; struct vattr va; struct componentname *cnp; struct thread *td; + uint64_t cnflags; u_long nameiop; - u_long cnflags; int lockflag; int lkflags; int error, uerror, lerror; lockflag = 0; error = uerror = lerror = ENOENT; cnp = ap->a_cnp; nameiop = cnp->cn_nameiop; cnflags = cnp->cn_flags; dvp = ap->a_dvp; dunp = VTOUNIONFS(dvp); udvp = dunp->un_uppervp; ldvp = dunp->un_lowervp; vp = uvp = lvp = NULLVP; td = curthread; *(ap->a_vpp) = NULLVP; UNIONFS_INTERNAL_DEBUG( "unionfs_lookup: enter: nameiop=%ld, flags=%lx, path=%s\n", nameiop, cnflags, cnp->cn_nameptr); if (dvp->v_type != VDIR) return (ENOTDIR); /* * If read-only and op is not LOOKUP, will return EROFS. */ if ((cnflags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && LOOKUP != nameiop) return (EROFS); /* * Note that a lookup is in-flight, and block if another lookup * is already in-flight against dvp. This is done because we may * end up dropping dvp's lock to look up a lower vnode or to create * a shadow directory, opening up the possibility of parallel lookups * against the same directory creating duplicate unionfs vnodes for * the same file(s). Note that if this function encounters an * in-progress lookup for the directory, it will block until the * lookup is complete and then return ERELOOKUP to allow any * existing unionfs vnode to be loaded from the VFS cache. * This is really a hack; filesystems that support MNTK_LOOKUP_SHARED * (which unionfs currently doesn't) seem to deal with this by using * the vfs_hash_* functions to manage a per-mount vnode cache keyed * by the inode number (or some roughly equivalent unique ID * usually assocated with the storage medium). It may make sense * for unionfs to adopt something similar as a replacement for its * current half-baked directory-only cache implementation, particularly * if we want to support MNTK_LOOKUP_SHARED here. */ error = unionfs_set_in_progress_flag(dvp, UNIONFS_LOOKUP_IN_PROGRESS); if (error != 0) return (error); /* * lookup dotdot */ if (cnflags & ISDOTDOT) { if (LOOKUP != nameiop && udvp == NULLVP) { error = EROFS; goto unionfs_lookup_return; } if (unionfs_lookup_isroot(cnp, udvp) || unionfs_lookup_isroot(cnp, ldvp)) { error = ENOENT; goto unionfs_lookup_return; } if (udvp != NULLVP) dtmpvp = udvp; else dtmpvp = ldvp; unionfs_forward_vop_start(dtmpvp, &lkflags); error = VOP_LOOKUP(dtmpvp, &vp, cnp); unionfs_forward_vop_finish(dvp, dtmpvp, lkflags); /* * Drop the lock and reference on vp. If the lookup was * successful, we'll either need to exchange vp's lock and * reference for the unionfs parent vnode's lock and * reference, or (if dvp was reclaimed) we'll need to drop * vp's lock and reference to return early. */ if (vp != NULLVP) vput(vp); dunp = VTOUNIONFS(dvp); if (error == 0 && dunp == NULL) error = ENOENT; if (error == 0) { dtmpvp = dunp->un_dvp; vref(dtmpvp); VOP_UNLOCK(dvp); *(ap->a_vpp) = dtmpvp; vn_lock(dtmpvp, cnp->cn_lkflags | LK_RETRY); if (VN_IS_DOOMED(dtmpvp)) { vput(dtmpvp); *(ap->a_vpp) = NULLVP; error = ENOENT; } vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } goto unionfs_lookup_cleanup; } /* * Lookup lower layer. We do this before looking up the the upper * layer, as we may drop the upper parent directory's lock, and we * want to ensure the upper parent remains locked from the point of * lookup through any ensuing VOP that may require it to be locked. * The cost of this is that we may end up performing an unnecessary * lower layer lookup if a whiteout is present in the upper layer. */ if (ldvp != NULLVP && !(cnflags & DOWHITEOUT)) { struct componentname lcn; bool is_dot; if (udvp != NULLVP) { vref(ldvp); VOP_UNLOCK(dvp); vn_lock(ldvp, LK_EXCLUSIVE | LK_RETRY); } lcn = *cnp; /* always op is LOOKUP */ lcn.cn_nameiop = LOOKUP; lcn.cn_flags = cnflags; is_dot = false; if (udvp == NULLVP) unionfs_forward_vop_start(ldvp, &lkflags); lerror = VOP_LOOKUP(ldvp, &lvp, &lcn); if (udvp == NULLVP && unionfs_forward_vop_finish(dvp, ldvp, lkflags)) { if (lvp != NULLVP) VOP_UNLOCK(lvp); error = ENOENT; goto unionfs_lookup_cleanup; } if (udvp == NULLVP) cnp->cn_flags = lcn.cn_flags; if (lerror == 0) { if (ldvp == lvp) { /* is dot */ vrele(lvp); *(ap->a_vpp) = dvp; vref(dvp); is_dot = true; error = lerror; } else if (lvp != NULLVP) VOP_UNLOCK(lvp); } if (udvp != NULLVP) { vput(ldvp); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(dvp)) error = ENOENT; } if (is_dot) goto unionfs_lookup_return; else if (error != 0) goto unionfs_lookup_cleanup; } /* * lookup upper layer */ if (udvp != NULLVP) { bool iswhiteout = false; unionfs_forward_vop_start(udvp, &lkflags); uerror = VOP_LOOKUP(udvp, &uvp, cnp); if (unionfs_forward_vop_finish(dvp, udvp, lkflags)) { if (uvp != NULLVP) VOP_UNLOCK(uvp); error = ENOENT; goto unionfs_lookup_cleanup; } if (uerror == 0) { if (udvp == uvp) { /* is dot */ if (lvp != NULLVP) vrele(lvp); vrele(uvp); *(ap->a_vpp) = dvp; vref(dvp); error = uerror; goto unionfs_lookup_return; } else if (uvp != NULLVP) VOP_UNLOCK(uvp); } /* check whiteout */ if ((uerror == ENOENT || uerror == EJUSTRETURN) && (cnp->cn_flags & ISWHITEOUT)) iswhiteout = true; else if (VOP_GETATTR(udvp, &va, cnp->cn_cred) == 0 && (va.va_flags & OPAQUE)) iswhiteout = true; if (iswhiteout && lvp != NULLVP) { vrele(lvp); lvp = NULLVP; } #if 0 UNIONFS_INTERNAL_DEBUG( "unionfs_lookup: debug: whiteout=%d, path=%s\n", iswhiteout, cnp->cn_nameptr); #endif } /* * check lookup result */ if (uvp == NULLVP && lvp == NULLVP) { error = (udvp != NULLVP ? uerror : lerror); goto unionfs_lookup_return; } /* * check vnode type */ if (uvp != NULLVP && lvp != NULLVP && uvp->v_type != lvp->v_type) { vrele(lvp); lvp = NULLVP; } /* * check shadow dir */ if (uerror != 0 && uerror != EJUSTRETURN && udvp != NULLVP && lerror == 0 && lvp != NULLVP && lvp->v_type == VDIR && !(dvp->v_mount->mnt_flag & MNT_RDONLY) && (1 < cnp->cn_namelen || '.' != *(cnp->cn_nameptr))) { /* get unionfs vnode in order to create a new shadow dir. */ error = unionfs_nodeget(dvp->v_mount, NULLVP, lvp, dvp, &vp, cnp); if (error != 0) goto unionfs_lookup_cleanup; if (LK_SHARED == (cnp->cn_lkflags & LK_TYPE_MASK)) VOP_UNLOCK(vp); if (LK_EXCLUSIVE != VOP_ISLOCKED(vp)) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); lockflag = 1; } unp = VTOUNIONFS(vp); if (unp == NULL) error = ENOENT; else error = unionfs_mkshadowdir(dvp, vp, cnp, td); if (lockflag != 0) VOP_UNLOCK(vp); if (error != 0) { UNIONFSDEBUG( "unionfs_lookup: Unable to create shadow dir."); if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) vput(vp); else vrele(vp); goto unionfs_lookup_cleanup; } /* * TODO: Since unionfs_mkshadowdir() relocks udvp after * creating the new directory, return ERELOOKUP here? */ if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_SHARED) vn_lock(vp, LK_SHARED | LK_RETRY); } /* * get unionfs vnode. */ else { if (uvp != NULLVP) error = uerror; else error = lerror; if (error != 0) goto unionfs_lookup_cleanup; error = unionfs_nodeget(dvp->v_mount, uvp, lvp, dvp, &vp, cnp); if (error != 0) { UNIONFSDEBUG( "unionfs_lookup: Unable to create unionfs vnode."); goto unionfs_lookup_cleanup; } } if (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp)) { error = ENOENT; vput(vp); goto unionfs_lookup_cleanup; } *(ap->a_vpp) = vp; if (cnflags & MAKEENTRY) cache_enter(dvp, vp, cnp); unionfs_lookup_cleanup: if (uvp != NULLVP) vrele(uvp); if (lvp != NULLVP) vrele(lvp); if (error == ENOENT && (cnflags & MAKEENTRY) != 0 && !VN_IS_DOOMED(dvp)) cache_enter(dvp, NULLVP, cnp); unionfs_lookup_return: unionfs_clear_in_progress_flag(dvp, UNIONFS_LOOKUP_IN_PROGRESS); UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error); return (error); } static int unionfs_create(struct vop_create_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *udvp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_create: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; udvp = dunp->un_uppervp; error = EROFS; if (udvp != NULLVP) { int lkflags; bool vp_created = false; unionfs_forward_vop_start(udvp, &lkflags); error = VOP_CREATE(udvp, &vp, cnp, ap->a_vap); if (error == 0) vp_created = true; if (__predict_false(unionfs_forward_vop_finish(ap->a_dvp, udvp, lkflags)) && error == 0) { error = ENOENT; } if (error == 0) { VOP_UNLOCK(vp); error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP, ap->a_dvp, ap->a_vpp, cnp); vrele(vp); } else if (vp_created) vput(vp); } UNIONFS_INTERNAL_DEBUG("unionfs_create: leave (%d)\n", error); return (error); } static int unionfs_whiteout(struct vop_whiteout_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *udvp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; udvp = dunp->un_uppervp; error = EOPNOTSUPP; if (udvp != NULLVP) { int lkflags; switch (ap->a_flags) { case CREATE: case DELETE: case LOOKUP: unionfs_forward_vop_start(udvp, &lkflags); error = VOP_WHITEOUT(udvp, cnp, ap->a_flags); unionfs_forward_vop_finish(ap->a_dvp, udvp, lkflags); break; default: error = EINVAL; break; } } UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: leave (%d)\n", error); return (error); } static int unionfs_mknod(struct vop_mknod_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *udvp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_mknod: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; udvp = dunp->un_uppervp; error = EROFS; if (udvp != NULLVP) { int lkflags; bool vp_created = false; unionfs_forward_vop_start(udvp, &lkflags); error = VOP_MKNOD(udvp, &vp, cnp, ap->a_vap); if (error == 0) vp_created = true; if (__predict_false(unionfs_forward_vop_finish(ap->a_dvp, udvp, lkflags)) && error == 0) { error = ENOENT; } if (error == 0) { VOP_UNLOCK(vp); error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP, ap->a_dvp, ap->a_vpp, cnp); vrele(vp); } else if (vp_created) vput(vp); } UNIONFS_INTERNAL_DEBUG("unionfs_mknod: leave (%d)\n", error); return (error); } enum unionfs_lkupgrade { UNIONFS_LKUPGRADE_SUCCESS, /* lock successfully upgraded */ UNIONFS_LKUPGRADE_ALREADY, /* lock already held exclusive */ UNIONFS_LKUPGRADE_DOOMED /* lock was upgraded, but vnode reclaimed */ }; static inline enum unionfs_lkupgrade unionfs_upgrade_lock(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) return (UNIONFS_LKUPGRADE_ALREADY); if (vn_lock(vp, LK_UPGRADE) != 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(vp)) return (UNIONFS_LKUPGRADE_DOOMED); } return (UNIONFS_LKUPGRADE_SUCCESS); } static inline void unionfs_downgrade_lock(struct vnode *vp, enum unionfs_lkupgrade status) { if (status != UNIONFS_LKUPGRADE_ALREADY) vn_lock(vp, LK_DOWNGRADE | LK_RETRY); } /* * Exchange the default (upper vnode) lock on a unionfs vnode for the lower * vnode lock, in support of operations that require access to the lower vnode * even when an upper vnode is present. We don't use vn_lock_pair() to hold * both vnodes at the same time, primarily because the caller may proceed * to issue VOPs to the lower layer which re-lock or perform other operations * which may not be safe in the presence of a locked vnode from another FS. * Moreover, vn_lock_pair()'s deadlock resolution approach can introduce * additional overhead that isn't necessary on these paths. * * vp must be a locked unionfs vnode; the lock state of this vnode is * returned through *lkflags for later use in unionfs_unlock_lvp(). * * Returns the locked lower vnode, or NULL if the lower vnode (and therefore * also the unionfs vnode above it) has been doomed. */ static struct vnode * unionfs_lock_lvp(struct vnode *vp, int *lkflags) { struct unionfs_node *unp; struct vnode *lvp; unp = VTOUNIONFS(vp); lvp = unp->un_lowervp; ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VOP_UNLOCKED(lvp, __func__); *lkflags = VOP_ISLOCKED(vp); vref(lvp); VOP_UNLOCK(vp); vn_lock(lvp, *lkflags | LK_RETRY); if (VN_IS_DOOMED(lvp)) { vput(lvp); lvp = NULLVP; vn_lock(vp, *lkflags | LK_RETRY); } return (lvp); } /* * Undo a previous call to unionfs_lock_lvp(), restoring the default lock * on the unionfs vnode. This function reloads and returns the vnode * private data for the unionfs vnode, which will be NULL if the unionfs * vnode became doomed while its lock was dropped. The caller must check * for this case. */ static struct unionfs_node * unionfs_unlock_lvp(struct vnode *vp, struct vnode *lvp, int lkflags) { ASSERT_VOP_LOCKED(lvp, __func__); ASSERT_VOP_UNLOCKED(vp, __func__); vput(lvp); vn_lock(vp, lkflags | LK_RETRY); return (VTOUNIONFS(vp)); } static int unionfs_open(struct vop_open_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *vp; struct vnode *uvp; struct vnode *lvp; struct vnode *targetvp; struct ucred *cred; struct thread *td; int error; int lkflags; enum unionfs_lkupgrade lkstatus; bool lock_lvp, open_lvp; UNIONFS_INTERNAL_DEBUG("unionfs_open: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); error = 0; vp = ap->a_vp; targetvp = NULLVP; cred = ap->a_cred; td = ap->a_td; open_lvp = lock_lvp = false; /* * The executable loader path may call this function with vp locked * shared. If the vnode is reclaimed while upgrading, we can't safely * use unp or do anything else unionfs- specific. */ lkstatus = unionfs_upgrade_lock(vp); if (lkstatus == UNIONFS_LKUPGRADE_DOOMED) { error = ENOENT; goto unionfs_open_cleanup; } unp = VTOUNIONFS(vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; unionfs_get_node_status(unp, td, &unsp); if (unsp->uns_lower_opencnt > 0 || unsp->uns_upper_opencnt > 0) { /* vnode is already opend. */ if (unsp->uns_upper_opencnt > 0) targetvp = uvp; else targetvp = lvp; if (targetvp == lvp && (ap->a_mode & FWRITE) && lvp->v_type == VREG) targetvp = NULLVP; } if (targetvp == NULLVP) { if (uvp == NULLVP) { if ((ap->a_mode & FWRITE) && lvp->v_type == VREG) { error = unionfs_copyfile(vp, !(ap->a_mode & O_TRUNC), cred, td); if (error != 0) { unp = VTOUNIONFS(vp); goto unionfs_open_abort; } targetvp = uvp = unp->un_uppervp; } else targetvp = lvp; } else targetvp = uvp; } if (targetvp == uvp && uvp->v_type == VDIR && lvp != NULLVP && unsp->uns_lower_opencnt <= 0) open_lvp = true; else if (targetvp == lvp && uvp != NULLVP) lock_lvp = true; if (lock_lvp) { unp = NULL; lvp = unionfs_lock_lvp(vp, &lkflags); if (lvp == NULLVP) { error = ENOENT; goto unionfs_open_abort; } } else unionfs_forward_vop_start(targetvp, &lkflags); error = VOP_OPEN(targetvp, ap->a_mode, cred, td, ap->a_fp); if (lock_lvp) { unp = unionfs_unlock_lvp(vp, lvp, lkflags); if (unp == NULL && error == 0) error = ENOENT; } else if (unionfs_forward_vop_finish(vp, targetvp, lkflags)) error = error ? error : ENOENT; if (error != 0) goto unionfs_open_abort; if (targetvp == uvp) { if (open_lvp) { unp = NULL; lvp = unionfs_lock_lvp(vp, &lkflags); if (lvp == NULLVP) { error = ENOENT; goto unionfs_open_abort; } /* open lower for readdir */ error = VOP_OPEN(lvp, FREAD, cred, td, NULL); unp = unionfs_unlock_lvp(vp, lvp, lkflags); if (unp == NULL) { error = error ? error : ENOENT; goto unionfs_open_abort; } if (error != 0) { unionfs_forward_vop_start(uvp, &lkflags); VOP_CLOSE(uvp, ap->a_mode, cred, td); if (unionfs_forward_vop_finish(vp, uvp, lkflags)) unp = NULL; goto unionfs_open_abort; } unsp->uns_node_flag |= UNS_OPENL_4_READDIR; unsp->uns_lower_opencnt++; } unsp->uns_upper_opencnt++; } else { unsp->uns_lower_opencnt++; unsp->uns_lower_openmode = ap->a_mode; } vp->v_object = targetvp->v_object; unionfs_open_abort: if (error != 0 && unp != NULL) unionfs_tryrem_node_status(unp, unsp); unionfs_open_cleanup: unionfs_downgrade_lock(vp, lkstatus); UNIONFS_INTERNAL_DEBUG("unionfs_open: leave (%d)\n", error); return (error); } static int unionfs_close(struct vop_close_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct ucred *cred; struct thread *td; struct vnode *vp; struct vnode *uvp; struct vnode *lvp; struct vnode *ovp; int error; int lkflags; enum unionfs_lkupgrade lkstatus; bool lock_lvp; UNIONFS_INTERNAL_DEBUG("unionfs_close: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); vp = ap->a_vp; cred = ap->a_cred; td = ap->a_td; error = 0; lock_lvp = false; /* * If the vnode is reclaimed while upgrading, we can't safely use unp * or do anything else unionfs- specific. */ lkstatus = unionfs_upgrade_lock(vp); if (lkstatus == UNIONFS_LKUPGRADE_DOOMED) goto unionfs_close_cleanup; unp = VTOUNIONFS(vp); lvp = unp->un_lowervp; uvp = unp->un_uppervp; unsp = unionfs_find_node_status(unp, td); if (unsp == NULL || (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0)) { #ifdef DIAGNOSTIC if (unsp != NULL) printf("unionfs_close: warning: open count is 0\n"); #endif if (uvp != NULLVP) ovp = uvp; else ovp = lvp; } else if (unsp->uns_upper_opencnt > 0) ovp = uvp; else ovp = lvp; if (ovp == lvp && uvp != NULLVP) { lock_lvp = true; unp = NULL; lvp = unionfs_lock_lvp(vp, &lkflags); if (lvp == NULLVP) { error = ENOENT; goto unionfs_close_abort; } } else unionfs_forward_vop_start(ovp, &lkflags); error = VOP_CLOSE(ovp, ap->a_fflag, cred, td); if (lock_lvp) { unp = unionfs_unlock_lvp(vp, lvp, lkflags); if (unp == NULL && error == 0) error = ENOENT; } else if (unionfs_forward_vop_finish(vp, ovp, lkflags)) error = error ? error : ENOENT; if (error != 0) goto unionfs_close_abort; vp->v_object = ovp->v_object; if (ovp == uvp) { if (unsp != NULL && ((--unsp->uns_upper_opencnt) == 0)) { if (unsp->uns_node_flag & UNS_OPENL_4_READDIR) { unp = NULL; lvp = unionfs_lock_lvp(vp, &lkflags); if (lvp == NULLVP) { error = ENOENT; goto unionfs_close_abort; } VOP_CLOSE(lvp, FREAD, cred, td); unp = unionfs_unlock_lvp(vp, lvp, lkflags); if (unp == NULL) { error = ENOENT; goto unionfs_close_abort; } unsp->uns_node_flag &= ~UNS_OPENL_4_READDIR; unsp->uns_lower_opencnt--; } if (unsp->uns_lower_opencnt > 0) vp->v_object = lvp->v_object; } } else if (unsp != NULL) unsp->uns_lower_opencnt--; unionfs_close_abort: if (unp != NULL && unsp != NULL) unionfs_tryrem_node_status(unp, unsp); unionfs_close_cleanup: unionfs_downgrade_lock(vp, lkstatus); UNIONFS_INTERNAL_DEBUG("unionfs_close: leave (%d)\n", error); return (error); } /* * Check the access mode toward shadow file/dir. */ static int unionfs_check_corrected_access(accmode_t accmode, struct vattr *va, struct ucred *cred) { uid_t uid; /* upper side vnode's uid */ gid_t gid; /* upper side vnode's gid */ u_short vmode; /* upper side vnode's mode */ u_short mask; mask = 0; uid = va->va_uid; gid = va->va_gid; vmode = va->va_mode; /* check owner */ if (cred->cr_uid == uid) { if (accmode & VEXEC) mask |= S_IXUSR; if (accmode & VREAD) mask |= S_IRUSR; if (accmode & VWRITE) mask |= S_IWUSR; return ((vmode & mask) == mask ? 0 : EACCES); } /* check group */ if (groupmember(gid, cred)) { if (accmode & VEXEC) mask |= S_IXGRP; if (accmode & VREAD) mask |= S_IRGRP; if (accmode & VWRITE) mask |= S_IWGRP; return ((vmode & mask) == mask ? 0 : EACCES); } /* check other */ if (accmode & VEXEC) mask |= S_IXOTH; if (accmode & VREAD) mask |= S_IROTH; if (accmode & VWRITE) mask |= S_IWOTH; return ((vmode & mask) == mask ? 0 : EACCES); } static int unionfs_access(struct vop_access_args *ap) { struct unionfs_mount *ump; struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; accmode_t accmode; int error; UNIONFS_INTERNAL_DEBUG("unionfs_access: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; accmode = ap->a_accmode; error = EACCES; if ((accmode & VWRITE) && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (ap->a_vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } if (uvp != NULLVP) { error = VOP_ACCESS(uvp, accmode, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error); return (error); } if (lvp != NULLVP) { if (accmode & VWRITE) { if ((ump->um_uppermp->mnt_flag & MNT_RDONLY) != 0) { switch (ap->a_vp->v_type) { case VREG: case VDIR: case VLNK: return (EROFS); default: break; } } else if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) { /* check shadow file/dir */ if (ump->um_copymode != UNIONFS_TRANSPARENT) { error = unionfs_create_uppervattr(ump, lvp, &va, ap->a_cred, td); if (error != 0) return (error); error = unionfs_check_corrected_access( accmode, &va, ap->a_cred); if (error != 0) return (error); } } accmode &= ~(VWRITE | VAPPEND); accmode |= VREAD; /* will copy to upper */ } error = VOP_ACCESS(lvp, accmode, ap->a_cred, td); } UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error); return (error); } static int unionfs_getattr(struct vop_getattr_args *ap) { struct unionfs_node *unp; struct unionfs_mount *ump; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; int error; UNIONFS_INTERNAL_DEBUG("unionfs_getattr: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = curthread; if (uvp != NULLVP) { if ((error = VOP_GETATTR(uvp, ap->a_vap, ap->a_cred)) == 0) ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; UNIONFS_INTERNAL_DEBUG( "unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n", ap->a_vap->va_mode, ap->a_vap->va_uid, ap->a_vap->va_gid, error); return (error); } error = VOP_GETATTR(lvp, ap->a_vap, ap->a_cred); if (error == 0 && (ump->um_uppermp->mnt_flag & MNT_RDONLY) == 0) { /* correct the attr toward shadow file/dir. */ if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) { unionfs_create_uppervattr_core(ump, ap->a_vap, &va, td); ap->a_vap->va_mode = va.va_mode; ap->a_vap->va_uid = va.va_uid; ap->a_vap->va_gid = va.va_gid; } } if (error == 0) ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; UNIONFS_INTERNAL_DEBUG( "unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n", ap->a_vap->va_mode, ap->a_vap->va_uid, ap->a_vap->va_gid, error); return (error); } static int unionfs_setattr(struct vop_setattr_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr *vap; int error; UNIONFS_INTERNAL_DEBUG("unionfs_setattr: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = curthread; vap = ap->a_vap; if ((ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) && (vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL)) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { error = unionfs_copyfile(ap->a_vp, (vap->va_size != 0), ap->a_cred, td); if (error != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) { int lkflags; unionfs_forward_vop_start(uvp, &lkflags); error = VOP_SETATTR(uvp, vap, ap->a_cred); unionfs_forward_vop_finish(ap->a_vp, uvp, lkflags); } UNIONFS_INTERNAL_DEBUG("unionfs_setattr: leave (%d)\n", error); return (error); } static int unionfs_read(struct vop_read_args *ap) { struct unionfs_node *unp; struct vnode *tvp; int error; /* UNIONFS_INTERNAL_DEBUG("unionfs_read: enter\n"); */ KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_READ(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred); /* UNIONFS_INTERNAL_DEBUG("unionfs_read: leave (%d)\n", error); */ return (error); } static int unionfs_write(struct vop_write_args *ap) { struct unionfs_node *unp; struct vnode *tvp; int error; int lkflags; /* UNIONFS_INTERNAL_DEBUG("unionfs_write: enter\n"); */ KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); unionfs_forward_vop_start(tvp, &lkflags); error = VOP_WRITE(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred); unionfs_forward_vop_finish(ap->a_vp, tvp, lkflags); /* UNIONFS_INTERNAL_DEBUG("unionfs_write: leave (%d)\n", error); */ return (error); } static int unionfs_ioctl(struct vop_ioctl_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); unp = VTOUNIONFS(ap->a_vp); unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp); unionfs_tryrem_node_status(unp, unsp); VOP_UNLOCK(ap->a_vp); if (ovp == NULLVP) return (EBADF); error = VOP_IOCTL(ovp, ap->a_command, ap->a_data, ap->a_fflag, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: leave (%d)\n", error); return (error); } static int unionfs_poll(struct vop_poll_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; KASSERT_UNIONFS_VNODE(ap->a_vp); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); unp = VTOUNIONFS(ap->a_vp); unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp); unionfs_tryrem_node_status(unp, unsp); VOP_UNLOCK(ap->a_vp); if (ovp == NULLVP) return (EBADF); return (VOP_POLL(ovp, ap->a_events, ap->a_cred, ap->a_td)); } static int unionfs_fsync(struct vop_fsync_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *ovp; enum unionfs_lkupgrade lkstatus; int error, lkflags; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); lkstatus = unionfs_upgrade_lock(ap->a_vp); if (lkstatus == UNIONFS_LKUPGRADE_DOOMED) { unionfs_downgrade_lock(ap->a_vp, lkstatus); return (ENOENT); } unionfs_get_node_status(unp, ap->a_td, &unsp); ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp); unionfs_tryrem_node_status(unp, unsp); unionfs_downgrade_lock(ap->a_vp, lkstatus); if (ovp == NULLVP) return (EBADF); unionfs_forward_vop_start(ovp, &lkflags); error = VOP_FSYNC(ovp, ap->a_waitfor, ap->a_td); unionfs_forward_vop_finish(ap->a_vp, ovp, lkflags); return (error); } static int unionfs_remove(struct vop_remove_args *ap) { char *path; struct unionfs_node *dunp; struct unionfs_node *unp; struct unionfs_mount *ump; struct vnode *udvp; struct vnode *uvp; struct vnode *lvp; struct componentname *cnp; struct thread *td; int error; int pathlen; UNIONFS_INTERNAL_DEBUG("unionfs_remove: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); KASSERT_UNIONFS_VNODE(ap->a_vp); error = 0; dunp = VTOUNIONFS(ap->a_dvp); udvp = dunp->un_uppervp; cnp = ap->a_cnp; td = curthread; ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; path = unp->un_path; pathlen = unp->un_pathlen; if (udvp == NULLVP) return (EROFS); if (uvp != NULLVP) { int udvp_lkflags, uvp_lkflags; if (ump == NULL || ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP) cnp->cn_flags |= DOWHITEOUT; unionfs_forward_vop_start_pair(udvp, &udvp_lkflags, uvp, &uvp_lkflags); error = VOP_REMOVE(udvp, uvp, cnp); unionfs_forward_vop_finish_pair(ap->a_dvp, udvp, udvp_lkflags, ap->a_vp, uvp, uvp_lkflags); } else if (lvp != NULLVP) { error = unionfs_mkwhiteout(ap->a_dvp, ap->a_vp, cnp, td, path, pathlen); } UNIONFS_INTERNAL_DEBUG("unionfs_remove: leave (%d)\n", error); return (error); } static int unionfs_link(struct vop_link_args *ap) { struct unionfs_node *dunp; struct unionfs_node *unp; struct vnode *udvp; struct vnode *uvp; struct componentname *cnp; struct thread *td; int error; UNIONFS_INTERNAL_DEBUG("unionfs_link: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_tdvp); KASSERT_UNIONFS_VNODE(ap->a_vp); error = 0; dunp = VTOUNIONFS(ap->a_tdvp); unp = NULL; udvp = dunp->un_uppervp; uvp = NULLVP; cnp = ap->a_cnp; td = curthread; if (udvp == NULLVP) return (EROFS); unp = VTOUNIONFS(ap->a_vp); if (unp->un_uppervp == NULLVP) { if (ap->a_vp->v_type != VREG) return (EOPNOTSUPP); VOP_UNLOCK(ap->a_tdvp); error = unionfs_copyfile(ap->a_vp, 1, cnp->cn_cred, td); vn_lock(ap->a_tdvp, LK_EXCLUSIVE | LK_RETRY); if (error == 0) error = ERELOOKUP; return (error); } uvp = unp->un_uppervp; if (error == 0) { int udvp_lkflags, uvp_lkflags; unionfs_forward_vop_start_pair(udvp, &udvp_lkflags, uvp, &uvp_lkflags); error = VOP_LINK(udvp, uvp, cnp); unionfs_forward_vop_finish_pair(ap->a_tdvp, udvp, udvp_lkflags, ap->a_vp, uvp, uvp_lkflags); } UNIONFS_INTERNAL_DEBUG("unionfs_link: leave (%d)\n", error); return (error); } static int unionfs_rename(struct vop_rename_args *ap) { struct vnode *fdvp; struct vnode *fvp; struct componentname *fcnp; struct vnode *tdvp; struct vnode *tvp; struct componentname *tcnp; struct thread *td; /* rename target vnodes */ struct vnode *rfdvp; struct vnode *rfvp; struct vnode *rtdvp; struct vnode *rtvp; struct unionfs_node *unp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_rename: enter\n"); error = 0; fdvp = ap->a_fdvp; fvp = ap->a_fvp; fcnp = ap->a_fcnp; tdvp = ap->a_tdvp; tvp = ap->a_tvp; tcnp = ap->a_tcnp; td = curthread; rfdvp = fdvp; rfvp = fvp; rtdvp = tdvp; rtvp = tvp; /* check for cross device rename */ if (fvp->v_mount != tdvp->v_mount || (tvp != NULLVP && fvp->v_mount != tvp->v_mount)) { if (fvp->v_op != &unionfs_vnodeops) error = ENODEV; else error = EXDEV; goto unionfs_rename_abort; } /* Renaming a file to itself has no effect. */ if (fvp == tvp) goto unionfs_rename_abort; KASSERT_UNIONFS_VNODE(tdvp); if (tvp != NULLVP) KASSERT_UNIONFS_VNODE(tvp); if (fdvp != tdvp) VI_LOCK(fdvp); unp = VTOUNIONFS(fdvp); if (unp == NULL) { if (fdvp != tdvp) VI_UNLOCK(fdvp); error = ENOENT; goto unionfs_rename_abort; } #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("fdvp=%p, ufdvp=%p, lfdvp=%p\n", fdvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) { error = ENODEV; } else { rfdvp = unp->un_uppervp; vref(rfdvp); } if (fdvp != tdvp) VI_UNLOCK(fdvp); if (error != 0) goto unionfs_rename_abort; VI_LOCK(fvp); unp = VTOUNIONFS(fvp); if (unp == NULL) { VI_UNLOCK(fvp); error = ENOENT; goto unionfs_rename_abort; } #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("fvp=%p, ufvp=%p, lfvp=%p\n", fvp, unp->un_uppervp, unp->un_lowervp); #endif /* * If we only have a lower vnode, copy the source file to the upper * FS so that the rename operation can be issued against the upper FS. */ if (unp->un_uppervp == NULLVP) { bool unlock_fdvp = false, relock_tdvp = false; VI_UNLOCK(fvp); if (tvp != NULLVP) VOP_UNLOCK(tvp); if (fvp->v_type == VREG) { /* * For regular files, unionfs_copyfile() will expect * fdvp's upper parent directory vnode to be unlocked * and will temporarily lock it. If fdvp == tdvp, we * should unlock tdvp to avoid recursion on tdvp's * lock. If fdvp != tdvp, we should also unlock tdvp * to avoid potential deadlock due to holding tdvp's * lock while locking unrelated vnodes associated with * fdvp/fvp. */ VOP_UNLOCK(tdvp); relock_tdvp = true; } else if (fvp->v_type == VDIR && tdvp != fdvp) { /* * For directories, unionfs_mkshadowdir() will expect * fdvp's upper parent directory vnode to be locked * and will temporarily unlock it. If fdvp == tdvp, * we can therefore leave tdvp locked. If fdvp != * tdvp, we should exchange the lock on tdvp for a * lock on fdvp. */ VOP_UNLOCK(tdvp); unlock_fdvp = true; relock_tdvp = true; vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); } vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY); unp = VTOUNIONFS(fvp); if (unp == NULL) error = ENOENT; else if (unp->un_uppervp == NULLVP) { switch (fvp->v_type) { case VREG: error = unionfs_copyfile(fvp, 1, fcnp->cn_cred, td); break; case VDIR: error = unionfs_mkshadowdir(fdvp, fvp, fcnp, td); break; default: error = ENODEV; break; } } VOP_UNLOCK(fvp); if (unlock_fdvp) VOP_UNLOCK(fdvp); if (relock_tdvp) vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); if (tvp != NULLVP) vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); /* * Since we've dropped tdvp's lock at some point in the copy * sequence above, force the caller to re-drive the lookup * in case the relationship between tdvp and tvp has changed. */ if (error == 0) error = ERELOOKUP; goto unionfs_rename_abort; } if (unp->un_lowervp != NULLVP) fcnp->cn_flags |= DOWHITEOUT; rfvp = unp->un_uppervp; vref(rfvp); VI_UNLOCK(fvp); unp = VTOUNIONFS(tdvp); #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("tdvp=%p, utdvp=%p, ltdvp=%p\n", tdvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) { error = ENODEV; goto unionfs_rename_abort; } rtdvp = unp->un_uppervp; vref(rtdvp); if (tvp != NULLVP) { unp = VTOUNIONFS(tvp); if (unp == NULL) { error = ENOENT; goto unionfs_rename_abort; } #ifdef UNIONFS_IDBG_RENAME UNIONFS_INTERNAL_DEBUG("tvp=%p, utvp=%p, ltvp=%p\n", tvp, unp->un_uppervp, unp->un_lowervp); #endif if (unp->un_uppervp == NULLVP) rtvp = NULLVP; else { if (tvp->v_type == VDIR) { error = EINVAL; goto unionfs_rename_abort; } rtvp = unp->un_uppervp; vref(rtvp); } } if (rfvp == rtvp) goto unionfs_rename_abort; error = VOP_RENAME(rfdvp, rfvp, fcnp, rtdvp, rtvp, tcnp); if (error == 0) { if (rtvp != NULLVP && rtvp->v_type == VDIR) cache_purge(tdvp); if (fvp->v_type == VDIR && fdvp != tdvp) cache_purge(fdvp); } if (tdvp != rtdvp) vrele(tdvp); if (tvp != rtvp && tvp != NULLVP) { if (rtvp == NULLVP) vput(tvp); else vrele(tvp); } if (fdvp != rfdvp) vrele(fdvp); if (fvp != rfvp) vrele(fvp); UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error); return (error); unionfs_rename_abort: vput(tdvp); if (tdvp != rtdvp) vrele(rtdvp); if (tvp != NULLVP) { if (tdvp != tvp) vput(tvp); else vrele(tvp); } if (tvp != rtvp && rtvp != NULLVP) vrele(rtvp); if (fdvp != rfdvp) vrele(rfdvp); if (fvp != rfvp) vrele(rfvp); vrele(fdvp); vrele(fvp); UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error); return (error); } static int unionfs_mkdir(struct vop_mkdir_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *dvp; struct vnode *udvp; struct vnode *uvp; struct vattr va; int error; int lkflags; UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); error = EROFS; dvp = ap->a_dvp; dunp = VTOUNIONFS(dvp); cnp = ap->a_cnp; lkflags = cnp->cn_lkflags; udvp = dunp->un_uppervp; if (udvp != NULLVP) { /* check opaque */ if (!(cnp->cn_flags & ISWHITEOUT)) { error = VOP_GETATTR(udvp, &va, cnp->cn_cred); if (error != 0) goto unionfs_mkdir_cleanup; if ((va.va_flags & OPAQUE) != 0) cnp->cn_flags |= ISWHITEOUT; } int udvp_lkflags; bool uvp_created = false; unionfs_forward_vop_start(udvp, &udvp_lkflags); error = VOP_MKDIR(udvp, &uvp, cnp, ap->a_vap); if (error == 0) uvp_created = true; if (__predict_false(unionfs_forward_vop_finish(dvp, udvp, udvp_lkflags)) && error == 0) error = ENOENT; if (error == 0) { VOP_UNLOCK(uvp); cnp->cn_lkflags = LK_EXCLUSIVE; error = unionfs_nodeget(dvp->v_mount, uvp, NULLVP, dvp, ap->a_vpp, cnp); vrele(uvp); cnp->cn_lkflags = lkflags; } else if (uvp_created) vput(uvp); } unionfs_mkdir_cleanup: UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: leave (%d)\n", error); return (error); } static int unionfs_rmdir(struct vop_rmdir_args *ap) { struct unionfs_node *dunp; struct unionfs_node *unp; struct unionfs_mount *ump; struct componentname *cnp; struct thread *td; struct vnode *udvp; struct vnode *uvp; struct vnode *lvp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); KASSERT_UNIONFS_VNODE(ap->a_vp); error = 0; dunp = VTOUNIONFS(ap->a_dvp); unp = VTOUNIONFS(ap->a_vp); cnp = ap->a_cnp; td = curthread; udvp = dunp->un_uppervp; uvp = unp->un_uppervp; lvp = unp->un_lowervp; if (udvp == NULLVP) return (EROFS); if (udvp == uvp) return (EOPNOTSUPP); if (uvp != NULLVP) { if (lvp != NULLVP) { /* * We need to keep dvp and vp's upper vnodes locked * going into the VOP_RMDIR() call, but the empty * directory check also requires the lower vnode lock. * For this third, cross-filesystem lock we use a * similar approach taken by various FS' VOP_RENAME * implementations (which require 2-4 vnode locks). * First we attempt a NOWAIT acquisition, then if * that fails we drops the other two vnode locks, * acquire lvp's lock in the normal fashion to reduce * the likelihood of spinning on it in the future, * then drop, reacquire the other locks, and return * ERELOOKUP to re-drive the lookup in case the dvp-> * vp relationship has changed. */ if (vn_lock(lvp, LK_SHARED | LK_NOWAIT) != 0) { VOP_UNLOCK(ap->a_vp); VOP_UNLOCK(ap->a_dvp); vn_lock(lvp, LK_SHARED | LK_RETRY); VOP_UNLOCK(lvp); vn_lock(ap->a_dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY); return (ERELOOKUP); } error = unionfs_check_rmdir(ap->a_vp, cnp->cn_cred, td); /* * It's possible for a direct operation on the lower FS * to make the lower directory non-empty after we drop * the lock, but it's also possible for the upper-layer * VOP_RMDIR to relock udvp/uvp which would lead to * LOR if we kept lvp locked across that call. */ VOP_UNLOCK(lvp); if (error != 0) return (error); } ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount); if (ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP) cnp->cn_flags |= (DOWHITEOUT | IGNOREWHITEOUT); int udvp_lkflags, uvp_lkflags; unionfs_forward_vop_start_pair(udvp, &udvp_lkflags, uvp, &uvp_lkflags); error = VOP_RMDIR(udvp, uvp, cnp); unionfs_forward_vop_finish_pair(ap->a_dvp, udvp, udvp_lkflags, ap->a_vp, uvp, uvp_lkflags); } else if (lvp != NULLVP) { error = unionfs_mkwhiteout(ap->a_dvp, ap->a_vp, cnp, td, unp->un_path, unp->un_pathlen); } if (error == 0) { cache_purge(ap->a_dvp); cache_purge(ap->a_vp); } UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: leave (%d)\n", error); return (error); } static int unionfs_symlink(struct vop_symlink_args *ap) { struct unionfs_node *dunp; struct componentname *cnp; struct vnode *udvp; struct vnode *uvp; int error; int lkflags; UNIONFS_INTERNAL_DEBUG("unionfs_symlink: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_dvp); error = EROFS; dunp = VTOUNIONFS(ap->a_dvp); cnp = ap->a_cnp; lkflags = cnp->cn_lkflags; udvp = dunp->un_uppervp; if (udvp != NULLVP) { int udvp_lkflags; bool uvp_created = false; unionfs_forward_vop_start(udvp, &udvp_lkflags); error = VOP_SYMLINK(udvp, &uvp, cnp, ap->a_vap, ap->a_target); if (error == 0) uvp_created = true; if (__predict_false(unionfs_forward_vop_finish(ap->a_dvp, udvp, udvp_lkflags)) && error == 0) error = ENOENT; if (error == 0) { VOP_UNLOCK(uvp); cnp->cn_lkflags = LK_EXCLUSIVE; error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP, ap->a_dvp, ap->a_vpp, cnp); vrele(uvp); cnp->cn_lkflags = lkflags; } else if (uvp_created) vput(uvp); } UNIONFS_INTERNAL_DEBUG("unionfs_symlink: leave (%d)\n", error); return (error); } static int unionfs_readdir(struct vop_readdir_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct uio *uio; struct vnode *vp; struct vnode *uvp; struct vnode *lvp; struct thread *td; struct vattr va; uint64_t *cookies_bk; int error; int eofflag; int lkflags; int ncookies_bk; int uio_offset_bk; enum unionfs_lkupgrade lkstatus; UNIONFS_INTERNAL_DEBUG("unionfs_readdir: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); error = 0; eofflag = 0; uio_offset_bk = 0; uio = ap->a_uio; uvp = NULLVP; lvp = NULLVP; td = uio->uio_td; ncookies_bk = 0; cookies_bk = NULL; vp = ap->a_vp; if (vp->v_type != VDIR) return (ENOTDIR); /* * If the vnode is reclaimed while upgrading, we can't safely use unp * or do anything else unionfs- specific. */ lkstatus = unionfs_upgrade_lock(vp); if (lkstatus == UNIONFS_LKUPGRADE_DOOMED) error = EBADF; if (error == 0) { unp = VTOUNIONFS(vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; /* check the open count. unionfs needs open before readdir. */ unionfs_get_node_status(unp, td, &unsp); if ((uvp != NULLVP && unsp->uns_upper_opencnt <= 0) || (lvp != NULLVP && unsp->uns_lower_opencnt <= 0)) { unionfs_tryrem_node_status(unp, unsp); error = EBADF; } } unionfs_downgrade_lock(vp, lkstatus); if (error != 0) goto unionfs_readdir_exit; /* check opaque */ if (uvp != NULLVP && lvp != NULLVP) { if ((error = VOP_GETATTR(uvp, &va, ap->a_cred)) != 0) goto unionfs_readdir_exit; if (va.va_flags & OPAQUE) lvp = NULLVP; } /* upper only */ if (uvp != NULLVP && lvp == NULLVP) { unionfs_forward_vop_start(uvp, &lkflags); error = VOP_READDIR(uvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); if (unionfs_forward_vop_finish(vp, uvp, lkflags)) error = error ? error : ENOENT; else unsp->uns_readdir_status = 0; goto unionfs_readdir_exit; } /* lower only */ if (uvp == NULLVP && lvp != NULLVP) { unionfs_forward_vop_start(lvp, &lkflags); error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); if (unionfs_forward_vop_finish(vp, lvp, lkflags)) error = error ? error : ENOENT; else unsp->uns_readdir_status = 2; goto unionfs_readdir_exit; } /* * readdir upper and lower */ KASSERT(uvp != NULLVP, ("unionfs_readdir: null upper vp")); KASSERT(lvp != NULLVP, ("unionfs_readdir: null lower vp")); if (uio->uio_offset == 0) unsp->uns_readdir_status = 0; if (unsp->uns_readdir_status == 0) { /* read upper */ unionfs_forward_vop_start(uvp, &lkflags); error = VOP_READDIR(uvp, uio, ap->a_cred, &eofflag, ap->a_ncookies, ap->a_cookies); if (unionfs_forward_vop_finish(vp, uvp, lkflags) && error == 0) error = ENOENT; if (error != 0 || eofflag == 0) goto unionfs_readdir_exit; unsp->uns_readdir_status = 1; /* * UFS(and other FS) needs size of uio_resid larger than * DIRBLKSIZ. * size of DIRBLKSIZ equals DEV_BSIZE. * (see: ufs/ufs/ufs_vnops.c ufs_readdir func , ufs/ufs/dir.h) */ if (uio->uio_resid <= (uio->uio_resid & (DEV_BSIZE -1))) goto unionfs_readdir_exit; /* * Backup cookies. * It prepares to readdir in lower. */ if (ap->a_ncookies != NULL) { ncookies_bk = *(ap->a_ncookies); *(ap->a_ncookies) = 0; } if (ap->a_cookies != NULL) { cookies_bk = *(ap->a_cookies); *(ap->a_cookies) = NULL; } } /* initialize for readdir in lower */ if (unsp->uns_readdir_status == 1) { unsp->uns_readdir_status = 2; /* * Backup uio_offset. See the comment after the * VOP_READDIR call on the lower layer. */ uio_offset_bk = uio->uio_offset; uio->uio_offset = 0; } lvp = unionfs_lock_lvp(vp, &lkflags); if (lvp == NULL) { error = ENOENT; goto unionfs_readdir_exit; } /* read lower */ error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); unp = unionfs_unlock_lvp(vp, lvp, lkflags); if (unp == NULL && error == 0) error = ENOENT; /* * We can't return an uio_offset of 0: this would trigger an * infinite loop, because the next call to unionfs_readdir would * always restart with the upper layer (uio_offset == 0) and * always return some data. * * This happens when the lower layer root directory is removed. * (A root directory deleting of unionfs should not be permitted. * But current VFS can not do it.) */ if (uio->uio_offset == 0) uio->uio_offset = uio_offset_bk; if (cookies_bk != NULL) { /* merge cookies */ int size; uint64_t *newcookies, *pos; size = *(ap->a_ncookies) + ncookies_bk; newcookies = (uint64_t *) malloc(size * sizeof(*newcookies), M_TEMP, M_WAITOK); pos = newcookies; memcpy(pos, cookies_bk, ncookies_bk * sizeof(*newcookies)); pos += ncookies_bk; memcpy(pos, *(ap->a_cookies), *(ap->a_ncookies) * sizeof(*newcookies)); free(cookies_bk, M_TEMP); free(*(ap->a_cookies), M_TEMP); *(ap->a_ncookies) = size; *(ap->a_cookies) = newcookies; } unionfs_readdir_exit: if (error != 0 && ap->a_eofflag != NULL) *(ap->a_eofflag) = 1; UNIONFS_INTERNAL_DEBUG("unionfs_readdir: leave (%d)\n", error); return (error); } static int unionfs_readlink(struct vop_readlink_args *ap) { struct unionfs_node *unp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_readlink: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_READLINK(vp, ap->a_uio, ap->a_cred); UNIONFS_INTERNAL_DEBUG("unionfs_readlink: leave (%d)\n", error); return (error); } static int unionfs_getwritemount(struct vop_getwritemount_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *vp, *ovp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: enter\n"); error = 0; vp = ap->a_vp; uvp = NULLVP; VI_LOCK(vp); unp = VTOUNIONFS(vp); if (unp != NULL) uvp = unp->un_uppervp; /* * If our node has no upper vnode, check the parent directory. * We may be initiating a write operation that will produce a * new upper vnode through CoW. */ if (uvp == NULLVP && unp != NULL) { ovp = vp; vp = unp->un_dvp; /* * Only the root vnode should have an empty parent, but it * should not have an empty uppervp, so we shouldn't get here. */ VNASSERT(vp != NULL, ovp, ("%s: NULL parent vnode", __func__)); VI_UNLOCK(ovp); VI_LOCK(vp); unp = VTOUNIONFS(vp); if (unp != NULL) uvp = unp->un_uppervp; if (uvp == NULLVP) error = EACCES; } if (uvp != NULLVP) { vholdnz(uvp); VI_UNLOCK(vp); error = VOP_GETWRITEMOUNT(uvp, ap->a_mpp); vdrop(uvp); } else { VI_UNLOCK(vp); *(ap->a_mpp) = NULL; } UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: leave (%d)\n", error); return (error); } static int unionfs_inactive(struct vop_inactive_args *ap) { ap->a_vp->v_object = NULL; vrecycle(ap->a_vp); return (0); } static int unionfs_reclaim(struct vop_reclaim_args *ap) { /* UNIONFS_INTERNAL_DEBUG("unionfs_reclaim: enter\n"); */ unionfs_noderem(ap->a_vp); /* UNIONFS_INTERNAL_DEBUG("unionfs_reclaim: leave\n"); */ return (0); } static int unionfs_print(struct vop_print_args *ap) { struct unionfs_node *unp; /* struct unionfs_node_status *unsp; */ unp = VTOUNIONFS(ap->a_vp); /* unionfs_get_node_status(unp, curthread, &unsp); */ printf("unionfs_vp=%p, uppervp=%p, lowervp=%p\n", ap->a_vp, unp->un_uppervp, unp->un_lowervp); /* printf("unionfs opencnt: uppervp=%d, lowervp=%d\n", unsp->uns_upper_opencnt, unsp->uns_lower_opencnt); */ if (unp->un_uppervp != NULLVP) vn_printf(unp->un_uppervp, "unionfs: upper "); if (unp->un_lowervp != NULLVP) vn_printf(unp->un_lowervp, "unionfs: lower "); return (0); } static int unionfs_lock(struct vop_lock1_args *ap) { struct unionfs_node *unp; struct vnode *vp; struct vnode *tvp; int error; int flags; bool lvp_locked; error = 0; flags = ap->a_flags; vp = ap->a_vp; if (LK_RELEASE == (flags & LK_TYPE_MASK) || !(flags & LK_TYPE_MASK)) return (VOP_UNLOCK_FLAGS(vp, flags | LK_RELEASE)); unionfs_lock_restart: /* * We currently need the interlock here to ensure we can safely * access the unionfs vnode's private data. We may be able to * eliminate this extra locking by instead using vfs_smr_enter() * and vn_load_v_data_smr() here in conjunction with an SMR UMA * zone for unionfs nodes. */ if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); else flags &= ~LK_INTERLOCK; unp = VTOUNIONFS(vp); if (unp == NULL) { VI_UNLOCK(vp); ap->a_flags = flags; return (vop_stdlock(ap)); } if (unp->un_uppervp != NULL) { tvp = unp->un_uppervp; lvp_locked = false; } else { tvp = unp->un_lowervp; lvp_locked = true; } /* * During unmount, the root vnode lock may be taken recursively, * because it may share the same v_vnlock field as the vnode covered by * the unionfs mount. The covered vnode is locked across VFS_UNMOUNT(), * and the same lock may be taken recursively here during vflush() * issued by unionfs_unmount(). */ if ((flags & LK_TYPE_MASK) == LK_EXCLUSIVE && (vp->v_vflag & VV_ROOT) != 0) flags |= LK_CANRECURSE; vholdnz(tvp); VI_UNLOCK(vp); error = VOP_LOCK(tvp, flags); vdrop(tvp); if (error == 0 && (lvp_locked || VTOUNIONFS(vp) == NULL)) { /* * After dropping the interlock above, there exists a window * in which another thread may acquire the lower vnode lock * and then either doom the unionfs vnode or create an upper * vnode. In either case, we will effectively be holding the * wrong lock, so we must drop the lower vnode lock and * restart the lock operation. * * If unp is not already NULL, we assume that we can safely * access it because we currently hold lvp's lock. * unionfs_noderem() acquires lvp's lock before freeing * the vnode private data, ensuring it can't be concurrently * freed while we are using it here. Likewise, * unionfs_node_update() acquires lvp's lock before installing * an upper vnode. Without those guarantees, we would need to * reacquire the vnode interlock here. * Note that unionfs_noderem() doesn't acquire lvp's lock if * this is the root vnode, but the root vnode should always * have an upper vnode and therefore we should never use its * lower vnode lock here. */ unp = VTOUNIONFS(vp); if (unp == NULL || unp->un_uppervp != NULLVP) { VOP_UNLOCK(tvp); /* * If we previously held the lock, the upgrade may * have temporarily dropped the lock, in which case * concurrent dooming or copy-up will necessitate * acquiring a different lock. Since we never held * the new lock, LK_UPGRADE must be cleared here to * avoid triggering a lockmgr panic. */ if (flags & LK_UPGRADE) flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; VNASSERT((flags & LK_DOWNGRADE) == 0, vp, ("%s: vnode doomed during downgrade", __func__)); goto unionfs_lock_restart; } } return (error); } static int unionfs_unlock(struct vop_unlock_args *ap) { struct vnode *vp; struct vnode *tvp; struct unionfs_node *unp; int error; KASSERT_UNIONFS_VNODE(ap->a_vp); vp = ap->a_vp; unp = VTOUNIONFS(vp); if (unp == NULL) return (vop_stdunlock(ap)); tvp = (unp->un_uppervp != NULL ? unp->un_uppervp : unp->un_lowervp); vholdnz(tvp); error = VOP_UNLOCK(tvp); vdrop(tvp); return (error); } static int unionfs_pathconf(struct vop_pathconf_args *ap) { struct unionfs_node *unp; struct vnode *vp; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); return (VOP_PATHCONF(vp, ap->a_name, ap->a_retval)); } static int unionfs_advlock(struct vop_advlock_args *ap) { struct unionfs_node *unp; struct unionfs_node_status *unsp; struct vnode *vp; struct vnode *uvp; struct thread *td; int error; UNIONFS_INTERNAL_DEBUG("unionfs_advlock: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); vp = ap->a_vp; td = curthread; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; if (uvp == NULLVP) { error = unionfs_copyfile(ap->a_vp, 1, td->td_ucred, td); if (error != 0) goto unionfs_advlock_abort; uvp = unp->un_uppervp; unionfs_get_node_status(unp, td, &unsp); if (unsp->uns_lower_opencnt > 0) { /* try reopen the vnode */ error = VOP_OPEN(uvp, unsp->uns_lower_openmode, td->td_ucred, td, NULL); if (error) goto unionfs_advlock_abort; unsp->uns_upper_opencnt++; VOP_CLOSE(unp->un_lowervp, unsp->uns_lower_openmode, td->td_ucred, td); unsp->uns_lower_opencnt--; } else unionfs_tryrem_node_status(unp, unsp); } VOP_UNLOCK(vp); error = VOP_ADVLOCK(uvp, ap->a_id, ap->a_op, ap->a_fl, ap->a_flags); UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error); return error; unionfs_advlock_abort: VOP_UNLOCK(vp); UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error); return error; } static int unionfs_strategy(struct vop_strategy_args *ap) { struct unionfs_node *unp; struct vnode *vp; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); #ifdef DIAGNOSTIC if (vp == NULLVP) panic("unionfs_strategy: nullvp"); if (ap->a_bp->b_iocmd == BIO_WRITE && vp == unp->un_lowervp) panic("unionfs_strategy: writing to lowervp"); #endif return (VOP_STRATEGY(vp, ap->a_bp)); } static int unionfs_getacl(struct vop_getacl_args *ap) { struct unionfs_node *unp; struct vnode *vp; int error; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); UNIONFS_INTERNAL_DEBUG("unionfs_getacl: enter\n"); error = VOP_GETACL(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_getacl: leave (%d)\n", error); return (error); } static int unionfs_setacl(struct vop_setacl_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; int error; UNIONFS_INTERNAL_DEBUG("unionfs_setacl: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { if ((error = unionfs_copyfile(ap->a_vp, 1, ap->a_cred, td)) != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) { int lkflags; unionfs_forward_vop_start(uvp, &lkflags); error = VOP_SETACL(uvp, ap->a_type, ap->a_aclp, ap->a_cred, td); unionfs_forward_vop_finish(ap->a_vp, uvp, lkflags); } UNIONFS_INTERNAL_DEBUG("unionfs_setacl: leave (%d)\n", error); return (error); } static int unionfs_aclcheck(struct vop_aclcheck_args *ap) { struct unionfs_node *unp; struct vnode *vp; int error; UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); error = VOP_ACLCHECK(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td); UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: leave (%d)\n", error); return (error); } static int unionfs_openextattr(struct vop_openextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; struct vnode *tvp; int error; KASSERT_UNIONFS_VNODE(ap->a_vp); vp = ap->a_vp; unp = VTOUNIONFS(vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); if ((tvp == unp->un_uppervp && (unp->un_flag & UNIONFS_OPENEXTU)) || (tvp == unp->un_lowervp && (unp->un_flag & UNIONFS_OPENEXTL))) return (EBUSY); error = VOP_OPENEXTATTR(tvp, ap->a_cred, ap->a_td); if (error == 0) { if (vn_lock(vp, LK_UPGRADE) != 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (!VN_IS_DOOMED(vp)) { if (tvp == unp->un_uppervp) unp->un_flag |= UNIONFS_OPENEXTU; else unp->un_flag |= UNIONFS_OPENEXTL; } vn_lock(vp, LK_DOWNGRADE | LK_RETRY); } return (error); } static int unionfs_closeextattr(struct vop_closeextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; struct vnode *tvp; int error; KASSERT_UNIONFS_VNODE(ap->a_vp); vp = ap->a_vp; unp = VTOUNIONFS(vp); tvp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) tvp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) tvp = unp->un_lowervp; if (tvp == NULLVP) return (EOPNOTSUPP); error = VOP_CLOSEEXTATTR(tvp, ap->a_commit, ap->a_cred, ap->a_td); if (error == 0) { if (vn_lock(vp, LK_UPGRADE) != 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (!VN_IS_DOOMED(vp)) { if (tvp == unp->un_uppervp) unp->un_flag &= ~UNIONFS_OPENEXTU; else unp->un_flag &= ~UNIONFS_OPENEXTL; } vn_lock(vp, LK_DOWNGRADE | LK_RETRY); } return (error); } static int unionfs_getextattr(struct vop_getextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) vp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) vp = unp->un_lowervp; if (vp == NULLVP) return (EOPNOTSUPP); return (VOP_GETEXTATTR(vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td)); } static int unionfs_setextattr(struct vop_setextattr_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct vnode *ovp; struct ucred *cred; struct thread *td; int error; KASSERT_UNIONFS_VNODE(ap->a_vp); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; ovp = NULLVP; cred = ap->a_cred; td = ap->a_td; UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: enter (un_flag=%x)\n", unp->un_flag); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (unp->un_flag & UNIONFS_OPENEXTU) ovp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) ovp = unp->un_lowervp; if (ovp == NULLVP) return (EOPNOTSUPP); if (ovp == lvp && lvp->v_type == VREG) { VOP_CLOSEEXTATTR(lvp, 0, cred, td); if (uvp == NULLVP && (error = unionfs_copyfile(ap->a_vp, 1, cred, td)) != 0) { unionfs_setextattr_reopen: unp = VTOUNIONFS(ap->a_vp); if (unp != NULL && (unp->un_flag & UNIONFS_OPENEXTL) && VOP_OPENEXTATTR(lvp, cred, td)) { #ifdef DIAGNOSTIC panic("unionfs: VOP_OPENEXTATTR failed"); #endif unp->un_flag &= ~UNIONFS_OPENEXTL; } goto unionfs_setextattr_abort; } uvp = unp->un_uppervp; if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0) goto unionfs_setextattr_reopen; unp->un_flag &= ~UNIONFS_OPENEXTL; unp->un_flag |= UNIONFS_OPENEXTU; ovp = uvp; } if (ovp == uvp) { int lkflags; unionfs_forward_vop_start(ovp, &lkflags); error = VOP_SETEXTATTR(ovp, ap->a_attrnamespace, ap->a_name, ap->a_uio, cred, td); unionfs_forward_vop_finish(ap->a_vp, ovp, lkflags); } unionfs_setextattr_abort: UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: leave (%d)\n", error); return (error); } static int unionfs_listextattr(struct vop_listextattr_args *ap) { struct unionfs_node *unp; struct vnode *vp; KASSERT_UNIONFS_VNODE(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); vp = NULLVP; if (unp->un_flag & UNIONFS_OPENEXTU) vp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) vp = unp->un_lowervp; if (vp == NULLVP) return (EOPNOTSUPP); return (VOP_LISTEXTATTR(vp, ap->a_attrnamespace, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td)); } static int unionfs_deleteextattr(struct vop_deleteextattr_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct vnode *ovp; struct ucred *cred; struct thread *td; int error; KASSERT_UNIONFS_VNODE(ap->a_vp); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; ovp = NULLVP; cred = ap->a_cred; td = ap->a_td; UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: enter (un_flag=%x)\n", unp->un_flag); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (unp->un_flag & UNIONFS_OPENEXTU) ovp = unp->un_uppervp; else if (unp->un_flag & UNIONFS_OPENEXTL) ovp = unp->un_lowervp; if (ovp == NULLVP) return (EOPNOTSUPP); if (ovp == lvp && lvp->v_type == VREG) { VOP_CLOSEEXTATTR(lvp, 0, cred, td); if (uvp == NULLVP && (error = unionfs_copyfile(ap->a_vp, 1, cred, td)) != 0) { unionfs_deleteextattr_reopen: unp = VTOUNIONFS(ap->a_vp); if (unp != NULL && (unp->un_flag & UNIONFS_OPENEXTL) && VOP_OPENEXTATTR(lvp, cred, td)) { #ifdef DIAGNOSTIC panic("unionfs: VOP_OPENEXTATTR failed"); #endif unp->un_flag &= ~UNIONFS_OPENEXTL; } goto unionfs_deleteextattr_abort; } uvp = unp->un_uppervp; if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0) goto unionfs_deleteextattr_reopen; unp->un_flag &= ~UNIONFS_OPENEXTL; unp->un_flag |= UNIONFS_OPENEXTU; ovp = uvp; } if (ovp == uvp) error = VOP_DELETEEXTATTR(ovp, ap->a_attrnamespace, ap->a_name, ap->a_cred, ap->a_td); unionfs_deleteextattr_abort: UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: leave (%d)\n", error); return (error); } static int unionfs_setlabel(struct vop_setlabel_args *ap) { struct unionfs_node *unp; struct vnode *uvp; struct vnode *lvp; struct thread *td; int error; UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: enter\n"); KASSERT_UNIONFS_VNODE(ap->a_vp); error = EROFS; unp = VTOUNIONFS(ap->a_vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; td = ap->a_td; if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (uvp == NULLVP && lvp->v_type == VREG) { if ((error = unionfs_copyfile(ap->a_vp, 1, ap->a_cred, td)) != 0) return (error); uvp = unp->un_uppervp; } if (uvp != NULLVP) error = VOP_SETLABEL(uvp, ap->a_label, ap->a_cred, td); UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: leave (%d)\n", error); return (error); } static int unionfs_vptofh(struct vop_vptofh_args *ap) { return (EOPNOTSUPP); } static int unionfs_add_writecount(struct vop_add_writecount_args *ap) { struct vnode *tvp, *vp; struct unionfs_node *unp; int error, writerefs __diagused; vp = ap->a_vp; unp = VTOUNIONFS(vp); tvp = unp->un_uppervp; KASSERT(tvp != NULL, ("%s: adding write ref without upper vnode", __func__)); error = VOP_ADD_WRITECOUNT(tvp, ap->a_inc); if (error != 0) return (error); /* * We need to track the write refs we've passed to the underlying * vnodes so that we can undo them in case we are forcibly unmounted. */ writerefs = atomic_fetchadd_int(&vp->v_writecount, ap->a_inc); /* text refs are bypassed to lowervp */ VNASSERT(writerefs >= 0, vp, ("%s: invalid write count %d", __func__, writerefs)); VNASSERT(writerefs + ap->a_inc >= 0, vp, ("%s: invalid write count inc %d + %d", __func__, writerefs, ap->a_inc)); return (0); } static int unionfs_vput_pair(struct vop_vput_pair_args *ap) { struct mount *mp; struct vnode *dvp, *vp, **vpp, *lvp, *uvp, *tvp, *tdvp, *tempvp; struct unionfs_node *dunp, *unp; int error, res; dvp = ap->a_dvp; vpp = ap->a_vpp; vp = NULLVP; lvp = NULLVP; uvp = NULLVP; tvp = NULLVP; unp = NULL; dunp = VTOUNIONFS(dvp); if (dunp->un_uppervp != NULL) tdvp = dunp->un_uppervp; else tdvp = dunp->un_lowervp; /* * Underlying vnodes should be locked because the encompassing unionfs * node is locked, but will not be referenced, as the reference will * only be on the unionfs node. Reference them now so that the vput()s * performed by VOP_VPUT_PAIR() will have a reference to drop. */ vref(tdvp); if (vpp != NULL) vp = *vpp; if (vp != NULLVP) { unp = VTOUNIONFS(vp); uvp = unp->un_uppervp; lvp = unp->un_lowervp; if (uvp != NULLVP) tvp = uvp; else tvp = lvp; vref(tvp); /* * If we're being asked to return a locked child vnode, then * we may need to create a replacement vnode in case the * original is reclaimed while the lock is dropped. In that * case we'll need to ensure the mount and the underlying * vnodes aren't also recycled during that window. */ if (!ap->a_unlock_vp) { vhold(vp); if (uvp != NULLVP) vhold(uvp); if (lvp != NULLVP) vhold(lvp); mp = vp->v_mount; vfs_ref(mp); } } ASSERT_VOP_LOCKED(tdvp, __func__); ASSERT_VOP_LOCKED(tvp, __func__); if (tdvp == dunp->un_uppervp && tvp != NULLVP && tvp == lvp) { vput(tvp); vput(tdvp); res = 0; } else { res = VOP_VPUT_PAIR(tdvp, tvp != NULLVP ? &tvp : NULL, true); } ASSERT_VOP_UNLOCKED(tdvp, __func__); ASSERT_VOP_UNLOCKED(tvp, __func__); /* * VOP_VPUT_PAIR() dropped the references we added to the underlying * vnodes, now drop the caller's reference to the unionfs vnodes. */ if (vp != NULLVP && ap->a_unlock_vp) vrele(vp); vrele(dvp); if (vp == NULLVP || ap->a_unlock_vp) return (res); /* * We're being asked to return a locked vnode. At this point, the * underlying vnodes have been unlocked, so vp may have been reclaimed. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_data == NULL && vfs_busy(mp, MBF_NOWAIT) == 0) { vput(vp); error = unionfs_nodeget(mp, uvp, lvp, dvp, &tempvp, NULL); if (error == 0) { vn_lock(tempvp, LK_EXCLUSIVE | LK_RETRY); *vpp = tempvp; } else vget(vp, LK_EXCLUSIVE | LK_RETRY); vfs_unbusy(mp); } if (lvp != NULLVP) vdrop(lvp); if (uvp != NULLVP) vdrop(uvp); vdrop(vp); vfs_rel(mp); return (res); } static int unionfs_set_text(struct vop_set_text_args *ap) { struct vnode *tvp; struct unionfs_node *unp; int error; /* * We assume text refs are managed against lvp/uvp through the * executable mapping backed by its VM object. We therefore don't * need to track leased text refs in the case of a forcible unmount. */ unp = VTOUNIONFS(ap->a_vp); ASSERT_VOP_LOCKED(ap->a_vp, __func__); tvp = unp->un_uppervp != NULL ? unp->un_uppervp : unp->un_lowervp; error = VOP_SET_TEXT(tvp); return (error); } static int unionfs_unset_text(struct vop_unset_text_args *ap) { struct vnode *tvp; struct unionfs_node *unp; ASSERT_VOP_LOCKED(ap->a_vp, __func__); unp = VTOUNIONFS(ap->a_vp); tvp = unp->un_uppervp != NULL ? unp->un_uppervp : unp->un_lowervp; VOP_UNSET_TEXT_CHECKED(tvp); return (0); } static int unionfs_unp_bind(struct vop_unp_bind_args *ap) { struct vnode *tvp; struct unionfs_node *unp; ASSERT_VOP_LOCKED(ap->a_vp, __func__); unp = VTOUNIONFS(ap->a_vp); tvp = unp->un_uppervp != NULL ? unp->un_uppervp : unp->un_lowervp; VOP_UNP_BIND(tvp, ap->a_unpcb); return (0); } static int unionfs_unp_connect(struct vop_unp_connect_args *ap) { struct vnode *tvp; struct unionfs_node *unp; ASSERT_VOP_LOCKED(ap->a_vp, __func__); unp = VTOUNIONFS(ap->a_vp); tvp = unp->un_uppervp != NULL ? unp->un_uppervp : unp->un_lowervp; VOP_UNP_CONNECT(tvp, ap->a_unpcb); return (0); } static int unionfs_unp_detach(struct vop_unp_detach_args *ap) { struct vnode *tvp; struct unionfs_node *unp; tvp = NULL; /* * VOP_UNP_DETACH() is not guaranteed to be called with the unionfs * vnode locked, so we take the interlock to prevent a concurrent * unmount from freeing the unionfs private data. */ VI_LOCK(ap->a_vp); unp = VTOUNIONFS(ap->a_vp); if (unp != NULL) { tvp = unp->un_uppervp != NULL ? unp->un_uppervp : unp->un_lowervp; /* * Hold the target vnode to prevent a concurrent unionfs * unmount from causing it to be recycled once the interlock * is dropped. */ vholdnz(tvp); } VI_UNLOCK(ap->a_vp); if (tvp != NULL) { VOP_UNP_DETACH(tvp); vdrop(tvp); } return (0); } struct vop_vector unionfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = unionfs_access, .vop_aclcheck = unionfs_aclcheck, .vop_advlock = unionfs_advlock, .vop_bmap = VOP_EOPNOTSUPP, .vop_cachedlookup = unionfs_lookup, .vop_close = unionfs_close, .vop_closeextattr = unionfs_closeextattr, .vop_create = unionfs_create, .vop_deleteextattr = unionfs_deleteextattr, .vop_fsync = unionfs_fsync, .vop_getacl = unionfs_getacl, .vop_getattr = unionfs_getattr, .vop_getextattr = unionfs_getextattr, .vop_getwritemount = unionfs_getwritemount, .vop_inactive = unionfs_inactive, .vop_need_inactive = vop_stdneed_inactive, .vop_islocked = vop_stdislocked, .vop_ioctl = unionfs_ioctl, .vop_link = unionfs_link, .vop_listextattr = unionfs_listextattr, .vop_lock1 = unionfs_lock, .vop_lookup = vfs_cache_lookup, .vop_mkdir = unionfs_mkdir, .vop_mknod = unionfs_mknod, .vop_open = unionfs_open, .vop_openextattr = unionfs_openextattr, .vop_pathconf = unionfs_pathconf, .vop_poll = unionfs_poll, .vop_print = unionfs_print, .vop_read = unionfs_read, .vop_readdir = unionfs_readdir, .vop_readlink = unionfs_readlink, .vop_reclaim = unionfs_reclaim, .vop_remove = unionfs_remove, .vop_rename = unionfs_rename, .vop_rmdir = unionfs_rmdir, .vop_setacl = unionfs_setacl, .vop_setattr = unionfs_setattr, .vop_setextattr = unionfs_setextattr, .vop_setlabel = unionfs_setlabel, .vop_strategy = unionfs_strategy, .vop_symlink = unionfs_symlink, .vop_unlock = unionfs_unlock, .vop_whiteout = unionfs_whiteout, .vop_write = unionfs_write, .vop_vptofh = unionfs_vptofh, .vop_add_writecount = unionfs_add_writecount, .vop_vput_pair = unionfs_vput_pair, .vop_set_text = unionfs_set_text, .vop_unset_text = unionfs_unset_text, .vop_unp_bind = unionfs_unp_bind, .vop_unp_connect = unionfs_unp_connect, .vop_unp_detach = unionfs_unp_detach, }; VFS_VOP_VECTOR_REGISTER(unionfs_vnodeops); diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c index d647c7b991f4..6f2760635bad 100644 --- a/sys/kern/uipc_mqueue.c +++ b/sys/kern/uipc_mqueue.c @@ -1,2953 +1,2954 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2005 David Xu * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include "opt_capsicum.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support"); /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; const void *mn_pr_root; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; uint32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time message queue"); static int default_maxmsg = 10; SYSCTL_INT(_kern_mqueue, OID_AUTO, default_maxmsg, CTLFLAG_RD, &default_maxmsg, 0, "Default maximum messages in queue"); static int default_msgsize = 1024; SYSCTL_INT(_kern_mqueue, OID_AUTO, default_msgsize, CTLFLAG_RD, &default_msgsize, 0, "Default maximum message size"); static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static const struct fileops mqueueops; static unsigned mqfs_osd_jail_slot; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); static int mqfs_prison_remove(void *obj, void *data); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); static const struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, }; static const struct filterops mq_wfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return (uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO)); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_add_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_pr_root = cred->cr_prison->pr_root; node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 1; sbp->f_bavail = 0; sbp->f_files = 0; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = mqfs_prison_remove, }; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); mqfs_osd_jail_slot = osd_jail_register(NULL, methods); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); osd_jail_deregister(mqfs_osd_jail_slot); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp); VOP_UNLOCK(vp); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked. */ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); vn_set_state(*vpp, VSTATE_CONSTRUCTED); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred) { struct mqfs_node *pn; const void *pr_root; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); pr_root = cred->cr_prison->pr_root; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { /* Only match names within the same prison root directory */ if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) && strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; - int nameiop, flags, error, namelen; + uint64_t flags; + int nameiop, error, namelen; char *pname; struct thread *td; td = curthread; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen, cnp->cn_cred); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? */ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); uma_zfree(mvdata_zone, vd); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; struct file *a_fp; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if (vap->va_type != VNON || vap->va_nlink != VNOVAL || vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL || vap->va_blocksize != VNOVAL || (vap->va_flags != VNOVAL && vap->va_flags != 0) || vap->va_rdev != VNOVAL || (int)vap->va_bytes != VNOVAL || vap->va_gen != VNOVAL) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if ((ap->a_cred->cr_uid != pn->mn_uid || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != pn->mn_uid && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; uint64_t **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; const void *pr_root; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; pr_root = ap->a_cred->cr_prison->pr_root; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); /* * Only show names within the same prison root directory * (or not associated with a prison, e.g. "." and ".."). */ if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root) continue; if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; entry.d_off = offset + entry.d_reclen; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } dirent_terminate(&entry); if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. */ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * See if this prison root is obsolete, and clean up associated queues if it is. */ static int mqfs_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; struct prison *tpr; struct mqfs_node *pn, *tpn; struct vnode *pr_root; pr_root = pr->pr_root; if (pr->pr_parent->pr_root == pr_root) return (0); TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr != pr && tpr->pr_root == pr_root) return (0); } /* * No jails are rooted in this directory anymore, * so no queues should be either. */ sx_xlock(&mqfs_data.mi_lock); LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, mn_sibling, tpn) { if (pn->mn_pr_root == pr_root) (void)do_unlink(pn, curthread->td_ucred); } sx_xunlock(&mqfs_data.mi_lock); return (0); } /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); seldrain(&mq->mq_rsel); seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send realtime a signal to process which registered itself * successfully by mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct thread *td; struct proc *p; int error; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; error = sigev_findtd(p, &nt->nt_sigev, &td); if (error) { mq->mq_notifier = NULL; return; } if (!KSI_ONQ(&nt->nt_ksi)) { ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev); tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi); } PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char *path, pathbuf[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct pwddesc *pdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); pdp = td->td_proc->p_pd; cmode = ((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT; mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } path = pathbuf; error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name may be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || strchr(path + 1, '/') != NULL) return (EINVAL); if (path[0] == '/') { path++; len--; } /* * "." and ".." are magic directories, populated on the fly, and cannot * be opened as queues. */ if (strcmp(path, ".") == 0 || strcmp(path, "..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(pathbuf); error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path, len, td->td_ucred); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path, len, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(td, fp, fd); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int sys_kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. */ int sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char *path, pathbuf[MQFS_NAMELEN + 1]; struct mqfs_node *pn; int error, len; path = pathbuf; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || strchr(path + 1, '/') != NULL) return (EINVAL); if (path[0] == '/') { path++; len--; } if (strcmp(path, ".") == 0 || strcmp(path, "..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(pathbuf); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path, len, td->td_ucred); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, const cap_rights_t *, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, const cap_rights_t *rightsp, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, rightsp, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_event_rights, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_read_rights, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_write_rights, fget_write, fpp, ppn, pmq); } int kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr, struct mq_attr *oattr) { struct mqueue *mq; struct file *fp; u_int oflag, flag; int error; AUDIT_ARG_FD(mqd); if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0) return (EINVAL); error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); oattr->mq_maxmsg = mq->mq_maxmsg; oattr->mq_msgsize = mq->mq_msgsize; oattr->mq_curmsgs = mq->mq_curmsgs; if (attr != NULL) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr->mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr->mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); return (error); } int sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mq_attr attr, oattr; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error != 0) return (error); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { bzero(oattr.__reserved, sizeof(oattr.__reserved)); error = copyout(&oattr, uap->oattr, sizeof(oattr)); } return (error); } int kern_kmq_timedreceive(struct thread *td, int mqd, char *msg_ptr, size_t msg_len, unsigned int *msg_prio, const struct timespec *abs_timeout) { struct mqueue *mq; struct file *fp; int error, waitok; AUDIT_ARG_FD(mqd); error = getmq_read(td, mqd, &fp, NULL, &mq); if (error != 0) return (error); waitok = (fp->f_flag & O_NONBLOCK) == 0; error = mqueue_receive(mq, msg_ptr, msg_len, msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct timespec *abs_timeout, ets; int error; if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; return (kern_kmq_timedreceive(td, uap->mqd, uap->msg_ptr, uap->msg_len, uap->msg_prio, abs_timeout)); } int kern_kmq_timedsend(struct thread *td, int mqd, const char *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout) { struct mqueue *mq; struct file *fp; int error, waitok; AUDIT_ARG_FD(mqd); error = getmq_write(td, mqd, &fp, NULL, &mq); if (error != 0) return (error); waitok = (fp->f_flag & O_NONBLOCK) == 0; error = mqueue_send(mq, msg_ptr, msg_len, msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct timespec *abs_timeout, ets; int error; if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; return (kern_kmq_timedsend(td, uap->mqd, uap->msg_ptr, uap->msg_len, uap->msg_prio, abs_timeout)); } int kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev) { struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp, *fp2; struct mqueue_notifier *nt, *newnt = NULL; int error; AUDIT_ARG_FD(mqd); if (sigev != NULL) { if (sigev->sigev_notify != SIGEV_SIGNAL && sigev->sigev_notify != SIGEV_THREAD_ID && sigev->sigev_notify != SIGEV_NONE) return (EINVAL); if ((sigev->sigev_notify == SIGEV_SIGNAL || sigev->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(sigev->sigev_signo)) return (EINVAL); } p = td->td_proc; fdp = td->td_proc->p_fd; error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); fp2 = fget_noref(fdp, mqd); if (fp2 == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights(fdp, mqd), &cap_event_rights); if (error) { FILEDESC_SUNLOCK(fdp); goto out; } #endif if (fp2 != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = mqd; notifier_insert(p, nt); } nt->nt_sigev = *sigev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. */ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } int sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev, *evp; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct mqueue *mq; #ifdef INVARIANTS struct filedesc *fdp; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); #endif if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_noref(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); sx_xlock(&mqfs_data.mi_lock); st->st_atim = pn->mn_atime; st->st_mtim = pn->mn_mtime; st->st_ctim = pn->mn_ctime; st->st_birthtim = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN, active_cred); if (error != 0) goto out; pn->mn_mode = mode & ACCESSPERMS; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); if (uid == (uid_t)-1) uid = pn->mn_uid; if (gid == (gid_t)-1) gid = pn->mn_gid; if (((uid != pn->mn_uid && uid != active_cred->cr_uid) || (gid != pn->mn_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; pn->mn_uid = uid; pn->mn_gid = gid; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static int mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_MQUEUE; return (0); } static const struct fileops mqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close, .fo_chmod = mqf_chmod, .fo_chown = mqf_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = mqf_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE, }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; VFS_VOP_VECTOR_REGISTER(mqfs_vnodeops); static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; static struct vfsconf mqueuefs_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "mqueuefs", .vfc_vfsops = &mqfs_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_SYNTHETIC }; static struct syscall_helper_data mq_syscalls[] = { SYSCALL_INIT_HELPER(kmq_open), SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED), SYSCALL_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include static void mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } static void mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } int freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap) { struct mq_attr attr; struct mq_attr32 attr32; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error) return (error); mq_attr_from32(&attr32, &attr); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } int freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap) { struct mq_attr attr, oattr; struct mq_attr32 attr32, oattr32; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error != 0) return (error); mq_attr_from32(&attr32, &attr); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { mq_attr_to32(&oattr, &oattr32); bzero(oattr32.__reserved, sizeof(oattr32.__reserved)); error = copyout(&oattr32, uap->oattr, sizeof(oattr32)); } return (error); } int freebsd32_kmq_timedsend(struct thread *td, struct freebsd32_kmq_timedsend_args *uap) { struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; return (kern_kmq_timedsend(td, uap->mqd, uap->msg_ptr, uap->msg_len, uap->msg_prio, abs_timeout)); } int freebsd32_kmq_timedreceive(struct thread *td, struct freebsd32_kmq_timedreceive_args *uap) { struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; return (kern_kmq_timedreceive(td, uap->mqd, uap->msg_ptr, uap->msg_len, uap->msg_prio, abs_timeout)); } int freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap) { struct sigevent ev, *evp; struct sigevent32 ev32; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev32, sizeof(ev32)); if (error != 0) return (error); error = convert_sigevent32(&ev32, &ev); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static struct syscall_helper_data mq32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_kmq_open), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink), SYSCALL_INIT_LAST }; #endif static int mqinit(void) { int error; error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int mqunload(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(mq32_syscalls); #endif syscall_helper_unregister(mq_syscalls); return (0); } static int mq_modload(struct module *module, int cmd, void *arg) { int error = 0; error = vfs_modevent(module, cmd, arg); if (error != 0) return (error); switch (cmd) { case MOD_LOAD: error = mqinit(); if (error != 0) mqunload(); break; case MOD_UNLOAD: error = mqunload(); break; default: break; } return (error); } static moduledata_t mqueuefs_mod = { "mqueuefs", mq_modload, &mqueuefs_vfsconf }; DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(mqueuefs, 1); diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index af2d790b2e5b..4ab00698b311 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -1,6376 +1,6376 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef INVARIANTS #include #endif #include #include #ifdef DDB #include #endif #include /* * High level overview of name caching in the VFS layer. * * Originally caching was implemented as part of UFS, later extracted to allow * use by other filesystems. A decision was made to make it optional and * completely detached from the rest of the kernel, which comes with limitations * outlined near the end of this comment block. * * This fundamental choice needs to be revisited. In the meantime, the current * state is described below. Significance of all notable routines is explained * in comments placed above their implementation. Scattered thoroughout the * file are TODO comments indicating shortcomings which can be fixed without * reworking everything (most of the fixes will likely be reusable). Various * details are omitted from this explanation to not clutter the overview, they * have to be checked by reading the code and associated commentary. * * Keep in mind that it's individual path components which are cached, not full * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries, * one for each name. * * I. Data organization * * Entries are described by "struct namecache" objects and stored in a hash * table. See cache_get_hash for more information. * * "struct vnode" contains pointers to source entries (names which can be found * when traversing through said vnode), destination entries (names of that * vnode (see "Limitations" for a breakdown on the subject) and a pointer to * the parent vnode. * * The (directory vnode; name) tuple reliably determines the target entry if * it exists. * * Since there are no small locks at this time (all are 32 bytes in size on * LP64), the code works around the problem by introducing lock arrays to * protect hash buckets and vnode lists. * * II. Filesystem integration * * Filesystems participating in name caching do the following: * - set vop_lookup routine to vfs_cache_lookup * - set vop_cachedlookup to whatever can perform the lookup if the above fails * - if they support lockless lookup (see below), vop_fplookup_vexec and * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the * mount point * - call cache_purge or cache_vop_* routines to eliminate stale entries as * applicable * - call cache_enter to add entries depending on the MAKEENTRY flag * * With the above in mind, there are 2 entry points when doing lookups: * - ... -> namei -> cache_fplookup -- this is the default * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei * should the above fail * * Example code flow how an entry is added: * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP -> * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter * * III. Performance considerations * * For lockless case forward lookup avoids any writes to shared areas apart * from the terminal path component. In other words non-modifying lookups of * different files don't suffer any scalability problems in the namecache. * Looking up the same file is limited by VFS and goes beyond the scope of this * file. * * At least on amd64 the single-threaded bottleneck for long paths is hashing * (see cache_get_hash). There are cases where the code issues acquire fence * multiple times, they can be combined on architectures which suffer from it. * * For locked case each encountered vnode has to be referenced and locked in * order to be handed out to the caller (normally that's namei). This * introduces significant hit single-threaded and serialization multi-threaded. * * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached -- * avoids any writes to shared areas to any components. * * Unrelated insertions are partially serialized on updating the global entry * counter and possibly serialized on colliding bucket or vnode locks. * * IV. Observability * * Note not everything has an explicit dtrace probe nor it should have, thus * some of the one-liners below depend on implementation details. * * Examples: * * # Check what lookups failed to be handled in a lockless manner. Column 1 is * # line number, column 2 is status code (see cache_fpl_status) * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }' * * # Lengths of names added by binary name * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }' * * # Same as above but only those which exceed 64 characters * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }' * * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what * # path is it * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }' * * V. Limitations and implementation defects * * - since it is possible there is no entry for an open file, tools like * "procstat" may fail to resolve fd -> vnode -> path to anything * - even if a filesystem adds an entry, it may get purged (e.g., due to memory * shortage) in which case the above problem applies * - hardlinks are not tracked, thus if a vnode is reachable in more than one * way, resolving a name may return a different path than the one used to * open it (even if said path is still valid) * - by default entries are not added for newly created files * - adding an entry may need to evict negative entry first, which happens in 2 * distinct places (evicting on lookup, adding in a later VOP) making it * impossible to simply reuse it * - there is a simple scheme to evict negative entries as the cache is approaching * its capacity, but it is very unclear if doing so is a good idea to begin with * - vnodes are subject to being recycled even if target inode is left in memory, * which loses the name cache entries when it perhaps should not. in case of tmpfs * names get duplicated -- kept by filesystem itself and namecache separately * - struct namecache has a fixed size and comes in 2 variants, often wasting * space. now hard to replace with malloc due to dependence on SMR, which * requires UMA zones to opt in * - lack of better integration with the kernel also turns nullfs into a layered * filesystem instead of something which can take advantage of caching * * Appendix A: where is the time lost, expanding on paragraph III * * While some care went into optimizing lookups, there is still plenty of * performance left on the table, most notably from single-threaded standpoint. * Below is a woefully incomplete list of changes which can help. Ideas are * mostly sketched out, no claim is made all kinks or prerequisites are laid * out. * * Note there is performance lost all over VFS. * * === SMR-only lookup * * For commonly used ops like stat(2), when the terminal vnode *is* cached, * lockless lookup could refrain from refing/locking the found vnode and * instead return while within the SMR section. Then a call to, say, * vop_stat_smr could do the work (or fail with EAGAIN), finally the result * would be validated with seqc not changing. This would be faster * single-threaded as it dodges atomics and would provide full scalability for * multicore uses. This would *not* work for open(2) or other calls which need * the vnode to hang around for the long haul, but would work for aforementioned * stat(2) but also access(2), readlink(2), realpathat(2) and probably more. * * === hotpatching for sdt probes * * They result in *tons* of branches all over with rather regrettable codegen * at times. Removing sdt probes altogether gives over 2% boost in lookup rate. * Reworking the code to patch itself at runtime with asm goto would solve it. * asm goto is fully supported by gcc and clang. * * === copyinstr * * On all architectures it operates one byte at a time, while it could be * word-sized instead thanks to the Mycroft trick. * * API itself is rather pessimal for path lookup, accepting arbitrary sizes and * *optionally* filling in the length parameter. * * Instead a new routine (copyinpath?) could be introduced, demanding a buffer * size which is a multiply of the word (and never zero), with the length * always returned. On top of it the routine could be allowed to transform the * buffer in arbitrary ways, most notably writing past the found length (not to * be confused with writing past buffer size) -- this would allow word-sized * movs while checking for '\0' later. * * === detour through namei * * Currently one suffers being called from namei, which then has to check if * things worked out locklessly. Instead the lockless lookup could be the * actual entry point which calls what is currently namei as a fallback. * * === avoidable branches in cache_can_fplookup * * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if * this is off, none of fplookup code should execute). * * Both audit and capsicum branches can be combined into one, but it requires * paying off a lot of tech debt first. * * ni_startdir could be indicated with a flag in cn_flags, eliminating the * branch. * * === mount stacks * * Crossing a mount requires checking if perhaps something is mounted on top. * Instead, an additional entry could be added to struct mount with a pointer * to the final mount on the stack. This would be recalculated on each * mount/unmount. * * === root vnodes * * It could become part of the API contract to *always* have a rootvnode set in * mnt_rootvnode. Such vnodes are annotated with VV_ROOT and vnlru would have * to be modified to always skip them. * * === inactive on v_usecount reaching 0 * * VOP_NEED_INACTIVE should not exist. Filesystems would indicate need for such * processing with a bit in usecount. * * === v_holdcnt * * Hold count should probably get eliminated, but one can argue it is a useful * feature. Even if so, handling of v_usecount could be decoupled from it -- * vnlru et al would consider the vnode not-freeable if has either hold or * usecount on it. * * This would eliminate 2 atomics. */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache"); SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", "const char *"); SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", "struct namecache *", "int", "int"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", "struct componentname *"); SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", "struct componentname *"); SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t"); SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int"); SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t"); SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); SDT_PROBE_DECLARE(vfs, namei, lookup, entry); SDT_PROBE_DECLARE(vfs, namei, lookup, return); static char __read_frequently cache_fast_lookup_enabled = true; /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct negstate { u_char neg_flag; u_char neg_hit; }; _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), "the state must fit in a union with a pointer without growing it"); struct namecache { LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ struct vnode *nc_dvp; /* vnode of parent of name */ union { struct vnode *nu_vp; /* vnode the name refers to */ struct negstate nu_neg;/* negative entry state */ } n_un; u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[]; /* segment name + nul */ }; /* * struct namecache_ts repeats struct namecache layout up to the * nc_nlen member. * struct namecache_ts is used in place of struct namecache when time(s) need * to be stored. The nc_dotdottime field is used when a cache entry is mapping * both a non-dotdot directory name plus dotdot for the directory's * parent. * * See below for alignment requirement. */ struct namecache_ts { struct timespec nc_time; /* timespec provided by fs */ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ int nc_ticks; /* ticks value when entry was added */ int nc_pad; struct namecache nc_nc; }; TAILQ_HEAD(cache_freebatch, namecache); /* * At least mips n32 performs 64-bit accesses to timespec as found * in namecache_ts and requires them to be aligned. Since others * may be in the same spot suffer a little bit and enforce the * alignment for everyone. Note this is a nop for 64-bit platforms. */ #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) /* * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the * 4.4 BSD codebase. Later on struct namecache was tweaked to become * smaller and the value was bumped to retain the total size, but it * was never re-evaluated for suitability. A simple test counting * lengths during package building shows that the value of 45 covers * about 86% of all added entries, reaching 99% at 65. * * Regardless of the above, use of dedicated zones instead of malloc may be * inducing additional waste. This may be hard to address as said zones are * tied to VFS SMR. Even if retaining them, the current split should be * re-evaluated. */ #ifdef __LP64__ #define CACHE_PATH_CUTOFF 45 #define CACHE_LARGE_PAD 6 #else #define CACHE_PATH_CUTOFF 41 #define CACHE_LARGE_PAD 2 #endif #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1) #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE) #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD) #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE) _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); #define nc_vp n_un.nu_vp #define nc_neg n_un.nu_neg /* * Flags in namecache.nc_flag */ #define NCF_WHITE 0x01 #define NCF_ISDOTDOT 0x02 #define NCF_TS 0x04 #define NCF_DTS 0x08 #define NCF_DVDROP 0x10 #define NCF_NEGATIVE 0x20 #define NCF_INVALID 0x40 #define NCF_WIP 0x80 /* * Flags in negstate.neg_flag */ #define NEG_HOT 0x01 static bool cache_neg_evict_cond(u_long lnumcache); /* * Mark an entry as invalid. * * This is called before it starts getting deconstructed. */ static void cache_ncp_invalidate(struct namecache *ncp) { KASSERT((ncp->nc_flag & NCF_INVALID) == 0, ("%s: entry %p already invalid", __func__, ncp)); atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); atomic_thread_fence_rel(); } /* * Does this entry match the given directory and name? */ static bool cache_ncp_match(struct namecache *ncp, struct vnode *dvp, struct componentname *cnp) { return (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && bcmp(ncp->nc_name, cnp->cn_nameptr, cnp->cn_namelen) == 0); } /* * Check whether the entry can be safely used. * * All places which elide locks are supposed to call this after they are * done with reading from an entry. */ #define cache_ncp_canuse(ncp) ({ \ struct namecache *_ncp = (ncp); \ u_char _nc_flag; \ \ atomic_thread_fence_acq(); \ _nc_flag = atomic_load_char(&_ncp->nc_flag); \ __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \ }) /* * Like the above but also checks NCF_WHITE. */ #define cache_fpl_neg_ncp_canuse(ncp) ({ \ struct namecache *_ncp = (ncp); \ u_char _nc_flag; \ \ atomic_thread_fence_acq(); \ _nc_flag = atomic_load_char(&_ncp->nc_flag); \ __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \ }) VFS_SMR_DECLARE; static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache parameters"); static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0, "Total namecache capacity"); u_int ncsizefactor = 2; SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0, "Size factor for namecache"); static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0, "Ratio of negative namecache entries"); /* * Negative entry % of namecache capacity above which automatic eviction is allowed. * * Check cache_neg_evict_cond for details. */ static u_int ncnegminpct = 3; static u_int __read_mostly neg_min; /* the above recomputed against ncsize */ SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0, "Negative entry count above which automatic eviction is allowed"); /* * Structures associated with name caching. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ static u_long __read_mostly nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ struct nchstats nchstats; /* cache effectiveness statistics */ static u_int __exclusive_cache_line neg_cycle; #define ncneghash 3 #define numneglists (ncneghash + 1) struct neglist { struct mtx nl_evict_lock; struct mtx nl_lock __aligned(CACHE_LINE_SIZE); TAILQ_HEAD(, namecache) nl_list; TAILQ_HEAD(, namecache) nl_hotlist; u_long nl_hotnum; } __aligned(CACHE_LINE_SIZE); static struct neglist neglists[numneglists]; static inline struct neglist * NCP2NEGLIST(struct namecache *ncp) { return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); } static inline struct negstate * NCP2NEGSTATE(struct namecache *ncp) { MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE); return (&ncp->nc_neg); } #define numbucketlocks (ncbuckethash + 1) static u_int __read_mostly ncbuckethash; static struct mtx_padalign __read_mostly *bucketlocks; #define HASH2BUCKETLOCK(hash) \ ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) #define numvnodelocks (ncvnodehash + 1) static u_int __read_mostly ncvnodehash; static struct mtx __read_mostly *vnodelocks; static inline struct mtx * VP2VNODELOCK(struct vnode *vp) { return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); } /* * Search the hash table for a namecache entry. Either the corresponding bucket * must be locked, or the caller must be in an SMR read section. */ static struct namecache * cache_ncp_find(struct vnode *dvp, struct componentname *cnp, uint32_t hash) { struct namecache *ncp; KASSERT(mtx_owned(HASH2BUCKETLOCK(hash)) || VFS_SMR_ENTERED(), ("%s: hash %u not locked", __func__, hash)); CK_SLIST_FOREACH(ncp, NCHHASH(hash), nc_hash) { if (cache_ncp_match(ncp, dvp, cnp)) break; } return (ncp); } static void cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; KASSERT((ncp->nc_flag & NCF_TS) != 0 || (tsp == NULL && ticksp == NULL), ("No NCF_TS")); if (tsp == NULL) return; ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); *tsp = ncp_ts->nc_time; *ticksp = ncp_ts->nc_ticks; } #ifdef DEBUG_CACHE static int __read_mostly doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "VFS namecache enabled"); #endif /* Export size information to userland */ SYSCTL_SIZEOF_STRUCT(namecache); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache statistics"); #define STATNODE_ULONG(name, varname, descr) \ SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); #define STATNODE_COUNTER(name, varname, descr) \ static COUNTER_U64_DEFINE_EARLY(varname); \ SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ descr); STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); STATNODE_ULONG(count, numcache, "Number of cache entries"); STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held"); STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); STATNODE_COUNTER(poszaps, numposzaps, "Number of cache hits (positive) we do not want to cache"); STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); STATNODE_COUNTER(negzaps, numnegzaps, "Number of cache hits (negative) we do not want to cache"); STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); /* These count for vn_getcwd(), too. */ STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); STATNODE_COUNTER(fullpathfail2, numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache"); /* * Debug or developer statistics. */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache debugging"); #define DEBUGNODE_ULONG(name, varname, descr) \ SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); static u_long zap_bucket_relock_success; DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success, "Number of successful removals after relocking"); static u_long zap_bucket_fail; DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); static u_long zap_bucket_fail2; DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); static u_long cache_lock_vnodes_cel_3_failures; DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, "Number of times 3-way vnode locking failed"); static void cache_zap_locked(struct namecache *ncp); static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen, size_t addend); static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen); static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, size_t addend); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); static inline void cache_assert_vlp_locked(struct mtx *vlp) { if (vlp != NULL) mtx_assert(vlp, MA_OWNED); } static inline void cache_assert_vnode_locked(struct vnode *vp) { struct mtx *vlp; vlp = VP2VNODELOCK(vp); cache_assert_vlp_locked(vlp); } /* * Directory vnodes with entries are held for two reasons: * 1. make them less of a target for reclamation in vnlru * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided * * It will be feasible to stop doing it altogether if all filesystems start * supporting lockless lookup. */ static void cache_hold_vnode(struct vnode *vp) { cache_assert_vnode_locked(vp); VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); vhold(vp); counter_u64_add(numcachehv, 1); } static void cache_drop_vnode(struct vnode *vp) { /* * Called after all locks are dropped, meaning we can't assert * on the state of v_cache_src. */ vdrop(vp); counter_u64_add(numcachehv, -1); } /* * UMA zones. */ static uma_zone_t __read_mostly cache_zone_small; static uma_zone_t __read_mostly cache_zone_small_ts; static uma_zone_t __read_mostly cache_zone_large; static uma_zone_t __read_mostly cache_zone_large_ts; char * cache_symlink_alloc(size_t size, int flags) { if (size < CACHE_ZONE_SMALL_SIZE) { return (uma_zalloc_smr(cache_zone_small, flags)); } if (size < CACHE_ZONE_LARGE_SIZE) { return (uma_zalloc_smr(cache_zone_large, flags)); } counter_u64_add(symlinktoobig, 1); SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size); return (NULL); } void cache_symlink_free(char *string, size_t size) { MPASS(string != NULL); KASSERT(size < CACHE_ZONE_LARGE_SIZE, ("%s: size %zu too big", __func__, size)); if (size < CACHE_ZONE_SMALL_SIZE) { uma_zfree_smr(cache_zone_small, string); return; } if (size < CACHE_ZONE_LARGE_SIZE) { uma_zfree_smr(cache_zone_large, string); return; } __assert_unreachable(); } static struct namecache * cache_alloc_uma(int len, bool ts) { struct namecache_ts *ncp_ts; struct namecache *ncp; if (__predict_false(ts)) { if (len <= CACHE_PATH_CUTOFF) ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); else ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); ncp = &ncp_ts->nc_nc; } else { if (len <= CACHE_PATH_CUTOFF) ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); else ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); } return (ncp); } static void cache_free_uma(struct namecache *ncp) { struct namecache_ts *ncp_ts; if (__predict_false(ncp->nc_flag & NCF_TS)) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small_ts, ncp_ts); else uma_zfree_smr(cache_zone_large_ts, ncp_ts); } else { if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small, ncp); else uma_zfree_smr(cache_zone_large, ncp); } } static struct namecache * cache_alloc(int len, bool ts) { u_long lnumcache; /* * Avoid blowout in namecache entries. * * Bugs: * 1. filesystems may end up trying to add an already existing entry * (for example this can happen after a cache miss during concurrent * lookup), in which case we will call cache_neg_evict despite not * adding anything. * 2. the routine may fail to free anything and no provisions are made * to make it try harder (see the inside for failure modes) * 3. it only ever looks at negative entries. */ lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; if (cache_neg_evict_cond(lnumcache)) { lnumcache = atomic_load_long(&numcache); } if (__predict_false(lnumcache >= ncsize)) { atomic_subtract_long(&numcache, 1); counter_u64_add(numdrops, 1); return (NULL); } return (cache_alloc_uma(len, ts)); } static void cache_free(struct namecache *ncp) { MPASS(ncp != NULL); if ((ncp->nc_flag & NCF_DVDROP) != 0) { cache_drop_vnode(ncp->nc_dvp); } cache_free_uma(ncp); atomic_subtract_long(&numcache, 1); } static void cache_free_batch(struct cache_freebatch *batch) { struct namecache *ncp, *nnp; int i; i = 0; if (TAILQ_EMPTY(batch)) goto out; TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) { if ((ncp->nc_flag & NCF_DVDROP) != 0) { cache_drop_vnode(ncp->nc_dvp); } cache_free_uma(ncp); i++; } atomic_subtract_long(&numcache, i); out: SDT_PROBE1(vfs, namecache, purge, batch, i); } /* * Hashing. * * The code was made to use FNV in 2001 and this choice needs to be revisited. * * Short summary of the difficulty: * The longest name which can be inserted is NAME_MAX characters in length (or * 255 at the time of writing this comment), while majority of names used in * practice are significantly shorter (mostly below 10). More importantly * majority of lookups performed find names are even shorter than that. * * This poses a problem where hashes which do better than FNV past word size * (or so) tend to come with additional overhead when finalizing the result, * making them noticeably slower for the most commonly used range. * * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c * * When looking it up the most time consuming part by a large margin (at least * on amd64) is hashing. Replacing FNV with something which pessimizes short * input would make the slowest part stand out even more. */ /* * TODO: With the value stored we can do better than computing the hash based * on the address. */ static void cache_prehash(struct vnode *vp) { vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); } static uint32_t cache_get_hash(char *name, u_char len, struct vnode *dvp) { return (fnv_32_buf(name, len, dvp->v_nchash)); } static uint32_t cache_get_hash_iter_start(struct vnode *dvp) { return (dvp->v_nchash); } static uint32_t cache_get_hash_iter(char c, uint32_t hash) { return (fnv_32_buf(&c, 1, hash)); } static uint32_t cache_get_hash_iter_finish(uint32_t hash) { return (hash); } static inline struct nchashhead * NCP2BUCKET(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (NCHHASH(hash)); } static inline struct mtx * NCP2BUCKETLOCK(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (HASH2BUCKETLOCK(hash)); } #ifdef INVARIANTS static void cache_assert_bucket_locked(struct namecache *ncp) { struct mtx *blp; blp = NCP2BUCKETLOCK(ncp); mtx_assert(blp, MA_OWNED); } static void cache_assert_bucket_unlocked(struct namecache *ncp) { struct mtx *blp; blp = NCP2BUCKETLOCK(ncp); mtx_assert(blp, MA_NOTOWNED); } #else #define cache_assert_bucket_locked(x) do { } while (0) #define cache_assert_bucket_unlocked(x) do { } while (0) #endif #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) static void _cache_sort_vnodes(void **p1, void **p2) { void *tmp; MPASS(*p1 != NULL || *p2 != NULL); if (*p1 > *p2) { tmp = *p2; *p2 = *p1; *p1 = tmp; } } static void cache_lock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) mtx_lock(&bucketlocks[i]); } static void cache_unlock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) mtx_unlock(&bucketlocks[i]); } static void cache_lock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_lock(&vnodelocks[i]); } static void cache_unlock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_unlock(&vnodelocks[i]); } static int cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { if (!mtx_trylock(vlp1)) return (EAGAIN); } if (!mtx_trylock(vlp2)) { if (vlp1 != NULL) mtx_unlock(vlp1); return (EAGAIN); } return (0); } static void cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); MPASS(vlp1 <= vlp2); if (vlp1 != NULL) mtx_lock(vlp1); if (vlp2 != NULL) mtx_lock(vlp2); } static void cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); if (vlp1 != NULL) mtx_unlock(vlp1); if (vlp2 != NULL) mtx_unlock(vlp2); } static int sysctl_nchstats(SYSCTL_HANDLER_ARGS) { struct nchstats snap; if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, sizeof(snap))); snap = nchstats; snap.ncs_goodhits = counter_u64_fetch(numposhits); snap.ncs_neghits = counter_u64_fetch(numneghits); snap.ncs_badhits = counter_u64_fetch(numposzaps) + counter_u64_fetch(numnegzaps); snap.ncs_miss = counter_u64_fetch(nummisszap) + counter_u64_fetch(nummiss); return (SYSCTL_OUT(req, &snap, sizeof(snap))); } SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", "VFS cache effectiveness statistics"); static int sysctl_hitpct(SYSCTL_HANDLER_ARGS) { long poshits, neghits, miss, total; long pct; poshits = counter_u64_fetch(numposhits); neghits = counter_u64_fetch(numneghits); miss = counter_u64_fetch(nummiss); total = poshits + neghits + miss; pct = 0; if (total != 0) pct = ((poshits + neghits) * 100) / total; return (sysctl_handle_int(oidp, 0, pct, req)); } SYSCTL_PROC(_vfs_cache_stats, OID_AUTO, hitpct, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_hitpct, "I", "Percentage of hits"); static void cache_recalc_neg_min(void) { neg_min = (ncsize * ncnegminpct) / 100; } static int sysctl_negminpct(SYSCTL_HANDLER_ARGS) { u_int val; int error; val = ncnegminpct; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == ncnegminpct) return (0); if (val < 0 || val > 99) return (EINVAL); ncnegminpct = val; cache_recalc_neg_min(); return (0); } SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed"); #ifdef DEBUG_CACHE /* * Grab an atomic snapshot of the name cache hash chain lengths */ static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { struct nchashhead *ncpp; struct namecache *ncp; int i, error, n_nchash, *cntbuf; retry: n_nchash = nchash + 1; /* nchash is max index, not count */ if (req->oldptr == NULL) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); cache_lock_all_buckets(); if (n_nchash != nchash + 1) { cache_unlock_all_buckets(); free(cntbuf, M_TEMP); goto retry; } /* Scan hash tables counting entries */ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) CK_SLIST_FOREACH(ncp, ncpp, nc_hash) cntbuf[i]++; cache_unlock_all_buckets(); for (error = 0, i = 0; i < n_nchash; i++) if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) break; free(cntbuf, M_TEMP); return (error); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); cache_lock_all_buckets(); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; cache_unlock_all_buckets(); pct = (used * 100) / (n_nchash / 100); error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); #endif /* * Negative entries management * * Various workloads create plenty of negative entries and barely use them * afterwards. Moreover malicious users can keep performing bogus lookups * adding even more entries. For example "make tinderbox" as of writing this * comment ends up with 2.6M namecache entries in total, 1.2M of which are * negative. * * As such, a rather aggressive eviction method is needed. The currently * employed method is a placeholder. * * Entries are split over numneglists separate lists, each of which is further * split into hot and cold entries. Entries get promoted after getting a hit. * Eviction happens on addition of new entry. */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache negative entry statistics"); SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, "Number of negative cache entries"); static COUNTER_U64_DEFINE_EARLY(neg_created); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, "Number of created negative entries"); static COUNTER_U64_DEFINE_EARLY(neg_evicted); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, "Number of evicted negative entries"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, &neg_evict_skipped_empty, "Number of times evicting failed due to lack of entries"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, &neg_evict_skipped_missed, "Number of times evicting failed due to target entry disappearing"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, &neg_evict_skipped_contended, "Number of times evicting failed due to contention"); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, "Number of cache hits (negative)"); static int sysctl_neg_hot(SYSCTL_HANDLER_ARGS) { int i, out; out = 0; for (i = 0; i < numneglists; i++) out += neglists[i].nl_hotnum; return (SYSCTL_OUT(req, &out, sizeof(out))); } SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", "Number of hot negative entries"); static void cache_neg_init(struct namecache *ncp) { struct negstate *ns; ncp->nc_flag |= NCF_NEGATIVE; ns = NCP2NEGSTATE(ncp); ns->neg_flag = 0; ns->neg_hit = 0; counter_u64_add(neg_created, 1); } #define CACHE_NEG_PROMOTION_THRESH 2 static bool cache_neg_hit_prep(struct namecache *ncp) { struct negstate *ns; u_char n; ns = NCP2NEGSTATE(ncp); n = atomic_load_char(&ns->neg_hit); for (;;) { if (n >= CACHE_NEG_PROMOTION_THRESH) return (false); if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) break; } return (n + 1 == CACHE_NEG_PROMOTION_THRESH); } /* * Nothing to do here but it is provided for completeness as some * cache_neg_hit_prep callers may end up returning without even * trying to promote. */ #define cache_neg_hit_abort(ncp) do { } while (0) static void cache_neg_hit_finish(struct namecache *ncp) { SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); counter_u64_add(numneghits, 1); } /* * Move a negative entry to the hot list. */ static void cache_neg_promote_locked(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; ns = NCP2NEGSTATE(ncp); nl = NCP2NEGLIST(ncp); mtx_assert(&nl->nl_lock, MA_OWNED); if ((ns->neg_flag & NEG_HOT) == 0) { TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); nl->nl_hotnum++; ns->neg_flag |= NEG_HOT; } } /* * Move a hot negative entry to the cold list. */ static void cache_neg_demote_locked(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; ns = NCP2NEGSTATE(ncp); nl = NCP2NEGLIST(ncp); mtx_assert(&nl->nl_lock, MA_OWNED); MPASS(ns->neg_flag & NEG_HOT); TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); nl->nl_hotnum--; ns->neg_flag &= ~NEG_HOT; atomic_store_char(&ns->neg_hit, 0); } /* * Move a negative entry to the hot list if it matches the lookup. * * We have to take locks, but they may be contended and in the worst * case we may need to go off CPU. We don't want to spin within the * smr section and we can't block with it. Exiting the section means * the found entry could have been evicted. We are going to look it * up again. */ static bool cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, struct namecache *oncp, uint32_t hash) { struct namecache *ncp; struct neglist *nl; u_char nc_flag; nl = NCP2NEGLIST(oncp); mtx_lock(&nl->nl_lock); /* * For hash iteration. */ vfs_smr_enter(); /* * Avoid all surprises by only succeeding if we got the same entry and * bailing completely otherwise. * XXX There are no provisions to keep the vnode around, meaning we may * end up promoting a negative entry for a *new* vnode and returning * ENOENT on its account. This is the error we want to return anyway * and promotion is harmless. * * In particular at this point there can be a new ncp which matches the * search but hashes to a different neglist. */ CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp == oncp) break; } /* * No match to begin with. */ if (__predict_false(ncp == NULL)) { goto out_abort; } /* * The newly found entry may be something different... */ if (!cache_ncp_match(ncp, dvp, cnp)) { goto out_abort; } /* * ... and not even negative. */ nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) == 0) { goto out_abort; } if (!cache_ncp_canuse(ncp)) { goto out_abort; } cache_neg_promote_locked(ncp); cache_neg_hit_finish(ncp); vfs_smr_exit(); mtx_unlock(&nl->nl_lock); return (true); out_abort: vfs_smr_exit(); mtx_unlock(&nl->nl_lock); return (false); } static void cache_neg_promote(struct namecache *ncp) { struct neglist *nl; nl = NCP2NEGLIST(ncp); mtx_lock(&nl->nl_lock); cache_neg_promote_locked(ncp); mtx_unlock(&nl->nl_lock); } static void cache_neg_insert(struct namecache *ncp) { struct neglist *nl; MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_bucket_locked(ncp); nl = NCP2NEGLIST(ncp); mtx_lock(&nl->nl_lock); TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); mtx_unlock(&nl->nl_lock); atomic_add_long(&numneg, 1); } static void cache_neg_remove(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; cache_assert_bucket_locked(ncp); nl = NCP2NEGLIST(ncp); ns = NCP2NEGSTATE(ncp); mtx_lock(&nl->nl_lock); if ((ns->neg_flag & NEG_HOT) != 0) { TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); nl->nl_hotnum--; } else { TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); } mtx_unlock(&nl->nl_lock); atomic_subtract_long(&numneg, 1); } static struct neglist * cache_neg_evict_select_list(void) { struct neglist *nl; u_int c; c = atomic_fetchadd_int(&neg_cycle, 1) + 1; nl = &neglists[c % numneglists]; if (!mtx_trylock(&nl->nl_evict_lock)) { counter_u64_add(neg_evict_skipped_contended, 1); return (NULL); } return (nl); } static struct namecache * cache_neg_evict_select_entry(struct neglist *nl) { struct namecache *ncp, *lncp; struct negstate *ns, *lns; int i; mtx_assert(&nl->nl_evict_lock, MA_OWNED); mtx_assert(&nl->nl_lock, MA_OWNED); ncp = TAILQ_FIRST(&nl->nl_list); if (ncp == NULL) return (NULL); lncp = ncp; lns = NCP2NEGSTATE(lncp); for (i = 1; i < 4; i++) { ncp = TAILQ_NEXT(ncp, nc_dst); if (ncp == NULL) break; ns = NCP2NEGSTATE(ncp); if (ns->neg_hit < lns->neg_hit) { lncp = ncp; lns = ns; } } return (lncp); } static bool cache_neg_evict(void) { struct namecache *ncp, *ncp2; struct neglist *nl; struct vnode *dvp; struct mtx *dvlp; struct mtx *blp; uint32_t hash; u_char nlen; bool evicted; nl = cache_neg_evict_select_list(); if (nl == NULL) { return (false); } mtx_lock(&nl->nl_lock); ncp = TAILQ_FIRST(&nl->nl_hotlist); if (ncp != NULL) { cache_neg_demote_locked(ncp); } ncp = cache_neg_evict_select_entry(nl); if (ncp == NULL) { counter_u64_add(neg_evict_skipped_empty, 1); mtx_unlock(&nl->nl_lock); mtx_unlock(&nl->nl_evict_lock); return (false); } nlen = ncp->nc_nlen; dvp = ncp->nc_dvp; hash = cache_get_hash(ncp->nc_name, nlen, dvp); dvlp = VP2VNODELOCK(dvp); blp = HASH2BUCKETLOCK(hash); mtx_unlock(&nl->nl_lock); mtx_unlock(&nl->nl_evict_lock); mtx_lock(dvlp); mtx_lock(blp); /* * Note that since all locks were dropped above, the entry may be * gone or reallocated to be something else. */ CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) { if (ncp2 == ncp && ncp2->nc_dvp == dvp && ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0) break; } if (ncp2 == NULL) { counter_u64_add(neg_evict_skipped_missed, 1); ncp = NULL; evicted = false; } else { MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp)); MPASS(blp == NCP2BUCKETLOCK(ncp)); SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp, ncp->nc_name); cache_zap_locked(ncp); counter_u64_add(neg_evicted, 1); evicted = true; } mtx_unlock(blp); mtx_unlock(dvlp); if (ncp != NULL) cache_free(ncp); return (evicted); } /* * Maybe evict a negative entry to create more room. * * The ncnegfactor parameter limits what fraction of the total count * can comprise of negative entries. However, if the cache is just * warming up this leads to excessive evictions. As such, ncnegminpct * (recomputed to neg_min) dictates whether the above should be * applied. * * Try evicting if the cache is close to full capacity regardless of * other considerations. */ static bool cache_neg_evict_cond(u_long lnumcache) { u_long lnumneg; if (ncsize - 1000 < lnumcache) goto out_evict; lnumneg = atomic_load_long(&numneg); if (lnumneg < neg_min) return (false); if (lnumneg * ncnegfactor < lnumcache) return (false); out_evict: return (cache_neg_evict()); } /* * cache_zap_locked(): * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void cache_zap_locked(struct namecache *ncp) { struct nchashhead *ncpp; struct vnode *dvp, *vp; dvp = ncp->nc_dvp; vp = ncp->nc_vp; if (!(ncp->nc_flag & NCF_NEGATIVE)) cache_assert_vnode_locked(vp); cache_assert_vnode_locked(dvp); cache_assert_bucket_locked(ncp); cache_ncp_invalidate(ncp); ncpp = NCP2BUCKET(ncp); CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); if (!(ncp->nc_flag & NCF_NEGATIVE)) { SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp); TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst); if (ncp == vp->v_cache_dd) { atomic_store_ptr(&vp->v_cache_dd, NULL); } } else { SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name); cache_neg_remove(ncp); } if (ncp->nc_flag & NCF_ISDOTDOT) { if (ncp == dvp->v_cache_dd) { atomic_store_ptr(&dvp->v_cache_dd, NULL); } } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&dvp->v_cache_src)) { ncp->nc_flag |= NCF_DVDROP; } } } static void cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) { struct mtx *blp; MPASS(ncp->nc_dvp == vp); MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_vnode_locked(vp); blp = NCP2BUCKETLOCK(ncp); mtx_lock(blp); cache_zap_locked(ncp); mtx_unlock(blp); } static bool cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, struct mtx **vlpp) { struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; struct mtx *blp; MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); cache_assert_vnode_locked(vp); if (ncp->nc_flag & NCF_NEGATIVE) { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_zap_negative_locked_vnode_kl(ncp, vp); return (true); } pvlp = VP2VNODELOCK(vp); blp = NCP2BUCKETLOCK(ncp); vlp1 = VP2VNODELOCK(ncp->nc_dvp); vlp2 = VP2VNODELOCK(ncp->nc_vp); if (*vlpp == vlp1 || *vlpp == vlp2) { to_unlock = *vlpp; *vlpp = NULL; } else { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 == pvlp) { mtx_lock(vlp2); to_unlock = vlp2; } else { if (!mtx_trylock(vlp1)) goto out_relock; to_unlock = vlp1; } } mtx_lock(blp); cache_zap_locked(ncp); mtx_unlock(blp); if (to_unlock != NULL) mtx_unlock(to_unlock); return (true); out_relock: mtx_unlock(vlp2); mtx_lock(vlp1); mtx_lock(vlp2); MPASS(*vlpp == NULL); *vlpp = vlp1; return (false); } /* * If trylocking failed we can get here. We know enough to take all needed locks * in the right order and re-lookup the entry. */ static int cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, struct mtx *blp) { struct namecache *rncp; struct mtx *rvlp; cache_assert_bucket_unlocked(ncp); cache_sort_vnodes(&dvlp, &vlp); cache_lock_vnodes(dvlp, vlp); mtx_lock(blp); CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { if (rncp == ncp && cache_ncp_match(rncp, dvp, cnp)) break; } if (rncp == NULL) goto out_mismatch; if (!(ncp->nc_flag & NCF_NEGATIVE)) rvlp = VP2VNODELOCK(rncp->nc_vp); else rvlp = NULL; if (rvlp != vlp) goto out_mismatch; cache_zap_locked(rncp); mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); atomic_add_long(&zap_bucket_relock_success, 1); return (0); out_mismatch: mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); return (EAGAIN); } static int __noinline cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, uint32_t hash, struct mtx *blp) { struct mtx *dvlp, *vlp; struct vnode *dvp; cache_assert_bucket_locked(ncp); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = NULL; if (!(ncp->nc_flag & NCF_NEGATIVE)) vlp = VP2VNODELOCK(ncp->nc_vp); if (cache_trylock_vnodes(dvlp, vlp) == 0) { cache_zap_locked(ncp); mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); return (0); } dvp = ncp->nc_dvp; mtx_unlock(blp); return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); } static __noinline int cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) { struct namecache *ncp; struct mtx *blp; struct mtx *dvlp, *dvlp2; uint32_t hash; int error; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { dvlp = VP2VNODELOCK(dvp); dvlp2 = NULL; mtx_lock(dvlp); retry_dotdot: ncp = dvp->v_cache_dd; if (ncp == NULL) { mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) goto retry_dotdot; MPASS(dvp->v_cache_dd == NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); cache_free(ncp); } else { atomic_store_ptr(&dvp->v_cache_dd, NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); } SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); return (1); } /* * XXX note that access here is completely unlocked with no provisions * to keep the hash allocated. If one is sufficiently unlucky a * parallel cache resize can reallocate the hash, unmap backing pages * and cause the empty check below to fault. * * Fixing this has epsilon priority, but can be done with no overhead * for this codepath with sufficient effort. */ hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); retry: if (CK_SLIST_EMPTY(NCHHASH(hash))) goto out_no_entry; mtx_lock(blp); ncp = cache_ncp_find(dvp, cnp, hash); if (ncp == NULL) { mtx_unlock(blp); goto out_no_entry; } error = cache_zap_locked_bucket(ncp, cnp, hash, blp); if (__predict_false(error != 0)) { atomic_add_long(&zap_bucket_fail, 1); goto retry; } counter_u64_add(numposzaps, 1); SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); cache_free(ncp); return (1); out_no_entry: counter_u64_add(nummisszap, 1); SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); return (0); } static int __noinline cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { int ltype; *vpp = dvp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); if (tsp != NULL) timespecclear(tsp); if (ticksp != NULL) *ticksp = ticks; vrefact(*vpp); /* * When we lookup "." we still can be asked to lock it * differently... */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(*vpp)) { if (ltype == LK_EXCLUSIVE) { vn_lock(*vpp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED((*vpp))) { /* forced unmount */ vrele(*vpp); *vpp = NULL; return (ENOENT); } } else vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); } return (-1); } static int __noinline cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; struct namecache *ncp; struct mtx *dvlp; enum vgetstate vs; int error, ltype; bool whiteout; MPASS((cnp->cn_flags & ISDOTDOT) != 0); if ((cnp->cn_flags & MAKEENTRY) == 0) { cache_remove_cnp(dvp, cnp); return (0); } retry: dvlp = VP2VNODELOCK(dvp); mtx_lock(dvlp); ncp = dvp->v_cache_dd; if (ncp == NULL) { SDT_PROBE2(vfs, namecache, lookup, miss, dvp, ".."); mtx_unlock(dvlp); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (ncp->nc_flag & NCF_NEGATIVE) *vpp = NULL; else *vpp = ncp->nc_vp; } else *vpp = ncp->nc_dvp; if (*vpp == NULL) goto negative_success; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); cache_out_ts(ncp, tsp, ticksp); if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == NCF_DTS && tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); *tsp = ncp_ts->nc_dotdottime; } MPASS(dvp != *vpp); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); vs = vget_prep(*vpp); mtx_unlock(dvlp); error = vget_finish(*vpp, cnp->cn_lkflags, vs); vn_lock(dvp, ltype | LK_RETRY); if (VN_IS_DOOMED(dvp)) { if (error == 0) vput(*vpp); *vpp = NULL; return (ENOENT); } if (error) { *vpp = NULL; goto retry; } return (-1); negative_success: if (__predict_false(cnp->cn_nameiop == CREATE)) { if (cnp->cn_flags & ISLASTCN) { counter_u64_add(numnegzaps, 1); cache_zap_negative_locked_vnode_kl(ncp, dvp); mtx_unlock(dvlp); cache_free(ncp); return (0); } } whiteout = (ncp->nc_flag & NCF_WHITE); cache_out_ts(ncp, tsp, ticksp); if (cache_neg_hit_prep(ncp)) cache_neg_promote(ncp); else cache_neg_hit_finish(ncp); mtx_unlock(dvlp); if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); } /** * Lookup a name in the name cache * * # Arguments * * - dvp: Parent directory in which to search. * - vpp: Return argument. Will contain desired vnode on cache hit. * - cnp: Parameters of the name search. The most interesting bits of * the cn_flags field have the following meanings: * - MAKEENTRY: If clear, free an entry from the cache rather than look * it up. * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." * - tsp: Return storage for cache timestamp. On a successful (positive * or negative) lookup, tsp will be filled with any timespec that * was stored when this cache entry was created. However, it will * be clear for "." entries. * - ticks: Return storage for alternate cache timestamp. On a successful * (positive or negative) lookup, it will contain the ticks value * that was current when the cache entry was created, unless cnp * was ".". * * Either both tsp and ticks have to be provided or neither of them. * * # Returns * * - -1: A positive cache hit. vpp will contain the desired vnode. * - ENOENT: A negative cache hit, or dvp was recycled out from under us due * to a forced unmount. vpp will not be modified. If the entry * is a whiteout, then the ISWHITEOUT flag will be set in * cnp->cn_flags. * - 0: A cache miss. vpp will not be modified. * * # Locking * * On a cache hit, vpp will be returned locked and ref'd. If we're looking up * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the * lock is not recursively acquired. */ static int __noinline cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; struct mtx *blp; uint32_t hash; enum vgetstate vs; int error; bool whiteout; MPASS((cnp->cn_flags & ISDOTDOT) == 0); MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); retry: hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); mtx_lock(blp); ncp = cache_ncp_find(dvp, cnp, hash); if (__predict_false(ncp == NULL)) { mtx_unlock(blp); SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); counter_u64_add(nummiss, 1); return (0); } if (ncp->nc_flag & NCF_NEGATIVE) goto negative_success; counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); cache_out_ts(ncp, tsp, ticksp); MPASS(dvp != *vpp); vs = vget_prep(*vpp); mtx_unlock(blp); error = vget_finish(*vpp, cnp->cn_lkflags, vs); if (error) { *vpp = NULL; goto retry; } return (-1); negative_success: /* * We don't get here with regular lookup apart from corner cases. */ if (__predict_true(cnp->cn_nameiop == CREATE)) { if (cnp->cn_flags & ISLASTCN) { counter_u64_add(numnegzaps, 1); error = cache_zap_locked_bucket(ncp, cnp, hash, blp); if (__predict_false(error != 0)) { atomic_add_long(&zap_bucket_fail2, 1); goto retry; } cache_free(ncp); return (0); } } whiteout = (ncp->nc_flag & NCF_WHITE); cache_out_ts(ncp, tsp, ticksp); if (cache_neg_hit_prep(ncp)) cache_neg_promote(ncp); else cache_neg_hit_finish(ncp); mtx_unlock(blp); if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); } int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; uint32_t hash; enum vgetstate vs; int error; bool whiteout, neg_promote; u_short nc_flag; MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) { cnp->cn_flags &= ~MAKEENTRY; return (0); } #endif if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); } MPASS((cnp->cn_flags & ISDOTDOT) == 0); if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { cache_remove_cnp(dvp, cnp); return (0); } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); vfs_smr_enter(); ncp = cache_ncp_find(dvp, cnp, hash); if (__predict_false(ncp == NULL)) { vfs_smr_exit(); SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); counter_u64_add(nummiss, 1); return (0); } nc_flag = atomic_load_char(&ncp->nc_flag); if (nc_flag & NCF_NEGATIVE) goto negative_success; counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); cache_out_ts(ncp, tsp, ticksp); MPASS(dvp != *vpp); if (!cache_ncp_canuse(ncp)) { vfs_smr_exit(); *vpp = NULL; goto out_fallback; } vs = vget_prep_smr(*vpp); vfs_smr_exit(); if (__predict_false(vs == VGET_NONE)) { *vpp = NULL; goto out_fallback; } error = vget_finish(*vpp, cnp->cn_lkflags, vs); if (error) { *vpp = NULL; goto out_fallback; } return (-1); negative_success: if (cnp->cn_nameiop == CREATE) { if (cnp->cn_flags & ISLASTCN) { vfs_smr_exit(); goto out_fallback; } } cache_out_ts(ncp, tsp, ticksp); whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE); neg_promote = cache_neg_hit_prep(ncp); if (!cache_ncp_canuse(ncp)) { cache_neg_hit_abort(ncp); vfs_smr_exit(); goto out_fallback; } if (neg_promote) { vfs_smr_exit(); if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) goto out_fallback; } else { cache_neg_hit_finish(ncp); vfs_smr_exit(); } if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); out_fallback: return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); } struct celockstate { struct mtx *vlp[3]; struct mtx *blp[2]; }; CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); static inline void cache_celockstate_init(struct celockstate *cel) { bzero(cel, sizeof(*cel)); } static void cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, struct vnode *dvp) { struct mtx *vlp1, *vlp2; MPASS(cel->vlp[0] == NULL); MPASS(cel->vlp[1] == NULL); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL || dvp != NULL); vlp1 = VP2VNODELOCK(vp); vlp2 = VP2VNODELOCK(dvp); cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { mtx_lock(vlp1); cel->vlp[0] = vlp1; } mtx_lock(vlp2); cel->vlp[1] = vlp2; } static void cache_unlock_vnodes_cel(struct celockstate *cel) { MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); if (cel->vlp[0] != NULL) mtx_unlock(cel->vlp[0]); if (cel->vlp[1] != NULL) mtx_unlock(cel->vlp[1]); if (cel->vlp[2] != NULL) mtx_unlock(cel->vlp[2]); } static bool cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) { struct mtx *vlp; bool ret; cache_assert_vlp_locked(cel->vlp[0]); cache_assert_vlp_locked(cel->vlp[1]); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL); vlp = VP2VNODELOCK(vp); ret = true; if (vlp >= cel->vlp[1]) { mtx_lock(vlp); } else { if (mtx_trylock(vlp)) goto out; cache_unlock_vnodes_cel(cel); atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1); if (vlp < cel->vlp[0]) { mtx_lock(vlp); mtx_lock(cel->vlp[0]); mtx_lock(cel->vlp[1]); } else { if (cel->vlp[0] != NULL) mtx_lock(cel->vlp[0]); mtx_lock(vlp); mtx_lock(cel->vlp[1]); } ret = false; } out: cel->vlp[2] = vlp; return (ret); } static void cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, struct mtx *blp2) { MPASS(cel->blp[0] == NULL); MPASS(cel->blp[1] == NULL); cache_sort_vnodes(&blp1, &blp2); if (blp1 != NULL) { mtx_lock(blp1); cel->blp[0] = blp1; } mtx_lock(blp2); cel->blp[1] = blp2; } static void cache_unlock_buckets_cel(struct celockstate *cel) { if (cel->blp[0] != NULL) mtx_unlock(cel->blp[0]); mtx_unlock(cel->blp[1]); } /* * Lock part of the cache affected by the insertion. * * This means vnodelocks for dvp, vp and the relevant bucketlock. * However, insertion can result in removal of an old entry. In this * case we have an additional vnode and bucketlock pair to lock. * * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while * preserving the locking order (smaller address first). */ static void cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct mtx *blps[2]; u_char nc_flag; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); if (vp == NULL || vp->v_type != VDIR) break; ncp = atomic_load_consume_ptr(&vp->v_cache_dd); if (ncp == NULL) break; nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == vp); blps[1] = NCP2BUCKETLOCK(ncp); if ((nc_flag & NCF_NEGATIVE) != 0) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; /* * All vnodes got re-locked. Re-validate the state and if * nothing changed we are done. Otherwise restart. */ if (ncp == vp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct mtx *blps[2]; u_char nc_flag; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); if (ncp == NULL) break; nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == dvp); blps[1] = NCP2BUCKETLOCK(ncp); if ((nc_flag & NCF_NEGATIVE) != 0) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; if (ncp == dvp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_unlock(struct celockstate *cel) { cache_unlock_buckets_cel(cel); cache_unlock_vnodes_cel(cel); } static void __noinline cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct celockstate cel; struct namecache *ncp; uint32_t hash; int len; if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) return; len = cnp->cn_namelen; cache_celockstate_init(&cel); hash = cache_get_hash(cnp->cn_nameptr, len, dvp); cache_enter_lock_dd(&cel, dvp, vp, hash); ncp = dvp->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); cache_zap_locked(ncp); } else { ncp = NULL; } atomic_store_ptr(&dvp->v_cache_dd, NULL); cache_enter_unlock(&cel); if (ncp != NULL) cache_free(ncp); } /* * Add an entry to the cache. */ void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp) { struct celockstate cel; struct namecache *ncp, *n2, *ndd; struct namecache_ts *ncp_ts; uint32_t hash; int flag; int len; KASSERT(cnp->cn_namelen <= NAME_MAX, ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, NAME_MAX)); VNPASS(!VN_IS_DOOMED(dvp), dvp); VNPASS(dvp->v_type != VNON, dvp); if (vp != NULL) { VNPASS(!VN_IS_DOOMED(vp), vp); VNPASS(vp->v_type != VNON, vp); } if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { KASSERT(dvp == vp, ("%s: different vnodes for dot entry (%p; %p)\n", __func__, dvp, vp)); } else { KASSERT(dvp != vp, ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__, cnp->cn_nameptr, dvp)); } #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) return; #endif flag = 0; if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) return; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { cache_enter_dotdot_prep(dvp, vp, cnp); flag = NCF_ISDOTDOT; } } ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); if (ncp == NULL) return; cache_celockstate_init(&cel); ndd = NULL; ncp_ts = NULL; /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. */ ncp->nc_flag = flag | NCF_WIP; ncp->nc_vp = vp; if (vp == NULL) cache_neg_init(ncp); ncp->nc_dvp = dvp; if (tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); ncp_ts->nc_time = *tsp; ncp_ts->nc_ticks = ticks; ncp_ts->nc_nc.nc_flag |= NCF_TS; if (dtsp != NULL) { ncp_ts->nc_dotdottime = *dtsp; ncp_ts->nc_nc.nc_flag |= NCF_DTS; } } len = ncp->nc_nlen = cnp->cn_namelen; hash = cache_get_hash(cnp->cn_nameptr, len, dvp); memcpy(ncp->nc_name, cnp->cn_nameptr, len); ncp->nc_name[len] = '\0'; cache_enter_lock(&cel, dvp, vp, hash); /* * See if this vnode or negative entry is already in the cache * with this name. This can happen with concurrent lookups of * the same path name. */ n2 = cache_ncp_find(dvp, cnp, hash); if (n2 != NULL) { MPASS(cache_ncp_canuse(n2)); if ((n2->nc_flag & NCF_NEGATIVE) != 0) KASSERT(vp == NULL, ("%s: found entry pointing to a different vnode " "(%p != %p); name [%s]", __func__, NULL, vp, cnp->cn_nameptr)); else KASSERT(n2->nc_vp == vp, ("%s: found entry pointing to a different vnode " "(%p != %p); name [%s]", __func__, n2->nc_vp, vp, cnp->cn_nameptr)); /* * Entries are supposed to be immutable unless in the * process of getting destroyed. Accommodating for * changing timestamps is possible but not worth it. * This should be harmless in terms of correctness, in * the worst case resulting in an earlier expiration. * Alternatively, the found entry can be replaced * altogether. */ MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); #if 0 if (tsp != NULL) { KASSERT((n2->nc_flag & NCF_TS) != 0, ("no NCF_TS")); n2_ts = __containerof(n2, struct namecache_ts, nc_nc); n2_ts->nc_time = ncp_ts->nc_time; n2_ts->nc_ticks = ncp_ts->nc_ticks; if (dtsp != NULL) { n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; n2_ts->nc_nc.nc_flag |= NCF_DTS; } } #endif SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, vp); goto out_unlock_free; } if (flag == NCF_ISDOTDOT) { /* * See if we are trying to add .. entry, but some other lookup * has populated v_cache_dd pointer already. */ if (dvp->v_cache_dd != NULL) goto out_unlock_free; KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); atomic_thread_fence_rel(); atomic_store_ptr(&dvp->v_cache_dd, ncp); } else if (vp != NULL) { /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. */ if ((ndd = vp->v_cache_dd) != NULL) { if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) cache_zap_locked(ndd); else ndd = NULL; } atomic_thread_fence_rel(); atomic_store_ptr(&vp->v_cache_dd, ncp); } if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { cache_hold_vnode(dvp); } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, vp); } else { if (cnp->cn_flags & ISWHITEOUT) atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); cache_neg_insert(ncp); SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, ncp->nc_name); } /* * Insert the new namecache entry into the appropriate chain * within the cache entries table. */ CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); atomic_thread_fence_rel(); /* * Mark the entry as fully constructed. * It is immutable past this point until its removal. */ atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); cache_enter_unlock(&cel); if (ndd != NULL) cache_free(ndd); return; out_unlock_free: cache_enter_unlock(&cel); cache_free(ncp); return; } /* * A variant of the above accepting flags. * * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it. * * TODO: this routine is a hack. It blindly removes the old entry, even if it * happens to match and it is doing it in an inefficient manner. It was added * to accommodate NFS which runs into a case where the target for a given name * may change from under it. Note this does nothing to solve the following * race: 2 callers of cache_enter_time_flags pass a different target vnode for * the same [dvp, cnp]. It may be argued that code doing this is broken. */ void cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp, int flags) { MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0); if (flags & VFS_CACHE_DROPOLD) cache_remove_cnp(dvp, cnp); cache_enter_time(dvp, vp, cnp, tsp, dtsp); } static u_long cache_roundup_2(u_long val) { u_long res; for (res = 1; res <= val; res <<= 1) continue; return (res); } static struct nchashhead * nchinittbl(u_long elements, u_long *hashmask) { struct nchashhead *hashtbl; u_long hashsize, i; hashsize = cache_roundup_2(elements) / 2; hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); for (i = 0; i < hashsize; i++) CK_SLIST_INIT(&hashtbl[i]); *hashmask = hashsize - 1; return (hashtbl); } static void ncfreetbl(struct nchashhead *hashtbl) { free(hashtbl, M_VFSCACHE); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { u_int i; cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); VFS_SMR_ZONE_SET(cache_zone_small); VFS_SMR_ZONE_SET(cache_zone_small_ts); VFS_SMR_ZONE_SET(cache_zone_large); VFS_SMR_ZONE_SET(cache_zone_large_ts); ncsize = desiredvnodes * ncsizefactor; cache_recalc_neg_min(); nchashtbl = nchinittbl(ncsize, &nchash); ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ ncbuckethash = 7; if (ncbuckethash > nchash) ncbuckethash = nchash; bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numbucketlocks; i++) mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); ncvnodehash = ncbuckethash; vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numvnodelocks; i++) mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); for (i = 0; i < numneglists; i++) { mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); TAILQ_INIT(&neglists[i].nl_list); TAILQ_INIT(&neglists[i].nl_hotlist); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); void cache_vnode_init(struct vnode *vp) { LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); vp->v_cache_dd = NULL; cache_prehash(vp); } /* * Induce transient cache misses for lockless operation in cache_lookup() by * using a temporary hash table. * * This will force a fs lookup. * * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time * to observe all CPUs not performing the lookup. */ static void cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash) { MPASS(temphash < nchash); /* * Change the size. The new size is smaller and can safely be used * against the existing table. All lookups which now hash wrong will * result in a cache miss, which all callers are supposed to know how * to handle. */ atomic_store_long(&nchash, temphash); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated hash value, but they still * see the old table. */ atomic_store_ptr(&nchashtbl, temptbl); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated table pointer and size pair. */ } /* * Set the new hash table. * * Similarly to cache_changesize_set_temp(), this has to synchronize against * lockless operation in cache_lookup(). */ static void cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash) { MPASS(nchash < new_hash); /* * Change the pointer first. This wont result in out of bounds access * since the temporary table is guaranteed to be smaller. */ atomic_store_ptr(&nchashtbl, new_tbl); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated pointer value, but they * still see the old size. */ atomic_store_long(&nchash, new_hash); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated table pointer and size pair. */ } void cache_changesize(u_long newmaxvnodes) { struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl; u_long new_nchash, old_nchash, temphash; struct namecache *ncp; uint32_t hash; u_long newncsize; u_long i; newncsize = newmaxvnodes * ncsizefactor; newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); if (newmaxvnodes < numbucketlocks) newmaxvnodes = numbucketlocks; new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); /* If same hash table size, nothing to do */ if (nchash == new_nchash) { ncfreetbl(new_nchashtbl); return; } temptbl = nchinittbl(1, &temphash); /* * Move everything from the old hash table to the new table. * None of the namecache entries in the table can be removed * because to do so, they have to be removed from the hash table. */ cache_lock_all_vnodes(); cache_lock_all_buckets(); old_nchashtbl = nchashtbl; old_nchash = nchash; cache_changesize_set_temp(temptbl, temphash); for (i = 0; i <= old_nchash; i++) { while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash); } } ncsize = newncsize; cache_recalc_neg_min(); cache_changesize_set_new(new_nchashtbl, new_nchash); cache_unlock_all_buckets(); cache_unlock_all_vnodes(); ncfreetbl(old_nchashtbl); ncfreetbl(temptbl); } /* * Remove all entries from and to a particular vnode. */ static void cache_purge_impl(struct vnode *vp) { struct cache_freebatch batch; struct namecache *ncp; struct mtx *vlp, *vlp2; TAILQ_INIT(&batch); vlp = VP2VNODELOCK(vp); vlp2 = NULL; mtx_lock(vlp); retry: while (!LIST_EMPTY(&vp->v_cache_src)) { ncp = LIST_FIRST(&vp->v_cache_src); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } while (!TAILQ_EMPTY(&vp->v_cache_dst)) { ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } ncp = vp->v_cache_dd; if (ncp != NULL) { KASSERT(ncp->nc_flag & NCF_ISDOTDOT, ("lost dotdot link")); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); mtx_unlock(vlp); if (vlp2 != NULL) mtx_unlock(vlp2); cache_free_batch(&batch); } /* * Opportunistic check to see if there is anything to do. */ static bool cache_has_entries(struct vnode *vp) { if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && atomic_load_ptr(&vp->v_cache_dd) == NULL) return (false); return (true); } void cache_purge(struct vnode *vp) { SDT_PROBE1(vfs, namecache, purge, done, vp); if (!cache_has_entries(vp)) return; cache_purge_impl(vp); } /* * Only to be used by vgone. */ void cache_purge_vgone(struct vnode *vp) { struct mtx *vlp; VNPASS(VN_IS_DOOMED(vp), vp); if (cache_has_entries(vp)) { cache_purge_impl(vp); return; } /* * Serialize against a potential thread doing cache_purge. */ vlp = VP2VNODELOCK(vp); mtx_wait_unlocked(vlp); if (cache_has_entries(vp)) { cache_purge_impl(vp); return; } return; } /* * Remove all negative entries for a particular directory vnode. */ void cache_purge_negative(struct vnode *vp) { struct cache_freebatch batch; struct namecache *ncp, *nnp; struct mtx *vlp; SDT_PROBE1(vfs, namecache, purge_negative, done, vp); if (LIST_EMPTY(&vp->v_cache_src)) return; TAILQ_INIT(&batch); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { if (!(ncp->nc_flag & NCF_NEGATIVE)) continue; cache_zap_negative_locked_vnode_kl(ncp, vp); TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } mtx_unlock(vlp); cache_free_batch(&batch); } /* * Entry points for modifying VOP operations. */ void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) { ASSERT_VOP_IN_SEQC(fdvp); ASSERT_VOP_IN_SEQC(fvp); ASSERT_VOP_IN_SEQC(tdvp); if (tvp != NULL) ASSERT_VOP_IN_SEQC(tvp); cache_purge(fvp); if (tvp != NULL) { cache_purge(tvp); KASSERT(!cache_remove_cnp(tdvp, tcnp), ("%s: lingering negative entry", __func__)); } else { cache_remove_cnp(tdvp, tcnp); } /* * TODO * * Historically renaming was always purging all revelang entries, * but that's quite wasteful. In particular turns out that in many cases * the target file is immediately accessed after rename, inducing a cache * miss. * * Recode this to reduce relocking and reuse the existing entry (if any) * instead of just removing it above and allocating a new one here. */ cache_enter(tdvp, fvp, tcnp); } void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) { ASSERT_VOP_IN_SEQC(dvp); ASSERT_VOP_IN_SEQC(vp); cache_purge(vp); } #ifdef INVARIANTS /* * Validate that if an entry exists it matches. */ void cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct namecache *ncp; struct mtx *blp; uint32_t hash; hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); if (CK_SLIST_EMPTY(NCHHASH(hash))) return; blp = HASH2BUCKETLOCK(hash); mtx_lock(blp); ncp = cache_ncp_find(dvp, cnp, hash); if (ncp != NULL && ncp->nc_vp != vp) { panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n", __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp); } mtx_unlock(blp); } void cache_assert_no_entries(struct vnode *vp) { VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp); VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); VNPASS(vp->v_cache_dd == NULL, vp); } #endif /* * Flush all entries referencing a particular filesystem. */ void cache_purgevfs(struct mount *mp) { struct vnode *vp, *mvp; size_t visited __sdt_used, purged __sdt_used; visited = purged = 0; /* * Somewhat wasteful iteration over all vnodes. Would be better to * support filtering and avoid the interlock to begin with. */ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { visited++; if (!cache_has_entries(vp)) { VI_UNLOCK(vp); continue; } vholdl(vp); VI_UNLOCK(vp); cache_purge(vp); purged++; vdrop(vp); } SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged); } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(struct vop_lookup_args *ap) { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; int flags = cnp->cn_flags; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); error = cache_lookup(dvp, vpp, cnp, NULL, NULL); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == -1) return (0); return (error); } /* Implementation of the getcwd syscall. */ int sys___getcwd(struct thread *td, struct __getcwd_args *uap) { char *buf, *retbuf; size_t buflen; int error; buflen = uap->buflen; if (__predict_false(buflen < 2)) return (EINVAL); if (buflen > MAXPATHLEN) buflen = MAXPATHLEN; buf = uma_zalloc(namei_zone, M_WAITOK); error = vn_getcwd(buf, &retbuf, &buflen); if (error == 0) error = copyout(retbuf, uap->buf, buflen); uma_zfree(namei_zone, buf); return (error); } int vn_getcwd(char *buf, char **retbuf, size_t *buflen) { struct pwd *pwd; int error; vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); pwd_drop(pwd); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) ktrnamei(*retbuf); #endif return (error); } /* * Canonicalize a path by walking it forward and back. * * BUGS: * - Nothing guarantees the integrity of the entire chain. Consider the case * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of * "foo" into "quux" during the backwards walk. The result will be * "quux/bar/baz/qux", which could not have been obtained by an incremental * walk in userspace. Moreover, the path we return is inaccessible if the * calling thread lacks permission to traverse "quux". */ static int kern___realpathat(struct thread *td, int fd, const char *path, char *buf, size_t size, int flags, enum uio_seg pathseg) { struct nameidata nd; char *retbuf, *freebuf; int error; if (flags != 0) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR && (nd.ni_vp->v_vflag & VV_ROOT) != 0) { struct vnode *covered_vp; /* * This happens if vp is a file mount. The call to * vn_fullpath_hardlink can panic if path resolution can't be * handled without the directory. * * To resolve this, we find the vnode which was mounted on - * this should have a unique global path since we disallow * mounting on linked files. */ error = vn_lock(nd.ni_vp, LK_SHARED); if (error != 0) goto out; covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered; vref(covered_vp); VOP_UNLOCK(nd.ni_vp); error = vn_fullpath(covered_vp, &retbuf, &freebuf); vrele(covered_vp); } else { error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size); } if (error == 0) { size_t len; len = strlen(retbuf) + 1; if (size < len) error = ENAMETOOLONG; else if (pathseg == UIO_USERSPACE) error = copyout(retbuf, buf, len); else memcpy(buf, retbuf, len); free(freebuf, M_TEMP); } out: vrele(nd.ni_vp); vrele(nd.ni_dvp); NDFREE_PNBUF(&nd); return (error); } int sys___realpathat(struct thread *td, struct __realpathat_args *uap) { return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, uap->flags, UIO_USERSPACE)); } /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) { struct pwd *pwd; char *buf; size_t buflen; int error; if (__predict_false(vp == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); pwd_drop(pwd); } if (error == 0) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * This function is similar to vn_fullpath, but it attempts to lookup the * pathname relative to the global root mount point. This is required for the * auditing sub-system, as audited pathnames must be absolute, relative to the * global root mount point. */ int vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) { char *buf; size_t buflen; int error; if (__predict_false(vp == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); vfs_smr_enter(); error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); } if (error == 0) *freebuf = buf; else free(buf, M_TEMP); return (error); } static struct namecache * vn_dd_from_dst(struct vnode *vp) { struct namecache *ncp; cache_assert_vnode_locked(vp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) return (ncp); } return (NULL); } int vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) { struct vnode *dvp; struct namecache *ncp; struct mtx *vlp; int error; vlp = VP2VNODELOCK(*vp); mtx_lock(vlp); ncp = (*vp)->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { KASSERT(ncp == vn_dd_from_dst(*vp), ("%s: mismatch for dd entry (%p != %p)", __func__, ncp, vn_dd_from_dst(*vp))); } else { ncp = vn_dd_from_dst(*vp); } if (ncp != NULL) { if (*buflen < ncp->nc_nlen) { mtx_unlock(vlp); vrele(*vp); counter_u64_add(numfullpathfail4, 1); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, ncp->nc_name, vp); dvp = *vp; *vp = ncp->nc_dvp; vref(*vp); mtx_unlock(vlp); vrele(dvp); return (0); } SDT_PROBE1(vfs, namecache, fullpath, miss, vp); mtx_unlock(vlp); vn_lock(*vp, LK_SHARED | LK_RETRY); error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); vput(*vp); if (error) { counter_u64_add(numfullpathfail2, 1); SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *vp = dvp; if (VN_IS_DOOMED(dvp)) { /* forced unmount */ vrele(dvp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } /* * *vp has its use count incremented still. */ return (0); } /* * Resolve a directory to a pathname. * * The name of the directory can always be found in the namecache or fetched * from the filesystem. There is also guaranteed to be only one parent, meaning * we can just follow vnodes up until we find the root. * * The vnode must be referenced. */ static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, size_t addend) { #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *vp1; size_t buflen; int error; bool slash_prefixed; VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); VNPASS(vp->v_usecount > 0, vp); buflen = *len; slash_prefixed = true; if (addend == 0) { MPASS(*len >= 2); buflen--; buf[buflen] = '\0'; slash_prefixed = false; } error = 0; SDT_PROBE1(vfs, namecache, fullpath, entry, vp); counter_u64_add(numfullpathcalls, 1); while (vp != rdir && vp != rootvnode) { /* * The vp vnode must be already fully constructed, * since it is either found in namecache or obtained * from VOP_VPTOCNP(). We may test for VV_ROOT safely * without obtaining the vnode lock. */ if ((vp->v_vflag & VV_ROOT) != 0) { vn_lock(vp, LK_RETRY | LK_SHARED); /* * With the vnode locked, check for races with * unmount, forced or not. Note that we * already verified that vp is not equal to * the root vnode, which means that * mnt_vnodecovered can be NULL only for the * case of unmount. */ if (VN_IS_DOOMED(vp) || (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || vp1->v_mountedhere != vp->v_mount) { vput(vp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } vref(vp1); vput(vp); vp = vp1; continue; } VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); error = vn_vptocnp(&vp, buf, &buflen); if (error) break; if (buflen == 0) { vrele(vp); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, startvp, NULL); break; } buf[--buflen] = '/'; slash_prefixed = true; } if (error) return (error); if (!slash_prefixed) { if (buflen == 0) { vrele(vp); counter_u64_add(numfullpathfail4, 1); SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, startvp, NULL); return (ENOMEM); } buf[--buflen] = '/'; } counter_u64_add(numfullpathfound, 1); vrele(vp); *retbuf = buf + buflen; SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); *len -= buflen; *len += addend; return (0); } /* * Resolve an arbitrary vnode to a pathname. * * Note 2 caveats: * - hardlinks are not tracked, thus if the vnode is not a directory this can * resolve to a different path than the one used to find it * - namecache is not mandatory, meaning names are not guaranteed to be added * (in which case resolving fails) */ static void __inline cache_rev_failed_impl(int *reason, int line) { *reason = line; } #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen, size_t addend) { #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *tvp; struct mount *mp; struct namecache *ncp; size_t orig_buflen; int reason; int error; #ifdef KDTRACE_HOOKS int i; #endif seqc_t vp_seqc, tvp_seqc; u_char nc_flag; VFS_SMR_ASSERT_ENTERED(); if (!atomic_load_char(&cache_fast_lookup_enabled)) { vfs_smr_exit(); return (-1); } orig_buflen = *buflen; if (addend == 0) { MPASS(*buflen >= 2); *buflen -= 1; buf[*buflen] = '\0'; } if (vp == rdir || vp == rootvnode) { if (addend == 0) { *buflen -= 1; buf[*buflen] = '/'; } goto out_ok; } #ifdef KDTRACE_HOOKS i = 0; #endif error = -1; ncp = NULL; /* for sdt probe down below */ vp_seqc = vn_seqc_read_any(vp); if (seqc_in_modify(vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } for (;;) { #ifdef KDTRACE_HOOKS i++; #endif if ((vp->v_vflag & VV_ROOT) != 0) { mp = atomic_load_ptr(&vp->v_mount); if (mp == NULL) { cache_rev_failed(&reason); goto out_abort; } tvp = atomic_load_ptr(&mp->mnt_vnodecovered); tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(tvp_seqc)) { cache_rev_failed(&reason); goto out_abort; } if (!vn_seqc_consistent(vp, vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } vp = tvp; vp_seqc = tvp_seqc; continue; } ncp = atomic_load_consume_ptr(&vp->v_cache_dd); if (ncp == NULL) { cache_rev_failed(&reason); goto out_abort; } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { cache_rev_failed(&reason); goto out_abort; } if (ncp->nc_nlen >= *buflen) { cache_rev_failed(&reason); error = ENOMEM; goto out_abort; } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); *buflen -= 1; buf[*buflen] = '/'; tvp = ncp->nc_dvp; tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(tvp_seqc)) { cache_rev_failed(&reason); goto out_abort; } if (!vn_seqc_consistent(vp, vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } /* * Acquire fence provided by vn_seqc_read_any above. */ if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { cache_rev_failed(&reason); goto out_abort; } if (!cache_ncp_canuse(ncp)) { cache_rev_failed(&reason); goto out_abort; } vp = tvp; vp_seqc = tvp_seqc; if (vp == rdir || vp == rootvnode) break; } out_ok: vfs_smr_exit(); *retbuf = buf + *buflen; *buflen = orig_buflen - *buflen + addend; SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); return (0); out_abort: *buflen = orig_buflen; SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); vfs_smr_exit(); return (error); } static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen) { size_t orig_buflen, addend; int error; if (*buflen < 2) return (EINVAL); orig_buflen = *buflen; vref(vp); addend = 0; if (vp->v_type != VDIR) { *buflen -= 1; buf[*buflen] = '\0'; error = vn_vptocnp(&vp, buf, buflen); if (error) return (error); if (*buflen == 0) { vrele(vp); return (ENOMEM); } *buflen -= 1; buf[*buflen] = '/'; addend = orig_buflen - *buflen; } return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); } /* * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). * * Since the namecache does not track hardlinks, the caller is expected to * first look up the target vnode with WANTPARENT flag passed to namei to get * dvp and vp. * * Then we have 2 cases: * - if the found vnode is a directory, the path can be constructed just by * following names up the chain * - otherwise we populate the buffer with the saved name and start resolving * from the parent */ int vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp, const char *hrdl_name, size_t hrdl_name_length, char **retbuf, char **freebuf, size_t *buflen) { char *buf, *tmpbuf; struct pwd *pwd; size_t addend; int error; __enum_uint8(vtype) type; if (*buflen < 2) return (EINVAL); if (*buflen > MAXPATHLEN) *buflen = MAXPATHLEN; buf = malloc(*buflen, M_TEMP, M_WAITOK); addend = 0; /* * Check for VBAD to work around the vp_crossmp bug in lookup(). * * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be * set to mount point's root vnode while ni_dvp will be vp_crossmp. * If the type is VDIR (like in this very case) we can skip looking * at ni_dvp in the first place. However, since vnodes get passed here * unlocked the target may transition to doomed state (type == VBAD) * before we get to evaluate the condition. If this happens, we will * populate part of the buffer and descend to vn_fullpath_dir with * vp == vp_crossmp. Prevent the problem by checking for VBAD. */ type = atomic_load_8(&vp->v_type); if (type == VBAD) { error = ENOENT; goto out_bad; } if (type != VDIR) { addend = hrdl_name_length + 2; if (*buflen < addend) { error = ENOMEM; goto out_bad; } *buflen -= addend; tmpbuf = buf + *buflen; tmpbuf[0] = '/'; memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length); tmpbuf[addend - 1] = '\0'; vp = dvp; } vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, addend); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); vref(vp); error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, addend); pwd_drop(pwd); } if (error != 0) goto out_bad; *freebuf = buf; return (0); out_bad: free(buf, M_TEMP); return (error); } struct vnode * vn_dir_dd_ino(struct vnode *vp) { struct namecache *ncp; struct vnode *ddvp; struct mtx *vlp; enum vgetstate vs; ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) continue; ddvp = ncp->nc_dvp; vs = vget_prep(ddvp); mtx_unlock(vlp); if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) return (NULL); return (ddvp); } mtx_unlock(vlp); return (NULL); } int vn_commname(struct vnode *vp, char *buf, u_int buflen) { struct namecache *ncp; struct mtx *vlp; int l; vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; if (ncp == NULL) { mtx_unlock(vlp); return (ENOENT); } l = min(ncp->nc_nlen, buflen - 1); memcpy(buf, ncp->nc_name, l); mtx_unlock(vlp); buf[l] = '\0'; return (0); } /* * This function updates path string to vnode's full global path * and checks the size of the new path string against the pathlen argument. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. * * If vp is a directory, the call to vn_fullpath_global() always succeeds * because it falls back to the ".." lookup if the namecache lookup fails. */ int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* Construct global filesystem path from vp. */ VOP_UNLOCK(vp); error = vn_fullpath_global(vp, &rpath, &fbuf); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE_PNBUF(&nd); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } /* * This is similar to vn_path_to_global_path but allows for regular * files which may not be present in the cache. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. */ int vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp, struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name, size_t leaf_length) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; size_t len; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* * Construct global filesystem path from dvp, vp and leaf * name. */ VOP_UNLOCK(vp); len = pathlen; error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length, &rpath, &fbuf, &len); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE_PNBUF(&nd); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } #ifdef DDB static void db_print_vpath(struct vnode *vp) { while (vp != NULL) { db_printf("%p: ", vp); if (vp == rootvnode) { db_printf("/"); vp = NULL; } else { if (vp->v_vflag & VV_ROOT) { db_printf(""); vp = vp->v_mount->mnt_vnodecovered; } else { struct namecache *ncp; char *ncn; int i; ncp = TAILQ_FIRST(&vp->v_cache_dst); if (ncp != NULL) { ncn = ncp->nc_name; for (i = 0; i < ncp->nc_nlen; i++) db_printf("%c", *ncn++); vp = ncp->nc_dvp; } else { vp = NULL; } } } db_printf("\n"); } return; } DB_SHOW_COMMAND(vpath, db_show_vpath) { struct vnode *vp; if (!have_addr) { db_printf("usage: show vpath \n"); return; } vp = (struct vnode *)addr; db_print_vpath(vp); } #endif static int cache_fast_lookup = 1; #define CACHE_FPL_FAILED -2020 static int cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v) { vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n"); panic("no proper vop_fplookup_vexec"); } static int cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v) { vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n"); panic("no proper vop_fplookup_symlink"); } void cache_vop_vector_register(struct vop_vector *v) { size_t ops; ops = 0; if (v->vop_fplookup_vexec != NULL) { ops++; } if (v->vop_fplookup_symlink != NULL) { ops++; } if (ops == 2) { return; } if (ops == 0) { v->vop_fplookup_vexec = cache_vop_bad_vexec; v->vop_fplookup_symlink = cache_vop_bad_symlink; return; } printf("%s: invalid vop vector %p -- either all or none fplookup vops " "need to be provided", __func__, v); if (v->vop_fplookup_vexec == NULL) { printf("%s: missing vop_fplookup_vexec\n", __func__); } if (v->vop_fplookup_symlink == NULL) { printf("%s: missing vop_fplookup_symlink\n", __func__); } panic("bad vop vector %p", v); } #ifdef INVARIANTS void cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops) { if (mp == NULL) return; if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) return; if (vops->vop_fplookup_vexec == NULL || vops->vop_fplookup_vexec == cache_vop_bad_vexec) panic("bad vop_fplookup_vexec on vector %p for filesystem %s", vops, mp->mnt_vfc->vfc_name); if (vops->vop_fplookup_symlink == NULL || vops->vop_fplookup_symlink == cache_vop_bad_symlink) panic("bad vop_fplookup_symlink on vector %p for filesystem %s", vops, mp->mnt_vfc->vfc_name); } #endif void cache_fast_lookup_enabled_recalc(void) { int lookup_flag; int mac_on; #ifdef MAC mac_on = mac_vnode_check_lookup_enabled(); mac_on |= mac_vnode_check_readlink_enabled(); #else mac_on = 0; #endif lookup_flag = atomic_load_int(&cache_fast_lookup); if (lookup_flag && !mac_on) { atomic_store_char(&cache_fast_lookup_enabled, true); } else { atomic_store_char(&cache_fast_lookup_enabled, false); } } static int syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS) { int error, old; old = atomic_load_int(&cache_fast_lookup); error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup)) cache_fast_lookup_enabled_recalc(); return (error); } SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", ""); /* * Components of nameidata (or objects it can point to) which may * need restoring in case fast path lookup fails. */ struct nameidata_outer { size_t ni_pathlen; - int cn_flags; + uint64_t cn_flags; }; struct nameidata_saved { #ifdef INVARIANTS char *cn_nameptr; size_t ni_pathlen; #endif }; #ifdef INVARIANTS struct cache_fpl_debug { size_t ni_pathlen; }; #endif struct cache_fpl { struct nameidata *ndp; struct componentname *cnp; char *nulchar; struct vnode *dvp; struct vnode *tvp; seqc_t dvp_seqc; seqc_t tvp_seqc; uint32_t hash; struct nameidata_saved snd; struct nameidata_outer snd_outer; int line; enum cache_fpl_status status:8; bool in_smr; bool fsearch; struct pwd **pwd; #ifdef INVARIANTS struct cache_fpl_debug debug; #endif }; static bool cache_fplookup_mp_supported(struct mount *mp); static bool cache_fplookup_is_mp(struct cache_fpl *fpl); static int cache_fplookup_cross_mount(struct cache_fpl *fpl); static int cache_fplookup_partial_setup(struct cache_fpl *fpl); static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); static int cache_fplookup_trailingslash(struct cache_fpl *fpl); static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); static void cache_fpl_cleanup_cnp(struct componentname *cnp) { uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; } static struct vnode * cache_fpl_handle_root(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; MPASS(*(cnp->cn_nameptr) == '/'); cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); if (__predict_false(*(cnp->cn_nameptr) == '/')) { do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); } return (ndp->ni_rootdir); } static void cache_fpl_checkpoint_outer(struct cache_fpl *fpl) { fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; } static void cache_fpl_checkpoint(struct cache_fpl *fpl) { #ifdef INVARIANTS fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; #endif } static void cache_fpl_restore_partial(struct cache_fpl *fpl) { fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; #ifdef INVARIANTS fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; #endif } static void cache_fpl_restore_abort(struct cache_fpl *fpl) { cache_fpl_restore_partial(fpl); /* * It is 0 on entry by API contract. */ fpl->ndp->ni_resflags = 0; fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; } #ifdef INVARIANTS #define cache_fpl_smr_assert_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ VFS_SMR_ASSERT_ENTERED(); \ }) #define cache_fpl_smr_assert_not_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ VFS_SMR_ASSERT_NOT_ENTERED(); \ }) static void cache_fpl_assert_status(struct cache_fpl *fpl) { switch (fpl->status) { case CACHE_FPL_STATUS_UNSET: __assert_unreachable(); break; case CACHE_FPL_STATUS_DESTROYED: case CACHE_FPL_STATUS_ABORTED: case CACHE_FPL_STATUS_PARTIAL: case CACHE_FPL_STATUS_HANDLED: break; } } #else #define cache_fpl_smr_assert_entered(fpl) do { } while (0) #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) #define cache_fpl_assert_status(fpl) do { } while (0) #endif #define cache_fpl_smr_enter_initial(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_enter(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_exit(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ vfs_smr_exit(); \ _fpl->in_smr = false; \ }) static int cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) { if (fpl->status != CACHE_FPL_STATUS_UNSET) { KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, ("%s: converting to abort from %d at %d, set at %d\n", __func__, fpl->status, line, fpl->line)); } cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_ABORTED; fpl->line = line; return (CACHE_FPL_FAILED); } #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) static int __noinline cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; if (fpl->status != CACHE_FPL_STATUS_UNSET) { KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, ("%s: converting to abort from %d at %d, set at %d\n", __func__, fpl->status, line, fpl->line)); } fpl->status = CACHE_FPL_STATUS_ABORTED; fpl->line = line; if (fpl->in_smr) cache_fpl_smr_exit(fpl); cache_fpl_restore_abort(fpl); /* * Resolving symlinks overwrites data passed by the caller. * Let namei know. */ if (ndp->ni_loopcnt > 0) { fpl->status = CACHE_FPL_STATUS_DESTROYED; cache_fpl_cleanup_cnp(cnp); } return (CACHE_FPL_FAILED); } #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) static int __noinline cache_fpl_partial_impl(struct cache_fpl *fpl, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to partial at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_entered(fpl); fpl->status = CACHE_FPL_STATUS_PARTIAL; fpl->line = line; return (cache_fplookup_partial_setup(fpl)); } #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) static int cache_fpl_handled_impl(struct cache_fpl *fpl, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to handled at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_HANDLED; fpl->line = line; return (0); } #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) static int cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to handled at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); MPASS(error != 0); MPASS(error != CACHE_FPL_FAILED); cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_HANDLED; fpl->line = line; fpl->dvp = NULL; fpl->tvp = NULL; return (error); } #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) static bool cache_fpl_terminated(struct cache_fpl *fpl) { return (fpl->status != CACHE_FPL_STATUS_UNSET); } #define CACHE_FPL_SUPPORTED_CN_FLAGS \ (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \ ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \ OPENWRITE | WANTIOCTLCAPS | NAMEILOOKUP) #define CACHE_FPL_INTERNAL_CN_FLAGS \ (ISDOTDOT | MAKEENTRY | ISLASTCN) _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, "supported and internal flags overlap"); static bool cache_fpl_islastcn(struct nameidata *ndp) { return (*ndp->ni_next == 0); } static bool cache_fpl_istrailingslash(struct cache_fpl *fpl) { MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf); return (*(fpl->nulchar - 1) == '/'); } static bool cache_fpl_isdotdot(struct componentname *cnp) { if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') return (true); return (false); } static bool cache_can_fplookup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct thread *td; ndp = fpl->ndp; cnp = fpl->cnp; td = curthread; if (!atomic_load_char(&cache_fast_lookup_enabled)) { cache_fpl_aborted_early(fpl); return (false); } if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { cache_fpl_aborted_early(fpl); return (false); } if (IN_CAPABILITY_MODE(td) || CAP_TRACING(td)) { cache_fpl_aborted_early(fpl); return (false); } if (AUDITING_TD(td)) { cache_fpl_aborted_early(fpl); return (false); } if (ndp->ni_startdir != NULL) { cache_fpl_aborted_early(fpl); return (false); } return (true); } static int __noinline cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) { struct nameidata *ndp; struct componentname *cnp; int error; bool fsearch; ndp = fpl->ndp; cnp = fpl->cnp; error = fgetvp_lookup_smr(ndp, vpp, &fsearch); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } fpl->fsearch = fsearch; if ((*vpp)->v_type != VDIR) { if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } } return (0); } static int __noinline cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, uint32_t hash) { struct componentname *cnp; struct vnode *dvp; cnp = fpl->cnp; dvp = fpl->dvp; cache_fpl_smr_exit(fpl); if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) return (cache_fpl_handled_error(fpl, ENOENT)); else return (cache_fpl_aborted(fpl)); } /* * The target vnode is not supported, prepare for the slow path to take over. */ static int __noinline cache_fplookup_partial_setup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp; struct pwd *pwd; seqc_t dvp_seqc; ndp = fpl->ndp; cnp = fpl->cnp; pwd = *(fpl->pwd); dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; if (!pwd_hold_smr(pwd)) { return (cache_fpl_aborted(fpl)); } /* * Note that seqc is checked before the vnode is locked, so by * the time regular lookup gets to it it may have moved. * * Ultimately this does not affect correctness, any lookup errors * are userspace racing with itself. It is guaranteed that any * path which ultimately gets found could also have been found * by regular lookup going all the way in absence of concurrent * modifications. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { pwd_drop(pwd); return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); pwd_drop(pwd); return (cache_fpl_aborted(fpl)); } cache_fpl_restore_partial(fpl); #ifdef INVARIANTS if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); } #endif ndp->ni_startdir = dvp; cnp->cn_flags |= MAKEENTRY; if (cache_fpl_islastcn(ndp)) cnp->cn_flags |= ISLASTCN; if (cache_fpl_isdotdot(cnp)) cnp->cn_flags |= ISDOTDOT; /* * Skip potential extra slashes parsing did not take care of. * cache_fplookup_skip_slashes explains the mechanism. */ if (__predict_false(*(cnp->cn_nameptr) == '/')) { do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); } ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; #ifdef INVARIANTS if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif return (0); } static int cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) { struct componentname *cnp; struct vnode *tvp; seqc_t tvp_seqc; int error, lkflags; cnp = fpl->cnp; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(tvp, lkflags, tvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(tvp, tvs); } if (!vn_seqc_consistent(tvp, tvp_seqc)) { if ((cnp->cn_flags & LOCKLEAF) != 0) vput(tvp); else vrele(tvp); return (cache_fpl_aborted(fpl)); } return (cache_fpl_handled(fpl)); } /* * They want to possibly modify the state of the namecache. */ static int __noinline cache_fplookup_final_modifying(struct cache_fpl *fpl) { struct nameidata *ndp __diagused; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp, *tvp; struct mount *mp; seqc_t dvp_seqc; int error; bool docache; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; MPASS(*(cnp->cn_nameptr) != '/'); MPASS(cache_fpl_islastcn(ndp)); if ((cnp->cn_flags & LOCKPARENT) == 0) MPASS((cnp->cn_flags & WANTPARENT) != 0); MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME); MPASS((cnp->cn_flags & MAKEENTRY) == 0); MPASS((cnp->cn_flags & ISDOTDOT) == 0); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) docache = false; /* * Regular lookup nulifies the slash, which we don't do here. * Don't take chances with filesystem routines seeing it for * the last entry. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_partial(fpl)); } mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { cache_fpl_smr_exit(fpl); /* * Original code keeps not checking for CREATE which * might be a bug. For now let the old lookup decide. */ if (cnp->cn_nameiop == CREATE) { return (cache_fpl_aborted(fpl)); } return (cache_fpl_handled_error(fpl, EROFS)); } if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EEXIST)); } /* * Secure access to dvp; check cache_fplookup_partial_setup for * reasoning. * * XXX At least UFS requires its lookup routine to be called for * the last path component, which leads to some level of complication * and inefficiency: * - the target routine always locks the target vnode, but our caller * may not need it locked * - some of the VOP machinery asserts that the parent is locked, which * once more may be not required * * TODO: add a flag for filesystems which don't need this. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } error = vn_lock(dvp, LK_EXCLUSIVE); if (__predict_false(error != 0)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } tvp = NULL; cnp->cn_flags |= ISLASTCN; if (docache) cnp->cn_flags |= MAKEENTRY; if (cache_fpl_isdotdot(cnp)) cnp->cn_flags |= ISDOTDOT; cnp->cn_lkflags = LK_EXCLUSIVE; error = VOP_LOOKUP(dvp, &tvp, cnp); switch (error) { case EJUSTRETURN: case 0: break; case ENOTDIR: case ENOENT: vput(dvp); return (cache_fpl_handled_error(fpl, error)); default: vput(dvp); return (cache_fpl_aborted(fpl)); } fpl->tvp = tvp; if (tvp == NULL) { MPASS(error == EJUSTRETURN); if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } /* * There are very hairy corner cases concerning various flag combinations * and locking state. In particular here we only hold one lock instead of * two. * * Skip the complexity as it is of no significance for normal workloads. */ if (__predict_false(tvp == dvp)) { vput(dvp); vrele(tvp); return (cache_fpl_aborted(fpl)); } /* * If they want the symlink itself we are fine, but if they want to * follow it regular lookup has to be engaged. */ if (tvp->v_type == VLNK) { if ((cnp->cn_flags & FOLLOW) != 0) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } } /* * Since we expect this to be the terminal vnode it should almost never * be a mount point. */ if (__predict_false(cache_fplookup_is_mp(fpl))) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & FAILIFEXISTS) != 0) { vput(dvp); vput(tvp); return (cache_fpl_handled_error(fpl, EEXIST)); } if ((cnp->cn_flags & LOCKLEAF) == 0) { VOP_UNLOCK(tvp); } if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_modifying(struct cache_fpl *fpl) { struct nameidata *ndp; ndp = fpl->ndp; if (!cache_fpl_islastcn(ndp)) { return (cache_fpl_partial(fpl)); } return (cache_fplookup_final_modifying(fpl)); } static int __noinline cache_fplookup_final_withparent(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate dvs, tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; int error; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); /* * This is less efficient than it can be for simplicity. */ dvs = vget_prep_smr(dvp); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { cache_fpl_smr_exit(fpl); vget_abort(dvp, dvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); if ((cnp->cn_flags & LOCKPARENT) != 0) { error = vget_finish(dvp, LK_EXCLUSIVE, dvs); if (__predict_false(error != 0)) { vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(dvp, dvs); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { vget_abort(tvp, tvs); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (cache_fpl_aborted(fpl)); } error = cache_fplookup_final_child(fpl, tvs); if (__predict_false(error != 0)) { MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED || fpl->status == CACHE_FPL_STATUS_DESTROYED); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (error); } MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); return (0); } static int cache_fplookup_final(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; MPASS(*(cnp->cn_nameptr) != '/'); if (cnp->cn_nameiop != LOOKUP) { return (cache_fplookup_final_modifying(fpl)); } if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) return (cache_fplookup_final_withparent(fpl)); tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { return (cache_fpl_partial(fpl)); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { cache_fpl_smr_exit(fpl); vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); return (cache_fplookup_final_child(fpl, tvs)); } /* * Comment from locked lookup: * Check for degenerate name (e.g. / or "") which is a way of talking about a * directory, e.g. like "/." or ".". */ static int __noinline cache_fplookup_degenerate(struct cache_fpl *fpl) { struct componentname *cnp; struct vnode *dvp; enum vgetstate dvs; int error, lkflags; #ifdef INVARIANTS char *cp; #endif fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; #ifdef INVARIANTS for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { KASSERT(*cp == '/', ("%s: encountered non-slash; string [%s]\n", __func__, cnp->cn_pnbuf)); } #endif if (__predict_false(cnp->cn_nameiop != LOOKUP)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EISDIR)); } if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { return (cache_fplookup_final_withparent(fpl)); } dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(dvp, lkflags, dvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(dvp, dvs); } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_emptypath(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate tvs; struct vnode *tvp; int error, lkflags; fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; ndp = fpl->ndp; cnp = fpl->cnp; tvp = fpl->tvp; MPASS(*cnp->cn_pnbuf == '\0'); if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOENT)); } MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0); tvs = vget_prep_smr(tvp); cache_fpl_smr_exit(fpl); if (__predict_false(tvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(tvp, lkflags, tvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(tvp, tvs); } ndp->ni_resflags |= NIRES_EMPTYPATH; return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_noentry(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; int error; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; MPASS((cnp->cn_flags & MAKEENTRY) == 0); MPASS((cnp->cn_flags & ISDOTDOT) == 0); if (cnp->cn_nameiop == LOOKUP) MPASS((cnp->cn_flags & NOCACHE) == 0); MPASS(!cache_fpl_isdotdot(cnp)); /* * Hack: delayed name len checking. */ if (__predict_false(cnp->cn_namelen > NAME_MAX)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); } if (cnp->cn_nameptr[0] == '/') { return (cache_fplookup_skip_slashes(fpl)); } if (cnp->cn_pnbuf[0] == '\0') { return (cache_fplookup_emptypath(fpl)); } if (cnp->cn_nameptr[0] == '\0') { if (fpl->tvp == NULL) { return (cache_fplookup_degenerate(fpl)); } return (cache_fplookup_trailingslash(fpl)); } if (cnp->cn_nameiop != LOOKUP) { fpl->tvp = NULL; return (cache_fplookup_modifying(fpl)); } /* * Only try to fill in the component if it is the last one, * otherwise not only there may be several to handle but the * walk may be complicated. */ if (!cache_fpl_islastcn(ndp)) { return (cache_fpl_partial(fpl)); } /* * Regular lookup nulifies the slash, which we don't do here. * Don't take chances with filesystem routines seeing it for * the last entry. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_partial(fpl)); } /* * Secure access to dvp; check cache_fplookup_partial_setup for * reasoning. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } error = vn_lock(dvp, LK_SHARED); if (__predict_false(error != 0)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } tvp = NULL; /* * TODO: provide variants which don't require locking either vnode. */ cnp->cn_flags |= ISLASTCN | MAKEENTRY; cnp->cn_lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) { cnp->cn_lkflags = LK_EXCLUSIVE; } error = VOP_LOOKUP(dvp, &tvp, cnp); switch (error) { case EJUSTRETURN: case 0: break; case ENOTDIR: case ENOENT: vput(dvp); return (cache_fpl_handled_error(fpl, error)); default: vput(dvp); return (cache_fpl_aborted(fpl)); } fpl->tvp = tvp; if (tvp == NULL) { MPASS(error == EJUSTRETURN); if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { vput(dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } if (tvp->v_type == VLNK) { if ((cnp->cn_flags & FOLLOW) != 0) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } } if (__predict_false(cache_fplookup_is_mp(fpl))) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) == 0) { VOP_UNLOCK(tvp); } if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { vput(dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_dot(struct cache_fpl *fpl) { int error; MPASS(!seqc_in_modify(fpl->dvp_seqc)); if (__predict_false(fpl->dvp->v_type != VDIR)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } /* * Just re-assign the value. seqc will be checked later for the first * non-dot path component in line and/or before deciding to return the * vnode. */ fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); error = 0; if (cache_fplookup_is_mp(fpl)) { error = cache_fplookup_cross_mount(fpl); } return (error); } static int __noinline cache_fplookup_dotdot(struct cache_fpl *fpl) { struct nameidata *ndp; struct namecache *ncp; struct vnode *dvp; u_char nc_flag; ndp = fpl->ndp; dvp = fpl->dvp; MPASS(cache_fpl_isdotdot(fpl->cnp)); /* * XXX this is racy the same way regular lookup is */ if (vfs_lookup_isroot(ndp, dvp)) { fpl->tvp = dvp; fpl->tvp_seqc = vn_seqc_read_any(dvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_aborted(fpl)); } return (0); } if ((dvp->v_vflag & VV_ROOT) != 0) { /* * TODO * The opposite of climb mount is needed here. */ return (cache_fpl_partial(fpl)); } if (__predict_false(dvp->v_type != VDIR)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); if (ncp == NULL) { return (cache_fpl_aborted(fpl)); } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { if ((nc_flag & NCF_NEGATIVE) != 0) return (cache_fpl_aborted(fpl)); fpl->tvp = ncp->nc_vp; } else { fpl->tvp = ncp->nc_dvp; } fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } /* * Acquire fence provided by vn_seqc_read_any above. */ if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { return (cache_fpl_aborted(fpl)); } if (!cache_ncp_canuse(ncp)) { return (cache_fpl_aborted(fpl)); } return (0); } static int __noinline cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) { u_char nc_flag __diagused; bool neg_promote; #ifdef INVARIANTS nc_flag = atomic_load_char(&ncp->nc_flag); MPASS((nc_flag & NCF_NEGATIVE) != 0); #endif /* * If they want to create an entry we need to replace this one. */ if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { fpl->tvp = NULL; return (cache_fplookup_modifying(fpl)); } neg_promote = cache_neg_hit_prep(ncp); if (!cache_fpl_neg_ncp_canuse(ncp)) { cache_neg_hit_abort(ncp); return (cache_fpl_partial(fpl)); } if (neg_promote) { return (cache_fplookup_negative_promote(fpl, ncp, hash)); } cache_neg_hit_finish(ncp); cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOENT)); } /* * Resolve a symlink. Called by filesystem-specific routines. * * Code flow is: * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve */ int cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) { struct nameidata *ndp; struct componentname *cnp; size_t adjust; ndp = fpl->ndp; cnp = fpl->cnp; if (__predict_false(len == 0)) { return (ENOENT); } if (__predict_false(len > MAXPATHLEN - 2)) { if (cache_fpl_istrailingslash(fpl)) { return (EAGAIN); } } ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; #ifdef INVARIANTS if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { return (ENAMETOOLONG); } if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { return (ELOOP); } adjust = len; if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); } else { if (cache_fpl_istrailingslash(fpl)) { adjust = len + 1; cnp->cn_pnbuf[len] = '/'; cnp->cn_pnbuf[len + 1] = '\0'; } else { cnp->cn_pnbuf[len] = '\0'; } } bcopy(string, cnp->cn_pnbuf, len); ndp->ni_pathlen += adjust; cache_fpl_pathlen_add(fpl, adjust); cnp->cn_nameptr = cnp->cn_pnbuf; fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; fpl->tvp = NULL; return (0); } static int __noinline cache_fplookup_symlink(struct cache_fpl *fpl) { struct mount *mp; struct nameidata *ndp; struct componentname *cnp; struct vnode *dvp, *tvp; struct pwd *pwd; int error; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; tvp = fpl->tvp; pwd = *(fpl->pwd); if (cache_fpl_islastcn(ndp)) { if ((cnp->cn_flags & FOLLOW) == 0) { return (cache_fplookup_final(fpl)); } } mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } /* * Note this check races against setting the flag just like regular * lookup. */ if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EACCES)); } error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); if (__predict_false(error != 0)) { switch (error) { case EAGAIN: return (cache_fpl_partial(fpl)); case ENOENT: case ENAMETOOLONG: case ELOOP: cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, error)); default: return (cache_fpl_aborted(fpl)); } } if (*(cnp->cn_nameptr) == '/') { fpl->dvp = cache_fpl_handle_root(fpl); fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); if (seqc_in_modify(fpl->dvp_seqc)) { return (cache_fpl_aborted(fpl)); } /* * The main loop assumes that ->dvp points to a vnode belonging * to a filesystem which can do lockless lookup, but the absolute * symlink can be wandering off to one which does not. */ mp = atomic_load_ptr(&fpl->dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } if (!cache_fplookup_mp_supported(mp)) { cache_fpl_checkpoint(fpl); return (cache_fpl_partial(fpl)); } if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir)) { return (cache_fpl_aborted(fpl)); } } return (0); } static int cache_fplookup_next(struct cache_fpl *fpl) { struct componentname *cnp; struct namecache *ncp; struct vnode *dvp, *tvp; u_char nc_flag; uint32_t hash; int error; cnp = fpl->cnp; dvp = fpl->dvp; hash = fpl->hash; if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) { return (cache_fplookup_dot(fpl)); } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { return (cache_fplookup_dotdot(fpl)); } } MPASS(!cache_fpl_isdotdot(cnp)); ncp = cache_ncp_find(dvp, cnp, hash); if (__predict_false(ncp == NULL)) { return (cache_fplookup_noentry(fpl)); } tvp = atomic_load_ptr(&ncp->nc_vp); nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) != 0) { return (cache_fplookup_neg(fpl, ncp, hash)); } if (!cache_ncp_canuse(ncp)) { return (cache_fpl_partial(fpl)); } fpl->tvp = tvp; fpl->tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } counter_u64_add(numposhits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); error = 0; if (cache_fplookup_is_mp(fpl)) { error = cache_fplookup_cross_mount(fpl); } return (error); } static bool cache_fplookup_mp_supported(struct mount *mp) { MPASS(mp != NULL); if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) return (false); return (true); } /* * Walk up the mount stack (if any). * * Correctness is provided in the following ways: * - all vnodes are protected from freeing with SMR * - struct mount objects are type stable making them always safe to access * - stability of the particular mount is provided by busying it * - relationship between the vnode which is mounted on and the mount is * verified with the vnode sequence counter after busying * - association between root vnode of the mount and the mount is protected * by busy * * From that point on we can read the sequence counter of the root vnode * and get the next mount on the stack (if any) using the same protection. * * By the end of successful walk we are guaranteed the reached state was * indeed present at least at some point which matches the regular lookup. */ static int __noinline cache_fplookup_climb_mount(struct cache_fpl *fpl) { struct mount *mp, *prev_mp; struct mount_pcpu *mpcpu, *prev_mpcpu; struct vnode *vp; seqc_t vp_seqc; vp = fpl->tvp; vp_seqc = fpl->tvp_seqc; VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp == NULL)) { return (0); } prev_mp = NULL; for (;;) { if (!vfs_op_thread_enter_crit(mp, mpcpu)) { if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); return (cache_fpl_partial(fpl)); } if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); if (!vn_seqc_consistent(vp, vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } if (!cache_fplookup_mp_supported(mp)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp_seqc = vn_seqc_read_any(vp); if (seqc_in_modify(vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } prev_mp = mp; prev_mpcpu = mpcpu; mp = atomic_load_ptr(&vp->v_mountedhere); if (mp == NULL) break; } vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); fpl->tvp = vp; fpl->tvp_seqc = vp_seqc; return (0); } static int __noinline cache_fplookup_cross_mount(struct cache_fpl *fpl) { struct mount *mp; struct mount_pcpu *mpcpu; struct vnode *vp; seqc_t vp_seqc; vp = fpl->tvp; vp_seqc = fpl->tvp_seqc; VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp == NULL)) { return (0); } if (!vfs_op_thread_enter_crit(mp, mpcpu)) { return (cache_fpl_partial(fpl)); } if (!vn_seqc_consistent(vp, vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } if (!cache_fplookup_mp_supported(mp)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp = atomic_load_ptr(&mp->mnt_rootvnode); if (__predict_false(vp == NULL)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp_seqc = vn_seqc_read_any(vp); vfs_op_thread_exit_crit(mp, mpcpu); if (seqc_in_modify(vp_seqc)) { return (cache_fpl_partial(fpl)); } mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp != NULL)) { /* * There are possibly more mount points on top. * Normally this does not happen so for simplicity just start * over. */ return (cache_fplookup_climb_mount(fpl)); } fpl->tvp = vp; fpl->tvp_seqc = vp_seqc; return (0); } /* * Check if a vnode is mounted on. */ static bool cache_fplookup_is_mp(struct cache_fpl *fpl) { struct vnode *vp; vp = fpl->tvp; return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); } /* * Parse the path. * * The code was originally copy-pasted from regular lookup and despite * clean ups leaves performance on the table. Any modifications here * must take into account that in case off fallback the resulting * nameidata state has to be compatible with the original. */ /* * Debug ni_pathlen tracking. */ #ifdef INVARIANTS static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) { fpl->debug.ni_pathlen += n; KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen)); } static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) { fpl->debug.ni_pathlen -= n; KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen)); } static void cache_fpl_pathlen_inc(struct cache_fpl *fpl) { cache_fpl_pathlen_add(fpl, 1); } static void cache_fpl_pathlen_dec(struct cache_fpl *fpl) { cache_fpl_pathlen_sub(fpl, 1); } #else static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) { } static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) { } static void cache_fpl_pathlen_inc(struct cache_fpl *fpl) { } static void cache_fpl_pathlen_dec(struct cache_fpl *fpl) { } #endif static void cache_fplookup_parse(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct vnode *dvp; char *cp; uint32_t hash; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; /* * Find the end of this path component, it is either / or nul. * * Store / as a temporary sentinel so that we only have one character * to test for. Pathnames tend to be short so this should not be * resulting in cache misses. * * TODO: fix this to be word-sized. */ MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf); KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar, ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n", __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1], fpl->nulchar, cnp->cn_pnbuf)); KASSERT(*fpl->nulchar == '\0', ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar, cnp->cn_pnbuf)); hash = cache_get_hash_iter_start(dvp); *fpl->nulchar = '/'; for (cp = cnp->cn_nameptr; *cp != '/'; cp++) { KASSERT(*cp != '\0', ("%s: encountered unexpected nul; string [%s]\n", __func__, cnp->cn_nameptr)); hash = cache_get_hash_iter(*cp, hash); continue; } *fpl->nulchar = '\0'; fpl->hash = cache_get_hash_iter_finish(hash); cnp->cn_namelen = cp - cnp->cn_nameptr; cache_fpl_pathlen_sub(fpl, cnp->cn_namelen); #ifdef INVARIANTS /* * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since * we are going to fail this lookup with ENAMETOOLONG (see below). */ if (cnp->cn_namelen <= NAME_MAX) { if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) { panic("%s: mismatched hash for [%s] len %ld", __func__, cnp->cn_nameptr, cnp->cn_namelen); } } #endif /* * Hack: we have to check if the found path component's length exceeds * NAME_MAX. However, the condition is very rarely true and check can * be elided in the common case -- if an entry was found in the cache, * then it could not have been too long to begin with. */ ndp->ni_next = cp; } static void cache_fplookup_parse_advance(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; cnp->cn_nameptr = ndp->ni_next; KASSERT(*(cnp->cn_nameptr) == '/', ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__, cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf)); cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } /* * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry. * * Lockless lookup tries to elide checking for spurious slashes and should they * be present is guaranteed to fail to find an entry. In this case the caller * must check if the name starts with a slash and call this routine. It is * going to fast forward across the spurious slashes and set the state up for * retry. */ static int __noinline cache_fplookup_skip_slashes(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; MPASS(*(cnp->cn_nameptr) == '/'); do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); /* * Go back to one slash so that cache_fplookup_parse_advance has * something to skip. */ cnp->cn_nameptr--; cache_fpl_pathlen_inc(fpl); /* * cache_fplookup_parse_advance starts from ndp->ni_next */ ndp->ni_next = cnp->cn_nameptr; /* * See cache_fplookup_dot. */ fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; return (0); } /* * Handle trailing slashes (e.g., "foo/"). * * If a trailing slash is found the terminal vnode must be a directory. * Regular lookup shortens the path by nulifying the first trailing slash and * sets the TRAILINGSLASH flag to denote this took place. There are several * checks on it performed later. * * Similarly to spurious slashes, lockless lookup handles this in a speculative * manner relying on an invariant that a non-directory vnode will get a miss. * In this case cn_nameptr[0] == '\0' and cn_namelen == 0. * * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/" * and denotes this is the last path component, which avoids looping back. * * Only plain lookups are supported for now to restrict corner cases to handle. */ static int __noinline cache_fplookup_trailingslash(struct cache_fpl *fpl) { #ifdef INVARIANTS size_t ni_pathlen; #endif struct nameidata *ndp; struct componentname *cnp; struct namecache *ncp; struct vnode *tvp; char *cn_nameptr_orig, *cn_nameptr_slash; seqc_t tvp_seqc; u_char nc_flag; ndp = fpl->ndp; cnp = fpl->cnp; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; MPASS(fpl->dvp == fpl->tvp); KASSERT(cache_fpl_istrailingslash(fpl), ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1, cnp->cn_pnbuf)); KASSERT(cnp->cn_nameptr[0] == '\0', ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0], cnp->cn_pnbuf)); KASSERT(cnp->cn_namelen == 0, ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen, cnp->cn_pnbuf)); MPASS(cnp->cn_nameptr > cnp->cn_pnbuf); if (cnp->cn_nameiop != LOOKUP) { return (cache_fpl_aborted(fpl)); } if (__predict_false(tvp->v_type != VDIR)) { if (!vn_seqc_consistent(tvp, tvp_seqc)) { return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } /* * Denote the last component. */ ndp->ni_next = &cnp->cn_nameptr[0]; MPASS(cache_fpl_islastcn(ndp)); /* * Unwind trailing slashes. */ cn_nameptr_orig = cnp->cn_nameptr; while (cnp->cn_nameptr >= cnp->cn_pnbuf) { cnp->cn_nameptr--; if (cnp->cn_nameptr[0] != '/') { break; } } /* * Unwind to the beginning of the path component. * * Note the path may or may not have started with a slash. */ cn_nameptr_slash = cnp->cn_nameptr; while (cnp->cn_nameptr > cnp->cn_pnbuf) { cnp->cn_nameptr--; if (cnp->cn_nameptr[0] == '/') { break; } } if (cnp->cn_nameptr[0] == '/') { cnp->cn_nameptr++; } cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1; cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr); cache_fpl_checkpoint(fpl); #ifdef INVARIANTS ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; if (ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif /* * If this was a "./" lookup the parent directory is already correct. */ if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { return (0); } /* * Otherwise we need to look it up. */ tvp = fpl->tvp; ncp = atomic_load_consume_ptr(&tvp->v_cache_dd); if (__predict_false(ncp == NULL)) { return (cache_fpl_aborted(fpl)); } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { return (cache_fpl_aborted(fpl)); } fpl->dvp = ncp->nc_dvp; fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); if (seqc_in_modify(fpl->dvp_seqc)) { return (cache_fpl_aborted(fpl)); } return (0); } /* * See the API contract for VOP_FPLOOKUP_VEXEC. */ static int __noinline cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) { struct componentname *cnp; struct vnode *dvp; seqc_t dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; /* * Hack: delayed empty path checking. */ if (cnp->cn_pnbuf[0] == '\0') { return (cache_fplookup_emptypath(fpl)); } /* * TODO: Due to ignoring trailing slashes lookup will perform a * permission check on the last dir when it should not be doing it. It * may fail, but said failure should be ignored. It is possible to fix * it up fully without resorting to regular lookup, but for now just * abort. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_aborted(fpl)); } /* * Hack: delayed degenerate path checking. */ if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) { return (cache_fplookup_degenerate(fpl)); } /* * Hack: delayed name len checking. */ if (__predict_false(cnp->cn_namelen > NAME_MAX)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); } /* * Hack: they may be looking up foo/bar, where foo is not a directory. * In such a case we need to return ENOTDIR, but we may happen to get * here with a different error. */ if (dvp->v_type != VDIR) { error = ENOTDIR; } /* * Hack: handle O_SEARCH. * * Open Group Base Specifications Issue 7, 2018 edition states: * * If the access mode of the open file description associated with the * file descriptor is not O_SEARCH, the function shall check whether * directory searches are permitted using the current permissions of * the directory underlying the file descriptor. If the access mode is * O_SEARCH, the function shall not perform the check. * * * Regular lookup tests for the NOEXECCHECK flag for every path * component to decide whether to do the permission check. However, * since most lookups never have the flag (and when they do it is only * present for the first path component), lockless lookup only acts on * it if there is a permission problem. Here the flag is represented * with a boolean so that we don't have to clear it on the way out. * * For simplicity this always aborts. * TODO: check if this is the first lookup and ignore the permission * problem. Note the flag has to survive fallback (if it happens to be * performed). */ if (fpl->fsearch) { return (cache_fpl_aborted(fpl)); } switch (error) { case EAGAIN: if (!vn_seqc_consistent(dvp, dvp_seqc)) { error = cache_fpl_aborted(fpl); } else { cache_fpl_partial(fpl); } break; default: if (!vn_seqc_consistent(dvp, dvp_seqc)) { error = cache_fpl_aborted(fpl); } else { cache_fpl_smr_exit(fpl); cache_fpl_handled_error(fpl, error); } break; } return (error); } static int cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct mount *mp; int error; ndp = fpl->ndp; cnp = fpl->cnp; cache_fpl_checkpoint(fpl); /* * The vnode at hand is almost always stable, skip checking for it. * Worst case this postpones the check towards the end of the iteration * of the main loop. */ fpl->dvp = dvp; fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp); mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) { return (cache_fpl_aborted(fpl)); } MPASS(fpl->tvp == NULL); for (;;) { cache_fplookup_parse(fpl); error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); if (__predict_false(error != 0)) { error = cache_fplookup_failed_vexec(fpl, error); break; } error = cache_fplookup_next(fpl); if (__predict_false(cache_fpl_terminated(fpl))) { break; } VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); if (fpl->tvp->v_type == VLNK) { error = cache_fplookup_symlink(fpl); if (cache_fpl_terminated(fpl)) { break; } } else { if (cache_fpl_islastcn(ndp)) { error = cache_fplookup_final(fpl); break; } if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { error = cache_fpl_aborted(fpl); break; } fpl->dvp = fpl->tvp; fpl->dvp_seqc = fpl->tvp_seqc; cache_fplookup_parse_advance(fpl); } cache_fpl_checkpoint(fpl); } return (error); } /* * Fast path lookup protected with SMR and sequence counters. * * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. * * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria * outlined below. * * Traditional vnode lookup conceptually looks like this: * * vn_lock(current); * for (;;) { * next = find(); * vn_lock(next); * vn_unlock(current); * current = next; * if (last) * break; * } * return (current); * * Each jump to the next vnode is safe memory-wise and atomic with respect to * any modifications thanks to holding respective locks. * * The same guarantee can be provided with a combination of safe memory * reclamation and sequence counters instead. If all operations which affect * the relationship between the current vnode and the one we are looking for * also modify the counter, we can verify whether all the conditions held as * we made the jump. This includes things like permissions, mount points etc. * Counter modification is provided by enclosing relevant places in * vn_seqc_write_begin()/end() calls. * * Thus this translates to: * * vfs_smr_enter(); * dvp_seqc = seqc_read_any(dvp); * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode * abort(); * for (;;) { * tvp = find(); * tvp_seqc = seqc_read_any(tvp); * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode * abort(); * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode * abort(); * dvp = tvp; // we know nothing of importance has changed * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration * if (last) * break; * } * vget(); // secure the vnode * if (!seqc_consistent(tvp, tvp_seqc) // final check * abort(); * // at this point we know nothing has changed for any parent<->child pair * // as they were crossed during the lookup, meaning we matched the guarantee * // of the locked variant * return (tvp); * * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: * - they are called while within vfs_smr protection which they must never exit * - EAGAIN can be returned to denote checking could not be performed, it is * always valid to return it * - if the sequence counter has not changed the result must be valid * - if the sequence counter has changed both false positives and false negatives * are permitted (since the result will be rejected later) * - for simple cases of unix permission checks vaccess_vexec_smr can be used * * Caveats to watch out for: * - vnodes are passed unlocked and unreferenced with nothing stopping * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised * to use atomic_load_ptr to fetch it. * - the aforementioned object can also get freed, meaning absent other means it * should be protected with vfs_smr * - either safely checking permissions as they are modified or guaranteeing * their stability is left to the routine */ int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, struct pwd **pwdp) { struct cache_fpl fpl; struct pwd *pwd; struct vnode *dvp; struct componentname *cnp; int error; fpl.status = CACHE_FPL_STATUS_UNSET; fpl.in_smr = false; fpl.ndp = ndp; fpl.cnp = cnp = &ndp->ni_cnd; MPASS(ndp->ni_lcf == 0); KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, ("%s: internal flags found in cn_flags %" PRIx64, __func__, cnp->cn_flags)); MPASS(cnp->cn_nameptr == cnp->cn_pnbuf); MPASS(ndp->ni_resflags == 0); if (__predict_false(!cache_can_fplookup(&fpl))) { *status = fpl.status; SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); return (EOPNOTSUPP); } cache_fpl_checkpoint_outer(&fpl); cache_fpl_smr_enter_initial(&fpl); #ifdef INVARIANTS fpl.debug.ni_pathlen = ndp->ni_pathlen; #endif fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; fpl.fsearch = false; fpl.tvp = NULL; /* for degenerate path handling */ fpl.pwd = pwdp; pwd = pwd_get_smr(); *(fpl.pwd) = pwd; namei_setup_rootdir(ndp, cnp, pwd); ndp->ni_topdir = pwd->pwd_jdir; if (cnp->cn_pnbuf[0] == '/') { dvp = cache_fpl_handle_root(&fpl); ndp->ni_resflags = NIRES_ABS; } else { if (ndp->ni_dirfd == AT_FDCWD) { dvp = pwd->pwd_cdir; } else { error = cache_fplookup_dirfd(&fpl, &dvp); if (__predict_false(error != 0)) { goto out; } } } SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); error = cache_fplookup_impl(dvp, &fpl); out: cache_fpl_smr_assert_not_entered(&fpl); cache_fpl_assert_status(&fpl); *status = fpl.status; if (SDT_PROBES_ENABLED()) { SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); if (fpl.status == CACHE_FPL_STATUS_HANDLED) SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, ndp); } if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { MPASS(error != CACHE_FPL_FAILED); if (error != 0) { cache_fpl_cleanup_cnp(fpl.cnp); MPASS(fpl.dvp == NULL); MPASS(fpl.tvp == NULL); } ndp->ni_dvp = fpl.dvp; ndp->ni_vp = fpl.tvp; } return (error); }