diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c index 7607b44e36c3..c05e3394a9a6 100644 --- a/sys/fs/nullfs/null_vnops.c +++ b/sys/fs/nullfs/null_vnops.c @@ -1,1163 +1,1181 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * John Heidemann of the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Ancestors: * ...and... */ /* * Null Layer * * (See mount_nullfs(8) for more information.) 
 * * The null layer duplicates a portion of the filesystem * name space under a new name. In this respect, it is * similar to the loopback filesystem. It differs from * the loopback fs in two respects: it is implemented using * a stackable layers technique, and its "null-node"s stack above * all lower-layer vnodes, not just over directory vnodes. * * The null layer has two purposes. First, it serves as a demonstration * of layering by providing a layer which does nothing. (It actually * does everything the loopback filesystem does, which is slightly * more than nothing.) Second, the null layer can serve as a prototype * layer. Since it provides all necessary layer framework, * new filesystem layers can be created very easily by starting * with a null layer. * * The remainder of this man page examines the null layer as a basis * for constructing new layers. * * * INSTANTIATING NEW NULL LAYERS * * New null layers are created with mount_nullfs(8). * Mount_nullfs(8) takes two arguments, the pathname * of the lower vfs (target-pn) and the pathname where the null * layer will appear in the namespace (alias-pn). After * the null layer is put into place, the contents * of the target-pn subtree will be aliased under alias-pn. * * * OPERATION OF A NULL LAYER * * The null layer is the minimum filesystem layer, * simply bypassing all possible operations to the lower layer * for processing there. The majority of its activity centers * on the bypass routine, through which nearly all vnode operations * pass. * * The bypass routine accepts arbitrary vnode operations for * handling by the lower layer. It begins by examining vnode * operation arguments and replacing any null-nodes by their * lower-layer equivalents. It then invokes the operation * on the lower layer. Finally, it replaces the null-nodes * in the arguments and, if a vnode is returned by the operation, * stacks a null-node on top of the returned vnode. 
 * * Although bypass handles most operations, vop_getattr, vop_lock, * vop_unlock, vop_inactive, vop_reclaim, and vop_print are not * bypassed. Vop_getattr must change the fsid being returned. * Vop_lock and vop_unlock must handle any locking for the * current vnode as well as pass the lock request down. * Vop_inactive and vop_reclaim are not bypassed so that * they can handle freeing null-layer specific data. Vop_print * is not bypassed to avoid excessive debugging information. * Also, certain vnode operations change the locking state within * the operation (create, mknod, remove, link, rename, mkdir, rmdir, * and symlink). Ideally these operations should not change the * lock state, but should be changed to let the caller of the * function unlock them. Otherwise all intermediate vnode layers * (such as union, umapfs, etc) must catch these functions to do * the necessary locking at their layer. * * * INSTANTIATING VNODE STACKS * * Mounting associates the null layer with a lower layer, * effectively stacking two VFSes. Vnode stacks are instead * created on demand as files are accessed. * * The initial mount creates a single vnode stack for the * root of the new null layer. All other vnode stacks * are created as a result of vnode operations on * this or other null vnode stacks. * * New vnode stacks come into existence as a result of * an operation which returns a vnode. * The bypass routine stacks a null-node above the new * vnode before returning it to the caller. * * For example, imagine mounting a null layer with * "mount_nullfs /usr/include /dev/layer/null". * Changing directory to /dev/layer/null will assign * the root null-node (which was created when the null layer was mounted). * Now consider opening "sys". A vop_lookup would be * done on the root null-node. This operation would bypass through * to the lower layer which would return a vnode representing * the UFS "sys". Null_bypass then builds a null-node * aliasing the UFS "sys" and returns this to the caller. 
 * Later operations on the null-node "sys" will repeat this * process when constructing other vnode stacks. * * * CREATING OTHER FILE SYSTEM LAYERS * * One of the easiest ways to construct new filesystem layers is to make * a copy of the null layer, rename all files and variables, and * then begin modifying the copy. Sed can be used to easily rename * all variables. * * The umap layer is an example of a layer descended from the * null layer. * * * INVOKING OPERATIONS ON LOWER LAYERS * * There are two techniques to invoke operations on a lower layer * when the operation cannot be completely bypassed. Each method * is appropriate in different situations. In both cases, * it is the responsibility of the aliasing layer to make * the operation arguments "correct" for the lower layer * by mapping vnode arguments to the lower layer. * * The first approach is to call the aliasing layer's bypass routine. * This method is most suitable when you wish to invoke the operation * currently being handled on the lower layer. It has the advantage * that the bypass routine already must do argument mapping. * An example of this is null_getattr in the null layer. * * A second approach is to directly invoke vnode operations on * the lower layer with the VOP_OPERATIONNAME interface. * The advantage of this method is that it is easy to invoke * arbitrary operations on the lower layer. The disadvantage * is that vnode arguments must be manually mapped. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW, &null_bug_bypass, 0, ""); /* * This is the 10-Apr-92 bypass routine. * This version has been optimized for speed, throwing away some * safety checks. It should still always work, but it's not as * robust to programmer errors. 
 *
 * In general, we map all vnodes going down and unmap them on the way back.
 * As an exception to this, vnodes can be marked "unmapped" by setting
 * the Nth bit in operation's vdesc_flags.
 *
 * Also, some BSD vnode operations have the side effect of vrele'ing
 * their arguments.  With stacking, the reference counts are held
 * by the upper node, not the lower one, so we must handle these
 * side-effects here.  This is not of concern in Sun-derived systems
 * since there are no such side-effects.
 *
 * This makes the following assumptions:
 * - only one returned vpp
 * - no INOUT vpp's (Sun's vop_open has one of these)
 * - the vnode operation vector of the first vnode should be used
 *   to determine what implementation of the op should be invoked
 * - all mapped vnodes are of our vnode-type (NEEDSWORK:
 *   problems on rmdir'ing mount points and renaming?)
 */
/*
 * null_bypass(ap): generic pass-through of a vnode operation.
 *
 * ap	generic VOP argument structure.  The vnode pointers located via
 *	ap->a_desc->vdesc_vp_offsets are rewritten to their lower-layer
 *	vnodes for the duration of VCALL() and restored afterwards.
 *
 * Returns the result of VCALL(), EINVAL when the first vnode slot is
 * empty after mapping, or the result of null_nodeget() when the
 * operation succeeded and a returned vnode had to be wrapped.
 */
int
null_bypass(struct vop_generic_args *ap)
{
	struct vnode **this_vp_p;
	struct vnode *old_vps[VDESC_MAX_VPS];
	struct vnode **vps_p[VDESC_MAX_VPS];
	struct vnode ***vppp;
	struct vnode *lvp;
	struct vnodeop_desc *descp = ap->a_desc;
	int error, i, reles;

	if (null_bug_bypass)
		printf ("null_bypass: %s\n", descp->vdesc_name);

#ifdef DIAGNOSTIC
	/*
	 * We require at least one vp.
	 */
	if (descp->vdesc_vp_offsets == NULL ||
	    descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
		panic ("null_bypass: no vp's in map");
#endif

	/*
	 * Map the vnodes going in.
	 * Later, we'll invoke the operation based on
	 * the first mapped vnode's operation vector.
	 */
	/*
	 * reles is shifted right once per iteration, so the
	 * VDESC_VP0_WILLRELE test below always applies to vnode i.
	 */
	reles = descp->vdesc_flags;
	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
			break;   /* bail out at end of list */
		vps_p[i] = this_vp_p = VOPARG_OFFSETTO(struct vnode **,
		    descp->vdesc_vp_offsets[i], ap);

		/*
		 * We're not guaranteed that any but the first vnode
		 * are of our type.  Check for and don't map any
		 * that aren't.  (We must always map first vp or vclean fails.)
		 */
		if (i != 0 && (*this_vp_p == NULLVP ||
		    (*this_vp_p)->v_op != &null_vnodeops)) {
			/* NULLVP in old_vps[i] marks "not mapped" for the restore loop. */
			old_vps[i] = NULLVP;
		} else {
			old_vps[i] = *this_vp_p;
			*(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p);

			/*
			 * The upper vnode reference to the lower
			 * vnode is the only reference that keeps our
			 * pointer to the lower vnode alive.  If lower
			 * vnode is relocked during the VOP call,
			 * upper vnode might become unlocked and
			 * reclaimed, which invalidates our reference.
			 * Add a transient hold around VOP call.
			 */
			/* *this_vp_p was just overwritten: it is the lower vnode here. */
			vhold(*this_vp_p);

			/*
			 * XXX - Several operations have the side effect
			 * of vrele'ing their vp's.  We must account for
			 * that.  (This should go away in the future.)
			 */
			if (reles & VDESC_VP0_WILLRELE)
				vref(*this_vp_p);
		}
	}

	/*
	 * Call the operation on the lower layer
	 * with the modified argument structure.
	 */
	if (vps_p[0] != NULL && *vps_p[0] != NULL) {
		error = VCALL(ap);
	} else {
		printf("null_bypass: no map for %s\n", descp->vdesc_name);
		error = EINVAL;
	}

	/*
	 * Maintain the illusion of call-by-value
	 * by restoring vnodes in the argument structure
	 * to their original value.
	 */
	reles = descp->vdesc_flags;
	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
			break;   /* bail out at end of list */
		if (old_vps[i] != NULL) {
			lvp = *(vps_p[i]);

			/*
			 * Get rid of the transient hold on lvp.
			 * If lowervp was unlocked during VOP
			 * operation, nullfs upper vnode could have
			 * been reclaimed, which changes its v_vnlock
			 * back to private v_lock.  In this case we
			 * must move lock ownership from lower to
			 * upper (reclaimed) vnode.
			 */
			if (lvp != NULLVP) {
				if (VOP_ISLOCKED(lvp) == LK_EXCLUSIVE &&
				    old_vps[i]->v_vnlock != lvp->v_vnlock) {
					VOP_UNLOCK(lvp);
					VOP_LOCK(old_vps[i], LK_EXCLUSIVE |
					    LK_RETRY);
				}
				vdrop(lvp);
			}

			/* Put the caller's (upper) vnode back into the argument slot. */
			*(vps_p[i]) = old_vps[i];
#if 0
			if (reles & VDESC_VP0_WILLUNLOCK)
				VOP_UNLOCK(*(vps_p[i]), 0);
#endif
			/* Pairs with the vref() on the lower vnode taken before the call. */
			if (reles & VDESC_VP0_WILLRELE)
				vrele(*(vps_p[i]));
		}
	}

	/*
	 * Map the possible out-going vpp
	 * (Assumes that the lower layer always returns
	 * a VREF'ed vpp unless it gets an error.)
	 */
	if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && error == 0) {
		/*
		 * XXX - even though some ops have vpp returned vp's,
		 * several ops actually vrele this before returning.
		 * We must avoid these ops.
		 * (This should go away when these ops are regularized.)
		 */
		vppp = VOPARG_OFFSETTO(struct vnode ***,
		    descp->vdesc_vpp_offset, ap);
		if (*vppp != NULL)
			error = null_nodeget(old_vps[0]->v_mount, **vppp,
			    *vppp);
	}

	return (error);
}

/*
 * Forward a write-count change to the lower vnode and, only if the lower
 * layer accepted it, apply the same increment to the upper vnode's
 * v_writecount.  The whole update runs under the upper vnode interlock.
 * Returns the result of VOP_ADD_WRITECOUNT() on the lower vnode.
 */
static int
null_add_writecount(struct vop_add_writecount_args *ap)
{
	struct vnode *lvp, *vp;
	int error;

	vp = ap->a_vp;
	lvp = NULLVPTOLOWERVP(vp);
	VI_LOCK(vp);
	/* text refs are bypassed to lowervp */
	VNASSERT(vp->v_writecount >= 0, vp, ("wrong null writecount"));
	VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
	    ("wrong writecount inc %d", ap->a_inc));
	error = VOP_ADD_WRITECOUNT(lvp, ap->a_inc);
	if (error == 0)
		vp->v_writecount += ap->a_inc;
	VI_UNLOCK(vp);
	return (error);
}

/*
 * We have to carry on the locking protocol on the null layer vnodes
 * as we progress through the tree. We also have to enforce read-only
 * if this layer is mounted read-only.
 */
static int
null_lookup(struct vop_lookup_args *ap)
{
	struct componentname *cnp = ap->a_cnp;
	struct vnode *dvp = ap->a_dvp;
	int flags = cnp->cn_flags;
	struct vnode *vp, *ldvp, *lvp;
	struct mount *mp;
	int error;

	mp = dvp->v_mount;
	/* DELETE/RENAME of the last component is refused on a read-only mount. */
	if ((flags & ISLASTCN) != 0 && (mp->mnt_flag & MNT_RDONLY) != 0 &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);
	/*
	 * Although it is possible to call null_bypass(), we'll do
	 * a direct call to reduce overhead
	 */
	ldvp = NULLVPTOLOWERVP(dvp);
	vp = lvp = NULL;

	/*
	 * Renames in the lower mounts might create an inconsistent
	 * configuration where lower vnode is moved out of the
	 * directory tree remounted by our null mount.  Do not try to
	 * handle it fancy, just avoid VOP_LOOKUP() with DOTDOT name
	 * which cannot be handled by VOP, at least passing over lower
	 * root.
	 */
	if ((ldvp->v_vflag & VV_ROOT) != 0 && (flags & ISDOTDOT) != 0) {
		KASSERT((dvp->v_vflag & VV_ROOT) == 0,
		    ("ldvp %p fl %#x dvp %p fl %#x flags %#x",
		    ldvp, ldvp->v_vflag, dvp, dvp->v_vflag, flags));
		return (ENOENT);
	}

	/*
	 * Hold ldvp.  The reference on it, owned by dvp, is lost in
	 * case of dvp reclamation, and we need ldvp to move our lock
	 * from ldvp to dvp.
	 */
	vhold(ldvp);

	error = VOP_LOOKUP(ldvp, &lvp, cnp);

	/*
	 * VOP_LOOKUP() on lower vnode may unlock ldvp, which allows
	 * dvp to be reclaimed due to shared v_vnlock.  Check for the
	 * doomed state and return error.
	 */
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0 || error == EJUSTRETURN) {
			/* Drop the looked-up lower vnode; the result is discarded. */
			if (lvp != NULL)
				vput(lvp);
			error = ENOENT;
		}

		/*
		 * If vgone() reclaimed dvp before curthread
		 * relocked ldvp, the locks of dvp and ldvp are no
		 * longer shared.  In this case, relock of ldvp in
		 * lower fs VOP_LOOKUP() does not restore the locking
		 * state of dvp.  Compensate for this by unlocking
		 * ldvp and locking dvp, which is also correct if the
		 * locks are still shared.
		 */
		VOP_UNLOCK(ldvp);
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
	}
	vdrop(ldvp);

	/* CREATE/RENAME of a missing last component is refused when read-only. */
	if (error == EJUSTRETURN && (flags & ISLASTCN) != 0 &&
	    (mp->mnt_flag & MNT_RDONLY) != 0 &&
	    (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
		error = EROFS;

	if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) {
		if (ldvp == lvp) {
			/*
			 * The lower lookup returned the directory itself:
			 * hand back dvp and trade the lower reference for
			 * one on dvp.
			 */
			*ap->a_vpp = dvp;
			VREF(dvp);
			vrele(lvp);
		} else {
			/* Wrap the lower result in a nullfs vnode. */
			error = null_nodeget(mp, lvp, &vp);
			if (error == 0)
				*ap->a_vpp = vp;
		}
	}
	return (error);
}

/*
 * Open via the bypass routine.  On success the upper vnode's v_object is
 * set to the lower vnode's v_object, and VIRF_PGREAD is set on the upper
 * vnode when the lower vnode has it and the upper one does not yet.
 * Returns the result of null_bypass().
 */
static int
null_open(struct vop_open_args *ap)
{
	int retval;
	struct vnode *vp, *ldvp;

	vp = ap->a_vp;
	ldvp = NULLVPTOLOWERVP(vp);
	retval = null_bypass(&ap->a_gen);
	if (retval == 0) {
		vp->v_object = ldvp->v_object;
		if ((vn_irflag_read(ldvp) & VIRF_PGREAD) != 0) {
			MPASS(vp->v_object != NULL);
			if ((vn_irflag_read(vp) & VIRF_PGREAD) == 0) {
				vn_irflag_set_cond(vp, VIRF_PGREAD);
			}
		}
	}
	return (retval);
}

/*
 * Setattr call. Disallow write attempts if the layer is mounted read-only.
 *
 * Returns EROFS for attribute changes on a read-only mount.  For a size
 * change it returns EISDIR on directories; on device, socket and fifo
 * vnodes it returns EOPNOTSUPP if va_flags is also set and 0 otherwise;
 * on other vnode types it returns EROFS when the mount is read-only.
 * Everything else is passed to null_bypass().
 */
static int
null_setattr(struct vop_setattr_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct vattr *vap = ap->a_vap;

	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	if (vap->va_size != VNOVAL) {
		switch (vp->v_type) {
		case VDIR:
			return (EISDIR);
		case VCHR:
		case VBLK:
		case VSOCK:
		case VFIFO:
			/* Size changes on these types are not passed down. */
			if (vap->va_flags != VNOVAL)
				return (EOPNOTSUPP);
			return (0);
		case VREG:
		case VLNK:
		default:
			/*
			 * Disallow write attempts if the filesystem is
			 * mounted read-only.
			 */
			if (vp->v_mount->mnt_flag & MNT_RDONLY)
				return (EROFS);
		}
	}

	return (null_bypass((struct vop_generic_args *)ap));
}

/*
 * We handle stat and getattr only to change the fsid.
*/ static int null_stat(struct vop_stat_args *ap) { int error; if ((error = null_bypass((struct vop_generic_args *)ap)) != 0) return (error); ap->a_sb->st_dev = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; return (0); } static int null_getattr(struct vop_getattr_args *ap) { int error; if ((error = null_bypass((struct vop_generic_args *)ap)) != 0) return (error); ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; return (0); } /* * Handle to disallow write access if mounted read-only. */ static int null_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; accmode_t accmode = ap->a_accmode; /* * Disallow write attempts on read-only layers; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } return (null_bypass((struct vop_generic_args *)ap)); } static int null_accessx(struct vop_accessx_args *ap) { struct vnode *vp = ap->a_vp; accmode_t accmode = ap->a_accmode; /* * Disallow write attempts on read-only layers; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } return (null_bypass((struct vop_generic_args *)ap)); } /* * Increasing refcount of lower vnode is needed at least for the case * when lower FS is NFS to do sillyrename if the file is in use. * Unfortunately v_usecount is incremented in many places in * the kernel and, as such, there may be races that result in * the NFS client doing an extraneous silly rename, but that seems * preferable to not doing a silly rename when it is needed. 
 */
static int
null_remove(struct vop_remove_args *ap)
{
	int retval, vreleit;
	struct vnode *lvp, *vp;

	vp = ap->a_vp;
	/* Take the extra lower reference only when vp has other users. */
	if (vrefcnt(vp) > 1) {
		lvp = NULLVPTOLOWERVP(vp);
		VREF(lvp);
		vreleit = 1;
	} else
		vreleit = 0;
	/* NULLV_DROP is one of the conditions tested by null_want_recycle(). */
	VTONULL(vp)->null_flags |= NULLV_DROP;
	retval = null_bypass(&ap->a_gen);
	if (vreleit != 0)
		vrele(lvp);
	return (retval);
}

/*
 * We handle this to eliminate null FS to lower FS
 * file moving. Don't know why we don't allow this,
 * possibly we should.
 *
 * Returns EXDEV when the source and target are on different mounts,
 * ENOENT when fdvp or fvp no longer has a null node, otherwise the
 * result of VOP_RENAME() on the lower vnodes.  The upper vnodes are
 * released on every path.
 */
static int
null_rename(struct vop_rename_args *ap)
{
	struct vnode *fdvp, *fvp, *tdvp, *tvp;
	struct vnode *lfdvp, *lfvp, *ltdvp, *ltvp;
	struct null_node *fdnn, *fnn, *tdnn, *tnn;
	int error;

	tdvp = ap->a_tdvp;
	fvp = ap->a_fvp;
	fdvp = ap->a_fdvp;
	tvp = ap->a_tvp;
	/* upper_err releases lfdvp only if it was obtained. */
	lfdvp = NULL;

	/* Check for cross-device rename. */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp != NULL && fvp->v_mount != tvp->v_mount)) {
		error = EXDEV;
		goto upper_err;
	}

	VI_LOCK(fdvp);
	fdnn = VTONULL(fdvp);
	if (fdnn == NULL) {	/* fdvp is not locked, can be doomed */
		VI_UNLOCK(fdvp);
		error = ENOENT;
		goto upper_err;
	}
	lfdvp = fdnn->null_lowervp;
	vref(lfdvp);
	VI_UNLOCK(fdvp);

	/* Same interlocked check for the source vnode. */
	VI_LOCK(fvp);
	fnn = VTONULL(fvp);
	if (fnn == NULL) {
		VI_UNLOCK(fvp);
		error = ENOENT;
		goto upper_err;
	}
	lfvp = fnn->null_lowervp;
	vref(lfvp);
	VI_UNLOCK(fvp);

	tdnn = VTONULL(tdvp);
	ltdvp = tdnn->null_lowervp;
	vref(ltdvp);

	if (tvp != NULL) {
		tnn = VTONULL(tvp);
		ltvp = tnn->null_lowervp;
		vref(ltvp);
		/* The existing target is flagged NULLV_DROP, as in null_remove(). */
		tnn->null_flags |= NULLV_DROP;
	} else {
		ltvp = NULL;
	}

	/*
	 * NOTE(review): no lower vnode is released here afterwards, so
	 * VOP_RENAME() presumably consumes the references taken above -
	 * confirm against VOP_RENAME(9).
	 */
	error = VOP_RENAME(lfdvp, lfvp, ap->a_fcnp, ltdvp, ltvp, ap->a_tcnp);
	vrele(fdvp);
	vrele(fvp);
	vrele(tdvp);
	if (tvp != NULL)
		vrele(tvp);
	return (error);

upper_err:
	/* Failure before the lower call: release the upper arguments ourselves. */
	if (tdvp == tvp)
		vrele(tdvp);
	else
		vput(tdvp);
	if (tvp)
		vput(tvp);
	if (lfdvp != NULL)
		vrele(lfdvp);
	vrele(fdvp);
	vrele(fvp);
	return (error);
}

/*
 * Flag the null node NULLV_DROP, then let the bypass routine perform
 * the rmdir on the lower layer.  Returns the result of null_bypass().
 */
static int
null_rmdir(struct vop_rmdir_args *ap)
{

	VTONULL(ap->a_vp)->null_flags |= NULLV_DROP;
	return (null_bypass(&ap->a_gen));
}

/*
 * We need to process our own vnode lock and then clear the
 * interlock flag as it applies only to our
vnode, not the
 * vnodes below us on the stack.
 */
static int
null_lock(struct vop_lock1_args *ap)
{
	struct vnode *vp = ap->a_vp;
	int flags;
	struct null_node *nn;
	struct vnode *lvp;
	int error;

	/*
	 * Either way the interlock is held from here on and LK_INTERLOCK
	 * is not set in the flags that are passed further down.
	 */
	if ((ap->a_flags & LK_INTERLOCK) == 0)
		VI_LOCK(vp);
	else
		ap->a_flags &= ~LK_INTERLOCK;
	flags = ap->a_flags;
	nn = VTONULL(vp);
	/*
	 * If we're still active we must ask the lower layer to
	 * lock as ffs has special lock considerations in its
	 * vop lock.
	 */
	if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) {
		/*
		 * We have to hold the vnode here to solve a potential
		 * reclaim race.  If we're forcibly vgone'd while we
		 * still have refs, a thread could be sleeping inside
		 * the lowervp's vop_lock routine.  When we vgone we will
		 * drop our last ref to the lowervp, which would allow it
		 * to be reclaimed.  The lowervp could then be recycled,
		 * in which case it is not legal to be sleeping in its VOP.
		 * We prevent it from being recycled by holding the vnode
		 * here.
		 */
		vholdnz(lvp);
		VI_UNLOCK(vp);
		error = VOP_LOCK(lvp, flags);

		/*
		 * We might have slept to get the lock and someone might have
		 * cleaned our vnode already, switching vnode lock from one in
		 * lowervp to v_lock in our own vnode structure.  Handle this
		 * case by reacquiring correct lock in requested mode.
		 */
		if (VTONULL(vp) == NULL && error == 0) {
			/*
			 * Rebuild the lock type in ap->a_flags: shared stays
			 * shared, upgrade and exclusive both become exclusive.
			 */
			ap->a_flags &= ~LK_TYPE_MASK;
			switch (flags & LK_TYPE_MASK) {
			case LK_SHARED:
				ap->a_flags |= LK_SHARED;
				break;
			case LK_UPGRADE:
			case LK_EXCLUSIVE:
				ap->a_flags |= LK_EXCLUSIVE;
				break;
			default:
				panic("Unsupported lock request %d\n",
				    ap->a_flags);
			}
			/* Give up the lower lock and take vp's own lock instead. */
			VOP_UNLOCK(lvp);
			error = vop_stdlock(ap);
		}
		vdrop(lvp);
	} else {
		/* No null node or no lower vnode: lock through vop_stdlock(). */
		VI_UNLOCK(vp);
		error = vop_stdlock(ap);
	}
	return (error);
}

/*
 * We need to process our own vnode unlock and then clear the
 * interlock flag as it applies only to our vnode, not the
 * vnodes below us on the stack.
*/ static int null_unlock(struct vop_unlock_args *ap) { struct vnode *vp = ap->a_vp; struct null_node *nn; struct vnode *lvp; int error; nn = VTONULL(vp); if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) { vholdnz(lvp); error = VOP_UNLOCK(lvp); vdrop(lvp); } else { error = vop_stdunlock(ap); } return (error); } /* * Do not allow the VOP_INACTIVE to be passed to the lower layer, * since the reference count on the lower vnode is not related to * ours. */ static int null_want_recycle(struct vnode *vp) { struct vnode *lvp; struct null_node *xp; struct mount *mp; struct null_mount *xmp; xp = VTONULL(vp); lvp = NULLVPTOLOWERVP(vp); mp = vp->v_mount; xmp = MOUNTTONULLMOUNT(mp); if ((xmp->nullm_flags & NULLM_CACHE) == 0 || (xp->null_flags & NULLV_DROP) != 0 || (lvp->v_vflag & VV_NOSYNC) != 0) { /* * If this is the last reference and caching of the * nullfs vnodes is not enabled, or the lower vnode is * deleted, then free up the vnode so as not to tie up * the lower vnodes. */ return (1); } return (0); } static int null_inactive(struct vop_inactive_args *ap) { struct vnode *vp; vp = ap->a_vp; if (null_want_recycle(vp)) { vp->v_object = NULL; vrecycle(vp); } return (0); } static int null_need_inactive(struct vop_need_inactive_args *ap) { return (null_want_recycle(ap->a_vp) || vn_need_pageq_flush(ap->a_vp)); } /* * Now, the nullfs vnode and, due to the sharing lock, the lower * vnode, are exclusively locked, and we shall destroy the null vnode. */ static int null_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; struct null_node *xp; struct vnode *lowervp; vp = ap->a_vp; xp = VTONULL(vp); lowervp = xp->null_lowervp; KASSERT(lowervp != NULL && vp->v_vnlock != &vp->v_lock, ("Reclaiming incomplete null vnode %p", vp)); null_hashrem(xp); /* * Use the interlock to protect the clearing of v_data to * prevent faults in null_lock(). 
*/ lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); VI_LOCK(vp); vp->v_data = NULL; vp->v_object = NULL; vp->v_vnlock = &vp->v_lock; /* * If we were opened for write, we leased the write reference * to the lower vnode. If this is a reclamation due to the * forced unmount, undo the reference now. */ if (vp->v_writecount > 0) VOP_ADD_WRITECOUNT(lowervp, -vp->v_writecount); else if (vp->v_writecount < 0) vp->v_writecount = 0; VI_UNLOCK(vp); if ((xp->null_flags & NULLV_NOUNLOCK) != 0) vunref(lowervp); else vput(lowervp); free(xp, M_NULLFSNODE); return (0); } static int null_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; printf("\tvp=%p, lowervp=%p\n", vp, VTONULL(vp)->null_lowervp); return (0); } /* ARGSUSED */ static int null_getwritemount(struct vop_getwritemount_args *ap) { struct null_node *xp; struct vnode *lowervp; struct vnode *vp; vp = ap->a_vp; VI_LOCK(vp); xp = VTONULL(vp); if (xp && (lowervp = xp->null_lowervp)) { vholdnz(lowervp); VI_UNLOCK(vp); VOP_GETWRITEMOUNT(lowervp, ap->a_mpp); vdrop(lowervp); } else { VI_UNLOCK(vp); *(ap->a_mpp) = NULL; } return (0); } static int null_vptofh(struct vop_vptofh_args *ap) { struct vnode *lvp; lvp = NULLVPTOLOWERVP(ap->a_vp); return VOP_VPTOFH(lvp, ap->a_fhp); } static int null_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct vnode *lvp, *ldvp; struct mount *mp; int error, locked; locked = VOP_ISLOCKED(vp); lvp = NULLVPTOLOWERVP(vp); mp = vp->v_mount; error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) return (error); vhold(lvp); VOP_UNLOCK(vp); /* vp is held by vn_vptocnp_locked that called us */ ldvp = lvp; vref(lvp); error = vn_vptocnp(&ldvp, ap->a_buf, ap->a_buflen); vdrop(lvp); if (error != 0) { vn_lock(vp, locked | LK_RETRY); vfs_unbusy(mp); return (ENOENT); } error = vn_lock(ldvp, LK_SHARED); if (error != 0) { vrele(ldvp); vn_lock(vp, locked | LK_RETRY); vfs_unbusy(mp); return (ENOENT); } error = null_nodeget(mp, ldvp, dvp); if (error == 0) { #ifdef 
DIAGNOSTIC NULLVPTOLOWERVP(*dvp); #endif VOP_UNLOCK(*dvp); /* keep reference on *dvp */ } vn_lock(vp, locked | LK_RETRY); vfs_unbusy(mp); return (error); } static int null_read_pgcache(struct vop_read_pgcache_args *ap) { struct vnode *lvp, *vp; struct null_node *xp; int error; vp = ap->a_vp; VI_LOCK(vp); xp = VTONULL(vp); if (xp == NULL) { VI_UNLOCK(vp); return (EJUSTRETURN); } lvp = xp->null_lowervp; vref(lvp); VI_UNLOCK(vp); error = VOP_READ_PGCACHE(lvp, ap->a_uio, ap->a_ioflag, ap->a_cred); vrele(lvp); return (error); } static int null_advlock(struct vop_advlock_args *ap) { struct vnode *lvp, *vp; struct null_node *xp; int error; vp = ap->a_vp; VI_LOCK(vp); xp = VTONULL(vp); if (xp == NULL) { VI_UNLOCK(vp); return (EBADF); } lvp = xp->null_lowervp; vref(lvp); VI_UNLOCK(vp); error = VOP_ADVLOCK(lvp, ap->a_id, ap->a_op, ap->a_fl, ap->a_flags); vrele(lvp); return (error); } /* * Avoid standard bypass, since lower dvp and vp could be no longer * valid after vput(). */ static int null_vput_pair(struct vop_vput_pair_args *ap) { struct mount *mp; struct vnode *dvp, *ldvp, *lvp, *vp, *vp1, **vpp; int error, res; dvp = ap->a_dvp; ldvp = NULLVPTOLOWERVP(dvp); vref(ldvp); vpp = ap->a_vpp; vp = NULL; lvp = NULL; mp = NULL; if (vpp != NULL) vp = *vpp; if (vp != NULL) { lvp = NULLVPTOLOWERVP(vp); vref(lvp); if (!ap->a_unlock_vp) { vhold(vp); vhold(lvp); mp = vp->v_mount; vfs_ref(mp); } } res = VOP_VPUT_PAIR(ldvp, lvp != NULL ? 
&lvp : NULL, true); if (vp != NULL && ap->a_unlock_vp) vrele(vp); vrele(dvp); if (vp == NULL || ap->a_unlock_vp) return (res); /* lvp has been unlocked and vp might be reclaimed */ VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_data == NULL && vfs_busy(mp, MBF_NOWAIT) == 0) { vput(vp); vget(lvp, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(lvp)) { vput(lvp); vget(vp, LK_EXCLUSIVE | LK_RETRY); } else { error = null_nodeget(mp, lvp, &vp1); if (error == 0) { *vpp = vp1; } else { vget(vp, LK_EXCLUSIVE | LK_RETRY); } } vfs_unbusy(mp); } vdrop(lvp); vdrop(vp); vfs_rel(mp); return (res); } +static int +null_getlowvnode(struct vop_getlowvnode_args *ap) +{ + struct vnode *vp, *vpl; + + vp = ap->a_vp; + if (vn_lock(vp, LK_SHARED) != 0) + return (EBADF); + + vpl = NULLVPTOLOWERVP(vp); + vhold(vpl); + VOP_UNLOCK(vp); + VOP_GETLOWVNODE(vpl, ap->a_vplp, ap->a_flags); + vdrop(vpl); + return (0); +} + /* * Global vfs data structures */ struct vop_vector null_vnodeops = { .vop_bypass = null_bypass, .vop_access = null_access, .vop_accessx = null_accessx, .vop_advlock = null_advlock, .vop_advlockpurge = vop_stdadvlockpurge, .vop_bmap = VOP_EOPNOTSUPP, .vop_stat = null_stat, .vop_getattr = null_getattr, + .vop_getlowvnode = null_getlowvnode, .vop_getwritemount = null_getwritemount, .vop_inactive = null_inactive, .vop_need_inactive = null_need_inactive, .vop_islocked = vop_stdislocked, .vop_lock1 = null_lock, .vop_lookup = null_lookup, .vop_open = null_open, .vop_print = null_print, .vop_read_pgcache = null_read_pgcache, .vop_reclaim = null_reclaim, .vop_remove = null_remove, .vop_rename = null_rename, .vop_rmdir = null_rmdir, .vop_setattr = null_setattr, .vop_strategy = VOP_EOPNOTSUPP, .vop_unlock = null_unlock, .vop_vptocnp = null_vptocnp, .vop_vptofh = null_vptofh, .vop_add_writecount = null_add_writecount, .vop_vput_pair = null_vput_pair, }; VFS_VOP_VECTOR_REGISTER(null_vnodeops); diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index a342cbb80c9c..67c7ace5f72f 100644 
--- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -1,1609 +1,1619 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed * to Berkeley by John Heidemann of the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vop_nolookup(struct vop_lookup_args *); static int vop_norename(struct vop_rename_args *); static int vop_nostrategy(struct vop_strategy_args *); static int dirent_exists(struct vnode *vp, const char *dirname, struct thread *td); static int vop_stdis_text(struct vop_is_text_args *ap); static int vop_stdunset_text(struct vop_unset_text_args *ap); static int vop_stdadd_writecount(struct vop_add_writecount_args *ap); static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap); static int vop_stdfdatasync(struct vop_fdatasync_args *ap); static int vop_stdgetpages_async(struct vop_getpages_async_args *ap); static int vop_stdread_pgcache(struct vop_read_pgcache_args *ap); static int vop_stdstat(struct vop_stat_args *ap); static int vop_stdvput_pair(struct vop_vput_pair_args *ap); +static int vop_stdgetlowvnode(struct vop_getlowvnode_args *ap); /* * This vnode table stores what we want to do if the filesystem doesn't * implement a particular VOP. * * If there is no specific entry here, we will return EOPNOTSUPP. * * Note that every filesystem has to implement either vop_access * or vop_accessx; failing to do so will result in immediate crash * due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(), * which calls vop_stdaccess() etc. 
*/
/*
 * Table of default vnode operation handlers.  Operations that have no entry
 * below are handled by .vop_bypass and therefore fail with EOPNOTSUPP.
 * NOTE(review): .vop_default is NULL, so this looks like the last vector in
 * any fallback chain -- confirm against the VOP dispatch code.
 */
struct vop_vector default_vnodeops = {
	.vop_default =		NULL,
	.vop_bypass =		VOP_EOPNOTSUPP,

	.vop_access =		vop_stdaccess,
	.vop_accessx =		vop_stdaccessx,
	.vop_advise =		vop_stdadvise,
	.vop_advlock =		vop_stdadvlock,
	.vop_advlockasync =	vop_stdadvlockasync,
	.vop_advlockpurge =	vop_stdadvlockpurge,
	.vop_allocate =		vop_stdallocate,
	.vop_deallocate =	vop_stddeallocate,
	.vop_bmap =		vop_stdbmap,
	.vop_close =		VOP_NULL,
	.vop_fsync =		VOP_NULL,
	.vop_stat =		vop_stdstat,
	.vop_fdatasync =	vop_stdfdatasync,
+	.vop_getlowvnode =	vop_stdgetlowvnode,
	.vop_getpages =		vop_stdgetpages,
	.vop_getpages_async =	vop_stdgetpages_async,
	.vop_getwritemount =	vop_stdgetwritemount,
	.vop_inactive =		VOP_NULL,
	.vop_need_inactive =	vop_stdneed_inactive,
	.vop_ioctl =		vop_stdioctl,
	.vop_kqfilter =		vop_stdkqfilter,
	.vop_islocked =		vop_stdislocked,
	.vop_lock1 =		vop_stdlock,
	.vop_lookup =		vop_nolookup,
	.vop_open =		VOP_NULL,
	.vop_pathconf =		VOP_EINVAL,
	.vop_poll =		vop_nopoll,
	.vop_putpages =		vop_stdputpages,
	.vop_readlink =		VOP_EINVAL,
	.vop_read_pgcache =	vop_stdread_pgcache,
	.vop_rename =		vop_norename,
	.vop_revoke =		VOP_PANIC,
	.vop_strategy =		vop_nostrategy,
	.vop_unlock =		vop_stdunlock,
	.vop_vptocnp =		vop_stdvptocnp,
	.vop_vptofh =		vop_stdvptofh,
	.vop_unp_bind =		vop_stdunp_bind,
	.vop_unp_connect =	vop_stdunp_connect,
	.vop_unp_detach =	vop_stdunp_detach,
	.vop_is_text =		vop_stdis_text,
	.vop_set_text =		vop_stdset_text,
	.vop_unset_text =	vop_stdunset_text,
	.vop_add_writecount =	vop_stdadd_writecount,
	.vop_copy_file_range =	vop_stdcopy_file_range,
	.vop_vput_pair =	vop_stdvput_pair,
};
VFS_VOP_VECTOR_REGISTER(default_vnodeops);

/*
 * Series of placeholder functions for various error returns for
 * VOPs.
*/ int vop_eopnotsupp(struct vop_generic_args *ap) { /* printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); */ return (EOPNOTSUPP); } int vop_ebadf(struct vop_generic_args *ap) { return (EBADF); } int vop_enotty(struct vop_generic_args *ap) { return (ENOTTY); } int vop_einval(struct vop_generic_args *ap) { return (EINVAL); } int vop_enoent(struct vop_generic_args *ap) { return (ENOENT); } int vop_eagain(struct vop_generic_args *ap) { return (EAGAIN); } int vop_null(struct vop_generic_args *ap) { return (0); } /* * Helper function to panic on some bad VOPs in some filesystems. */ int vop_panic(struct vop_generic_args *ap) { panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name); } /* * vop_std and vop_no are default functions for use by * filesystems that need the "default reasonable" implementation for a * particular operation. * * The documentation for the operations they implement exists (if it exists) * in the VOP_(9) manpage (all uppercase). */ /* * Default vop for filesystems that do not support name lookup */ static int vop_nolookup(struct vop_lookup_args *ap) { *ap->a_vpp = NULL; return (ENOTDIR); } /* * vop_norename: * * Handle unlock and reference counting for arguments of vop_rename * for filesystems that do not implement rename operation. */ static int vop_norename(struct vop_rename_args *ap) { vop_rename_fail(ap); return (EOPNOTSUPP); } /* * vop_nostrategy: * * Strategy routine for VFS devices that have none. * * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy * routine. Typically this is done for a BIO_READ strategy call. * Typically B_INVAL is assumed to already be clear prior to a write * and should not be cleared manually unless you just made the buffer * invalid. BIO_ERROR should be cleared either way. 
*/
static int
vop_nostrategy (struct vop_strategy_args *ap)
{
	/* Log the buffer and the vnode the request was issued against. */
	printf("No strategy for buffer at %p\n", ap->a_bp);
	vn_printf(ap->a_vp, "vnode ");
	/* Mark the buffer as failed with EOPNOTSUPP and complete it. */
	ap->a_bp->b_ioflags |= BIO_ERROR;
	ap->a_bp->b_error = EOPNOTSUPP;
	bufdone(ap->a_bp);
	return (EOPNOTSUPP);
}

/*
 * Check if a named file exists in a given directory vnode
 *
 * vp must be a locked directory vnode (both are asserted).  Whiteout entries
 * and entries whose d_fileno is 0 are never treated as a match.
 *
 * Returns 0 if the file exists, ENOENT if it doesn't, or errors returned by
 * VOP_GETATTR() or vn_dir_next_dirent().
 */
static int
dirent_exists(struct vnode *vp, const char *dirname, struct thread *td)
{
	char *dirbuf;
	int error, eofflag;
	size_t dirbuflen, len;
	off_t off;
	struct dirent *dp;
	struct vattr va;

	ASSERT_VOP_LOCKED(vp, "vnode not locked");
	KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));

	error = VOP_GETATTR(vp, &va, td->td_ucred);
	if (error != 0)
		return (error);

	/*
	 * Size the scratch buffer to the larger of the generic minimum and
	 * the block size reported by the filesystem.
	 */
	dirbuflen = MAX(DEV_BSIZE, GENERIC_MAXDIRSIZ);
	if (dirbuflen < va.va_blocksize)
		dirbuflen = va.va_blocksize;
	dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);

	len = 0;
	off = 0;
	eofflag = 0;

	for (;;) {
		error = vn_dir_next_dirent(vp, td, dirbuf, dirbuflen,
		    &dp, &len, &off, &eofflag);
		if (error != 0)
			goto out;

		/* len == 0: no further entries, the name was not found. */
		if (len == 0)
			break;

		/* On a match error is still 0 here, which is the result. */
		if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
		    strcmp(dp->d_name, dirname) == 0)
			goto out;
	}

	error = ENOENT;

out:
	free(dirbuf, M_TEMP);
	return (error);
}

/*
 * Default VOP_ACCESS, expressed through VOP_ACCESSX.  Only the VEXEC,
 * VWRITE, VREAD, VADMIN and VAPPEND bits may be set in a_accmode (asserted).
 * Because vop_stdaccessx() in turn calls VOP_ACCESS, a filesystem has to
 * provide at least one of the two operations itself.
 */
int
vop_stdaccess(struct vop_access_args *ap)
{

	KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN |
	    VAPPEND)) == 0, ("invalid bit in accmode"));

	return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td));
}

/*
 * Default VOP_ACCESSX, expressed through VOP_ACCESS.  The requested mode is
 * first passed through vfs_unixify_accmode(); if that fails its error is
 * returned, and if no bits remain afterwards access is granted without
 * consulting the filesystem.
 */
int
vop_stdaccessx(struct vop_accessx_args *ap)
{
	int error;
	accmode_t accmode = ap->a_accmode;

	error = vfs_unixify_accmode(&accmode);
	if (error != 0)
		return (error);

	if (accmode == 0)
		return (0);

	return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td));
}

/*
 * Advisory record locking support
 *
 * Hands the request to lf_advlock() using the vnode's v_lockf list.  The
 * file size is fetched only for SEEK_END requests; otherwise 0 is passed.
 */
int
vop_stdadvlock(struct vop_advlock_args *ap)
{
	struct vnode *vp;
	struct mount *mp;
	struct vattr vattr;
	int error;

	vp = ap->a_vp;

	/*
	 * Provide atomicity of open(O_CREAT | O_EXCL | O_EXLOCK) for
	 * local filesystems.  See vn_open_cred() for reciprocal part.
	 *
	 * An F_SETLK request that does not carry F_FIRSTOPEN sleeps, under
	 * the vnode interlock, until VI_FOPENING is clear.
	 */
	mp = vp->v_mount;
	if (mp != NULL && (mp->mnt_flag & MNT_LOCAL) != 0 &&
	    ap->a_op == F_SETLK && (ap->a_flags & F_FIRSTOPEN) == 0) {
		VI_LOCK(vp);
		while ((vp->v_iflag & VI_FOPENING) != 0)
			msleep(vp, VI_MTX(vp), PLOCK, "lockfo", 0);
		VI_UNLOCK(vp);
	}

	if (ap->a_fl->l_whence == SEEK_END) {
		/*
		 * The NFSv4 server must avoid doing a vn_lock() here, since it
		 * can deadlock the nfsd threads, due to a LOR.  Fortunately
		 * the NFSv4 server always uses SEEK_SET and this code is
		 * only required for the SEEK_END case.
		 */
		vn_lock(vp, LK_SHARED | LK_RETRY);
		error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
		VOP_UNLOCK(vp);
		if (error)
			return (error);
	} else
		vattr.va_size = 0;

	return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size));
}

/*
 * Asynchronous counterpart of vop_stdadvlock(): same size lookup for
 * SEEK_END, then the request is handed to lf_advlockasync().  Unlike
 * vop_stdadvlock() it does not wait for VI_FOPENING.
 */
int
vop_stdadvlockasync(struct vop_advlockasync_args *ap)
{
	struct vnode *vp;
	struct vattr vattr;
	int error;

	vp = ap->a_vp;
	if (ap->a_fl->l_whence == SEEK_END) {
		/* The size argument is only needed for SEEK_END. */
		vn_lock(vp, LK_SHARED | LK_RETRY);
		error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
		VOP_UNLOCK(vp);
		if (error)
			return (error);
	} else
		vattr.va_size = 0;

	return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size));
}

/*
 * Pass the vnode's v_lockf list to lf_purgelocks(); always returns 0.
 */
int
vop_stdadvlockpurge(struct vop_advlockpurge_args *ap)
{
	struct vnode *vp;

	vp = ap->a_vp;
	lf_purgelocks(vp, &vp->v_lockf);
	return (0);
}

/*
 * vop_stdpathconf:
 *
 * Standard implementation of POSIX pathconf, to get information about limits
 * for a filesystem.
 * Override per filesystem for the case where the filesystem has smaller
 * limits.
*/ int vop_stdpathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_ASYNC_IO: *ap->a_retval = _POSIX_ASYNCHRONOUS_IO; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_ACL_EXTENDED: case _PC_ACL_NFS4: case _PC_CAP_PRESENT: case _PC_DEALLOC_PRESENT: case _PC_INF_PRESENT: case _PC_MAC_PRESENT: *ap->a_retval = 0; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Standard lock, unlock and islocked functions. */ int vop_stdlock(struct vop_lock1_args *ap) { struct vnode *vp = ap->a_vp; struct mtx *ilk; ilk = VI_MTX(vp); return (lockmgr_lock_flags(vp->v_vnlock, ap->a_flags, &ilk->lock_object, ap->a_file, ap->a_line)); } /* See above. */ int vop_stdunlock(struct vop_unlock_args *ap) { struct vnode *vp = ap->a_vp; return (lockmgr_unlock(vp->v_vnlock)); } /* See above. */ int vop_stdislocked(struct vop_islocked_args *ap) { return (lockstatus(ap->a_vp->v_vnlock)); } /* * Variants of the above set. * * Differences are: * - shared locking disablement is not supported * - v_vnlock pointer is not honored */ int vop_lock(struct vop_lock1_args *ap) { struct vnode *vp = ap->a_vp; int flags = ap->a_flags; struct mtx *ilk; MPASS(vp->v_vnlock == &vp->v_lock); if (__predict_false((flags & ~(LK_TYPE_MASK | LK_NODDLKTREAT | LK_RETRY)) != 0)) goto other; switch (flags & LK_TYPE_MASK) { case LK_SHARED: return (lockmgr_slock(&vp->v_lock, flags, ap->a_file, ap->a_line)); case LK_EXCLUSIVE: return (lockmgr_xlock(&vp->v_lock, flags, ap->a_file, ap->a_line)); } other: ilk = VI_MTX(vp); return (lockmgr_lock_flags(&vp->v_lock, flags, &ilk->lock_object, ap->a_file, ap->a_line)); } int vop_unlock(struct vop_unlock_args *ap) { struct vnode *vp = ap->a_vp; MPASS(vp->v_vnlock == &vp->v_lock); return (lockmgr_unlock(&vp->v_lock)); } int vop_islocked(struct vop_islocked_args *ap) { struct vnode *vp = ap->a_vp; MPASS(vp->v_vnlock == &vp->v_lock); return (lockstatus(&vp->v_lock)); } /* * Return true for select/poll. 
*/ int vop_nopoll(struct vop_poll_args *ap) { if (ap->a_events & ~POLLSTANDARD) return (POLLNVAL); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Implement poll for local filesystems that support it. */ int vop_stdpoll(struct vop_poll_args *ap) { if (ap->a_events & ~POLLSTANDARD) return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events)); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Return our mount point, as we will take charge of the writes. */ int vop_stdgetwritemount(struct vop_getwritemount_args *ap) { struct mount *mp; struct vnode *vp; /* * Note that having a reference does not prevent forced unmount from * setting ->v_mount to NULL after the lock gets released. This is of * no consequence for typical consumers (most notably vn_start_write) * since in this case the vnode is VIRF_DOOMED. Unmount might have * progressed far enough that its completion is only delayed by the * reference obtained here. The consumer only needs to concern itself * with releasing it. */ vp = ap->a_vp; mp = vfs_ref_from_vp(vp); *(ap->a_mpp) = mp; return (0); } /* * If the file system doesn't implement VOP_BMAP, then return sensible defaults: * - Return the vnode's bufobj instead of any underlying device's bufobj * - Calculate the physical block number as if there were equal size * consecutive blocks, but * - Report no contiguous runs of blocks. 
*/ int vop_stdbmap(struct vop_bmap_args *ap) { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } int vop_stdfsync(struct vop_fsync_args *ap) { return (vn_fsync_buf(ap->a_vp, ap->a_waitfor)); } static int vop_stdfdatasync(struct vop_fdatasync_args *ap) { return (VOP_FSYNC(ap->a_vp, MNT_WAIT, ap->a_td)); } int vop_stdfdatasync_buf(struct vop_fdatasync_args *ap) { return (vn_fsync_buf(ap->a_vp, MNT_WAIT)); } /* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */ int vop_stdgetpages(struct vop_getpages_args *ap) { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL); } static int vop_stdgetpages_async(struct vop_getpages_async_args *ap) { int error; error = VOP_GETPAGES(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead); if (ap->a_iodone != NULL) ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); return (error); } int vop_stdkqfilter(struct vop_kqfilter_args *ap) { return vfs_kqfilter(ap); } /* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). 
*/
/*
 * Default VOP_PUTPAGES: hand the request, unchanged, to
 * vnode_pager_generic_putpages() and return its result.
 */
int
vop_stdputpages(struct vop_putpages_args *ap)
{

	return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
	    ap->a_sync, ap->a_rtvals);
}

/* Default VOP_VPTOFH: always fails with EOPNOTSUPP. */
int
vop_stdvptofh(struct vop_vptofh_args *ap)
{
	return (EOPNOTSUPP);
}

/*
 * Default VOP_VPTOCNP: find the name of directory a_vp within its parent.
 *
 * The parent is obtained by opening ".." relative to vp and is then scanned
 * for an entry whose file number equals vp's va_fileid.  On success the name
 * (without a terminating NUL) is copied so that it ends at a_buf + the
 * incoming *a_buflen, *a_buflen is lowered to the offset of its first byte,
 * and *a_vpp holds the parent vnode with a reference added by vref().
 *
 * vp is unlocked while the parent is examined and is relocked with its
 * original lock state before returning, on every path past the first two
 * checks.
 *
 * Returns ENOENT when vp or the parent is not a directory, when no entry
 * matches, when the matching entry is ".", or, in the covered case, when the
 * name also exists in the covering directory; ENOMEM when the name does not
 * fit in front of the current *a_buflen; otherwise any error from
 * VOP_GETATTR(), vn_open_cred() or vn_dir_next_dirent().
 */
int
vop_stdvptocnp(struct vop_vptocnp_args *ap)
{
	struct vnode *const vp = ap->a_vp;
	struct vnode **const dvp = ap->a_vpp;
	char *buf = ap->a_buf;
	size_t *buflen = ap->a_buflen;
	char *dirbuf;
	int i = *buflen;
	int error = 0, covered = 0;
	int eofflag, flags, locked;
	size_t dirbuflen, len;
	off_t off;
	ino_t fileno;
	struct vattr va;
	struct nameidata nd;
	struct thread *const td = curthread;
	struct ucred *const cred = td->td_ucred;
	struct dirent *dp;
	struct vnode *mvp;

	if (vp->v_type != VDIR)
		return (ENOENT);

	error = VOP_GETATTR(vp, &va, cred);
	if (error)
		return (error);

	/* Remember how vp was locked so the state can be restored later. */
	VREF(vp);
	locked = VOP_ISLOCKED(vp);
	VOP_UNLOCK(vp);
	NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE,
	    "..", vp);
	flags = FREAD;
	error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL);
	if (error) {
		vn_lock(vp, locked | LK_RETRY);
		return (error);
	}
	NDFREE_PNBUF(&nd);

	mvp = *dvp = nd.ni_vp;

	/*
	 * ".." resolved to the root of a different, MNT_UNION mount: scan
	 * the vnode that mount covers instead.  mvp keeps referring to the
	 * opened root; it is closed here and a plain reference is kept for
	 * the duplicate-name check below.
	 */
	if (vp->v_mount != (*dvp)->v_mount &&
	    ((*dvp)->v_vflag & VV_ROOT) &&
	    ((*dvp)->v_mount->mnt_flag & MNT_UNION)) {
		*dvp = (*dvp)->v_mount->mnt_vnodecovered;
		VREF(mvp);
		VOP_UNLOCK(mvp);
		vn_close(mvp, FREAD, cred, td);
		VREF(*dvp);
		vn_lock(*dvp, LK_SHARED | LK_RETRY);
		covered = 1;
	}

	fileno = va.va_fileid;

	dirbuflen = MAX(DEV_BSIZE, GENERIC_MAXDIRSIZ);
	if (dirbuflen < va.va_blocksize)
		dirbuflen = va.va_blocksize;
	dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);

	if ((*dvp)->v_type != VDIR) {
		error = ENOENT;
		goto out;
	}

	len = 0;
	off = 0;
	eofflag = 0;

	for (;;) {
		/* call VOP_READDIR of parent */
		error = vn_dir_next_dirent(*dvp, td,
		    dirbuf, dirbuflen, &dp, &len, &off, &eofflag);
		if (error != 0)
			goto out;

		/* End of directory without a match. */
		if (len == 0) {
			error = ENOENT;
			goto out;
		}

		if ((dp->d_type != DT_WHT) &&
		    (dp->d_fileno == fileno)) {
			if (covered) {
				/*
				 * Reject the match if the covering directory
				 * has an entry of the same name.  Only one of
				 * the two vnodes is locked at any time.
				 */
				VOP_UNLOCK(*dvp);
				vn_lock(mvp, LK_SHARED | LK_RETRY);
				if (dirent_exists(mvp, dp->d_name, td) == 0) {
					error = ENOENT;
					VOP_UNLOCK(mvp);
					vn_lock(*dvp, LK_SHARED | LK_RETRY);
					goto out;
				}
				VOP_UNLOCK(mvp);
				vn_lock(*dvp, LK_SHARED | LK_RETRY);
			}
			/* The name is placed immediately before offset i. */
			i -= dp->d_namlen;

			if (i < 0) {
				error = ENOMEM;
				goto out;
			}
			if (dp->d_namlen == 1 && dp->d_name[0] == '.') {
				error = ENOENT;
			} else {
				bcopy(dp->d_name, buf + i, dp->d_namlen);
				error = 0;
			}
			goto out;
		}
	}

out:
	free(dirbuf, M_TEMP);
	if (!error) {
		*buflen = i;
		vref(*dvp);
	}
	/*
	 * Drop what was acquired above: in the covered case the lock and
	 * reference on *dvp plus the extra reference on mvp, otherwise the
	 * lock and open state of mvp.
	 */
	if (covered) {
		vput(*dvp);
		vrele(mvp);
	} else {
		VOP_UNLOCK(mvp);
		vn_close(mvp, FREAD, cred, td);
	}
	vn_lock(vp, locked | LK_RETRY);
	return (error);
}

/*
 * Default VOP_ALLOCATE: rewrite the range [*a_offset, *a_offset + *a_len) in
 * chunks of at most one I/O size.  Data below the file size recorded at
 * entry is read and written back; the remainder is written as zeros.
 *
 * On return *a_len and *a_offset describe the part of the range not yet
 * processed.  The loop stops early, with error 0 and *a_len still positive,
 * when should_yield() is true -- presumably the caller calls again; confirm
 * against the callers.
 */
int
vop_stdallocate(struct vop_allocate_args *ap)
{
#ifdef __notyet__
	struct statfs *sfs;
	off_t maxfilesize = 0;
#endif
	struct iovec aiov;
	struct vattr vattr, *vap;
	struct uio auio;
	off_t fsize, len, cur, offset;
	uint8_t *buf;
	struct thread *td;
	struct vnode *vp;
	size_t iosize;
	int error;

	buf = NULL;
	error = 0;
	td = curthread;
	vap = &vattr;
	vp = ap->a_vp;
	len = *ap->a_len;
	offset = *ap->a_offset;

	error = VOP_GETATTR(vp, vap, ap->a_cred);
	if (error != 0)
		goto out;
	fsize = vap->va_size;
	/* Chunk size: the file's block size, defaulted and capped. */
	iosize = vap->va_blocksize;
	if (iosize == 0)
		iosize = BLKDEV_IOSIZE;
	if (iosize > maxphys)
		iosize = maxphys;
	buf = malloc(iosize, M_TEMP, M_WAITOK);

#ifdef __notyet__
	/*
	 * Check if the filesystem sets f_maxfilesize; if not use
	 * VOP_SETATTR to perform the check.
	 */
	sfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
	error = VFS_STATFS(vp->v_mount, sfs, td);
	if (error == 0)
		maxfilesize = sfs->f_maxfilesize;
	free(sfs, M_STATFS);
	if (error != 0)
		goto out;
	if (maxfilesize) {
		if (offset > maxfilesize || len > maxfilesize ||
		    offset + len > maxfilesize) {
			error = EFBIG;
			goto out;
		}
	} else
#endif
	if (offset + len > vap->va_size) {
		/*
		 * Test offset + len against the filesystem's maxfilesize.
		 * The size is set to the target and then straight back to
		 * the original value; only the error result is of interest.
		 */
		VATTR_NULL(vap);
		vap->va_size = offset + len;
		error = VOP_SETATTR(vp, vap, ap->a_cred);
		if (error != 0)
			goto out;
		VATTR_NULL(vap);
		vap->va_size = fsize;
		error = VOP_SETATTR(vp, vap, ap->a_cred);
		if (error != 0)
			goto out;
	}

	for (;;) {
		/*
		 * Read and write back anything below the nominal file
		 * size.  There's currently no way outside the filesystem
		 * to know whether this area is sparse or not.
		 */
		/* First chunk is shortened so later ones start aligned. */
		cur = iosize;
		if ((offset % iosize) != 0)
			cur -= (offset % iosize);
		if (cur > len)
			cur = len;
		if (offset < fsize) {
			aiov.iov_base = buf;
			aiov.iov_len = cur;
			auio.uio_iov = &aiov;
			auio.uio_iovcnt = 1;
			auio.uio_offset = offset;
			auio.uio_resid = cur;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_td = td;
			error = VOP_READ(vp, &auio, ap->a_ioflag, ap->a_cred);
			if (error != 0)
				break;
			/* Short read: zero the unread tail of the chunk. */
			if (auio.uio_resid > 0) {
				bzero(buf + cur - auio.uio_resid,
				    auio.uio_resid);
			}
		} else {
			bzero(buf, cur);
		}

		aiov.iov_base = buf;
		aiov.iov_len = cur;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = offset;
		auio.uio_resid = cur;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_rw = UIO_WRITE;
		auio.uio_td = td;

		error = VOP_WRITE(vp, &auio, ap->a_ioflag, ap->a_cred);
		if (error != 0)
			break;

		len -= cur;
		offset += cur;
		if (len == 0)
			break;
		if (should_yield())
			break;
	}

out:
	*ap->a_len = len;
	*ap->a_offset = offset;
	free(buf, M_TEMP);
	return (error);
}

/*
 * Write zeros, sourced from zero_region, over *lenp bytes starting at
 * *offsetp.  On return *offsetp and *lenp reflect what is left; after a
 * failed write they account for the bytes that write did transfer.
 * Returns 0 or the error from VOP_WRITE().
 */
static int
vp_zerofill(struct vnode *vp, struct vattr *vap, off_t *offsetp, off_t *lenp,
    int ioflag, struct ucred *cred)
{
	int iosize;
	int error = 0;
	struct iovec aiov;
	struct uio auio;
	struct thread *td;
	off_t offset, len;

	iosize = vap->va_blocksize;
	td = curthread;
	offset = *offsetp;
	len = *lenp;

	if (iosize == 0)
		iosize = BLKDEV_IOSIZE;
	/* If va_blocksize is 512 bytes, iosize will be 4 kilobytes */
	iosize = min(iosize * 8, ZERO_REGION_SIZE);

	while (len > 0) {
		/* First transfer is shortened so later ones start aligned. */
		int xfersize = iosize;
		if (offset % iosize != 0)
			xfersize -= offset % iosize;
		if (xfersize > len)
			xfersize = len;

		aiov.iov_base = __DECONST(void *, zero_region);
		aiov.iov_len = xfersize;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = offset;
		auio.uio_resid = xfersize;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_rw = UIO_WRITE;
		auio.uio_td = td;

		error = VOP_WRITE(vp, &auio, ioflag, cred);
		if (error != 0) {
			len -= xfersize - auio.uio_resid;
			offset += xfersize - auio.uio_resid;
			break;
		}

		len -= xfersize;
		offset += xfersize;
	}

	*offsetp = offset;
	*lenp = len;
	return (error);
}

/*
 * Default VOP_DEALLOCATE: zero-fill the data regions of the requested range,
 * skipping over holes located with vn_bmap_seekhole_locked().  The range is
 * clipped to the file size reported by VOP_GETATTR().
 *
 * *a_offset and *a_len are updated to describe the unprocessed remainder on
 * every path except a VOP_GETATTR() failure.  The loop stops early when
 * should_yield() is true.
 */
int
vop_stddeallocate(struct vop_deallocate_args *ap)
{
	struct vnode *vp;
	off_t offset, len;
	struct ucred *cred;
	int error;
	struct vattr va;
	off_t noff, xfersize, rem;

	vp = ap->a_vp;
	offset = *ap->a_offset;
	cred = ap->a_cred;

	error = VOP_GETATTR(vp, &va, cred);
	if (error)
		return (error);

	len = omin((off_t)va.va_size - offset, *ap->a_len);
	while (len > 0) {
		noff = offset;
		error = vn_bmap_seekhole_locked(vp, FIOSEEKDATA, &noff, cred);
		if (error) {
			if (error != ENXIO)
				/* XXX: Is it okay to fallback further? */
				goto out;

			/*
			 * No more data region to be filled
			 */
			offset += len;
			len = 0;
			error = 0;
			break;
		}
		KASSERT(noff >= offset, ("FIOSEEKDATA going backward"));
		/* Skip the hole in front of the next data region. */
		if (noff != offset) {
			xfersize = omin(noff - offset, len);
			len -= xfersize;
			offset += xfersize;
			if (len == 0)
				break;
		}
		error = vn_bmap_seekhole_locked(vp, FIOSEEKHOLE, &noff, cred);
		if (error)
			goto out;

		/* Fill zeroes */
		xfersize = rem = omin(noff - offset, len);
		error = vp_zerofill(vp, &va, &offset, &rem, ap->a_ioflag, cred);
		if (error) {
			len -= xfersize - rem;
			goto out;
		}

		len -= xfersize;
		if (should_yield())
			break;
	}
	/* Handle the case when offset is beyond EOF */
	if (len < 0)
		len = 0;
out:
	*ap->a_offset = offset;
	*ap->a_len = len;
	return (error);
}

/*
 * Default VOP_ADVISE.  POSIX_FADV_WILLNEED is accepted and ignored;
 * POSIX_FADV_DONTNEED deactivates the pages and marks the buffers covering
 * the range for no reuse; any other advice yields EINVAL.
 */
int
vop_stdadvise(struct vop_advise_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	daddr_t startn, endn;
	off_t bstart, bend, start, end;
	int bsize, error;

	vp = ap->a_vp;
	switch (ap->a_advice) {
	case POSIX_FADV_WILLNEED:
		/*
		 * Do nothing for now.  Filesystems should provide a
		 * custom method which starts an asynchronous read of
		 * the requested region.
		 */
		error = 0;
		break;
	case POSIX_FADV_DONTNEED:
		error = 0;
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		/* Nothing to do for a doomed vnode; still reports success. */
		if (VN_IS_DOOMED(vp)) {
			VOP_UNLOCK(vp);
			break;
		}

		/*
		 * Round to block boundaries (and later possibly further to
		 * page boundaries).  Applications cannot reasonably be aware
		 * of the boundaries, and the rounding must be to expand at
		 * both extremities to cover enough.  It still doesn't cover
		 * read-ahead.  For partial blocks, this gives unnecessary
		 * discarding of buffers but is efficient enough since the
		 * pages usually remain in VMIO for some time.
		 */
		bsize = vp->v_bufobj.bo_bsize;
		bstart = rounddown(ap->a_start, bsize);
		bend = roundup(ap->a_end, bsize);

		/*
		 * Deactivate pages in the specified range from the backing VM
		 * object.  Pages that are resident in the buffer cache will
		 * remain wired until their corresponding buffers are released
		 * below.
		 */
		if (vp->v_object != NULL) {
			start = trunc_page(bstart);
			end = round_page(bend);
			VM_OBJECT_RLOCK(vp->v_object);
			vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start),
			    OFF_TO_IDX(end));
			VM_OBJECT_RUNLOCK(vp->v_object);
		}

		/* Clean list first; dirty list only if that succeeded. */
		bo = &vp->v_bufobj;
		BO_RLOCK(bo);
		startn = bstart / bsize;
		endn = bend / bsize;
		error = bnoreuselist(&bo->bo_clean, bo, startn, endn);
		if (error == 0)
			error = bnoreuselist(&bo->bo_dirty, bo, startn, endn);
		BO_RUNLOCK(bo);
		VOP_UNLOCK(vp);
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}

/* Record a_unpcb in the vnode's v_unpcb field. */
int
vop_stdunp_bind(struct vop_unp_bind_args *ap)
{

	ap->a_vp->v_unpcb = ap->a_unpcb;
	return (0);
}

/* Return the vnode's v_unpcb through *a_unpcb. */
int
vop_stdunp_connect(struct vop_unp_connect_args *ap)
{

	*ap->a_unpcb = ap->a_vp->v_unpcb;
	return (0);
}

/* Clear the vnode's v_unpcb field. */
int
vop_stdunp_detach(struct vop_unp_detach_args *ap)
{

	ap->a_vp->v_unpcb = NULL;
	return (0);
}

/* True when v_writecount is negative, i.e. the vnode is in use as text. */
static int
vop_stdis_text(struct vop_is_text_args *ap)
{

	return (atomic_load_int(&ap->a_vp->v_writecount) < 0);
}

/*
 * Register one more text use by moving v_writecount one step further below
 * zero.  Fails with ETXTBSY while the count is positive.
 * NOTE(review): the retry loops here rely on atomic_fcmpset_int() storing
 * the observed value into n on failure -- confirm against atomic(9).
 */
int
vop_stdset_text(struct vop_set_text_args *ap)
{
	struct vnode *vp;
	int n;
	bool gotref;

	vp = ap->a_vp;

	n = atomic_load_int(&vp->v_writecount);
	for (;;) {
		if (__predict_false(n > 0)) {
			return (ETXTBSY);
		}

		/*
		 * Transition point, we may need to grab a reference on the vnode.
		 *
		 * Take the ref early as a safety measure against bogus calls
		 * to vop_stdunset_text.
		 */
		if (n == 0) {
			gotref = false;
			if ((vn_irflag_read(vp) & VIRF_TEXT_REF) != 0) {
				vref(vp);
				gotref = true;
			}
			if (atomic_fcmpset_int(&vp->v_writecount, &n, -1)) {
				return (0);
			}
			/* Lost the race: undo the early reference, retry. */
			if (gotref) {
				vunref(vp);
			}
			continue;
		}

		MPASS(n < 0);
		if (atomic_fcmpset_int(&vp->v_writecount, &n, n - 1)) {
			return (0);
		}
	}
	__assert_unreachable();
}

/*
 * Undo one vop_stdset_text() by moving v_writecount one step toward zero.
 * Fails with EINVAL if the count is not negative.
 */
static int
vop_stdunset_text(struct vop_unset_text_args *ap)
{
	struct vnode *vp;
	int n;

	vp = ap->a_vp;

	n = atomic_load_int(&vp->v_writecount);
	for (;;) {
		if (__predict_false(n >= 0)) {
			return (EINVAL);
		}

		/*
		 * Transition point, we may need to release a reference on the vnode.
		 */
		if (n == -1) {
			if (atomic_fcmpset_int(&vp->v_writecount, &n, 0)) {
				if ((vn_irflag_read(vp) & VIRF_TEXT_REF) != 0) {
					vunref(vp);
				}
				return (0);
			}
			continue;
		}

		MPASS(n < -1);
		if (atomic_fcmpset_int(&vp->v_writecount, &n, n + 1)) {
			return (0);
		}
	}
	__assert_unreachable();
}

/*
 * Common body of the add_writecount defaults: add a_inc to v_writecount,
 * failing with ETXTBSY while the count is negative (text in use).
 *
 * handle_msync selects the flavour: when true, vlazy() is called on the
 * vnode whenever the count is observed at 0, and (under INVARIANTS) the
 * mount must not have MNTK_NOMSYNC set; when false the mount must have it.
 */
static int __always_inline
vop_stdadd_writecount_impl(struct vop_add_writecount_args *ap, bool handle_msync)
{
	struct vnode *vp;
	struct mount *mp __diagused;
	int n;

	vp = ap->a_vp;

#ifdef INVARIANTS
	mp = vp->v_mount;
	if (mp != NULL) {
		if (handle_msync) {
			VNPASS((mp->mnt_kern_flag & MNTK_NOMSYNC) == 0, vp);
		} else {
			VNPASS((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0, vp);
		}
	}
#endif

	n = atomic_load_int(&vp->v_writecount);
	for (;;) {
		if (__predict_false(n < 0)) {
			return (ETXTBSY);
		}

		VNASSERT(n + ap->a_inc >= 0, vp,
		    ("neg writecount increment %d + %d = %d", n, ap->a_inc,
		    n + ap->a_inc));
		if (n == 0) {
			if (handle_msync) {
				vlazy(vp);
			}
		}

		if (atomic_fcmpset_int(&vp->v_writecount, &n, n + ap->a_inc)) {
			return (0);
		}
	}
	__assert_unreachable();
}

/* add_writecount flavour with msync handling enabled. */
int
vop_stdadd_writecount(struct vop_add_writecount_args *ap)
{

	return (vop_stdadd_writecount_impl(ap, true));
}

/* add_writecount flavour with msync handling disabled. */
int
vop_stdadd_writecount_nomsync(struct vop_add_writecount_args *ap)
{

	return (vop_stdadd_writecount_impl(ap, false));
}

/* Default VOP_NEED_INACTIVE: always answers 1. */
int
vop_stdneed_inactive(struct vop_need_inactive_args *ap)
{

	return (1);
}

/*
 * Default VOP_IOCTL.  Only FIOSEEKDATA and FIOSEEKHOLE are handled, treating
 * a regular file as a single data region that ends at va_size: FIOSEEKDATA
 * leaves the offset alone and FIOSEEKHOLE moves it to va_size.
 *
 * Returns EBADF if the vnode cannot be locked, ENOTTY for non-regular files
 * and for every other command, ENXIO when the offset is negative or not
 * below va_size, or an error from VOP_GETATTR().
 */
int
vop_stdioctl(struct vop_ioctl_args *ap)
{
	struct vnode *vp;
	struct vattr va;
	off_t *offp;
	int error;

	switch (ap->a_command) {
	case FIOSEEKDATA:
	case FIOSEEKHOLE:
		vp = ap->a_vp;
		error = vn_lock(vp, LK_SHARED);
		if (error != 0)
			return (EBADF);
		if (vp->v_type == VREG)
			error = VOP_GETATTR(vp, &va, ap->a_cred);
		else
			error = ENOTTY;
		if (error == 0) {
			offp = ap->a_data;
			if (*offp < 0 || *offp >= va.va_size)
				error = ENXIO;
			else if (ap->a_command == FIOSEEKHOLE)
				*offp = va.va_size;
		}
		VOP_UNLOCK(vp);
		break;
	default:
		error = ENOTTY;
		break;
	}
	return (error);
}

/*
 * vfs default ops
 * used to fill the vfs function table to get reasonable default return values.
 */
int
vfs_stdroot(struct mount *mp, int flags, struct vnode **vpp)
{

	return (EOPNOTSUPP);
}

int
vfs_stdstatfs(struct mount *mp, struct statfs *sbp)
{

	return (EOPNOTSUPP);
}

int
vfs_stdquotactl(struct mount *mp, int cmds, uid_t uid, void *arg, bool *mp_busy)
{
	return (EOPNOTSUPP);
}

/*
 * Default sync: VOP_FSYNC every vnode of the mount that has dirty buffers.
 * Vnodes are locked exclusively, without waiting unless waitfor is MNT_WAIT.
 * Returns 0, or the error of the last VOP_FSYNC() that failed.
 */
int
vfs_stdsync(struct mount *mp, int waitfor)
{
	struct vnode *vp, *mvp;
	struct thread *td;
	int error, lockreq, allerror = 0;

	td = curthread;
	lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
	if (waitfor != MNT_WAIT)
		lockreq |= LK_NOWAIT;
	/*
	 * Force stale buffer cache information to be flushed.
	 */
loop:
	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		/* Nothing dirty: drop the interlock and move on. */
		if (vp->v_bufobj.bo_dirty.bv_cnt == 0) {
			VI_UNLOCK(vp);
			continue;
		}
		/* ENOENT from vget() restarts the scan; others skip vp. */
		if ((error = vget(vp, lockreq)) != 0) {
			if (error == ENOENT) {
				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
				goto loop;
			}
			continue;
		}
		error = VOP_FSYNC(vp, waitfor, td);
		if (error)
			allerror = error;
		vput(vp);
	}
	return (allerror);
}

/* Sync for filesystems with nothing to flush: always succeeds. */
int
vfs_stdnosync(struct mount *mp, int waitfor)
{

	return (0);
}

/*
 * Default VOP_COPY_FILE_RANGE: pass every argument through to
 * vn_generic_copy_file_range() and return its result.
 */
static int
vop_stdcopy_file_range(struct vop_copy_file_range_args *ap)
{
	int error;

	error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
	    ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred,
	    ap->a_outcred, ap->a_fsizetd);
	return (error);
}

int
vfs_stdvget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
{

	return (EOPNOTSUPP);
}

int
vfs_stdfhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
{

	return (EOPNOTSUPP);
}

int
vfs_stdinit(struct vfsconf *vfsp)
{

	return (0);
}

int
vfs_stduninit(struct vfsconf *vfsp)
{

	return(0);
}

/*
 * Default extattrctl: unlock filename_vp when one was passed, then fail
 * with EOPNOTSUPP.
 */
int
vfs_stdextattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
    int attrnamespace, const char *attrname)
{

	if (filename_vp != NULL)
		VOP_UNLOCK(filename_vp);
	return (EOPNOTSUPP);
}

int
vfs_stdsysctl(struct mount *mp, fsctlop_t op, struct sysctl_req *req)
{

	return (EOPNOTSUPP);
}

/*
 * Fetch the function pointer stored in vop at the byte offset given by the
 * operation descriptor's vdesc_vop_offset.
 */
static vop_bypass_t *
bp_by_off(struct vop_vector *vop, struct vop_generic_args *a)
{

	return (*(vop_bypass_t **)((char *)vop + a->a_desc->vdesc_vop_offset));
}

/*
 * Call the operation described by a, taken from vector vop, between
 * sigdeferstop(SIGDEFERSTOP_SILENT) and the matching sigallowstop().
 * The entry must be non-NULL (asserted).  Returns the operation's result.
 */
int
vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a)
{
	vop_bypass_t *bp;
	int prev_stops, rc;

	bp = bp_by_off(vop, a);
	MPASS(bp != NULL);

	prev_stops = sigdeferstop(SIGDEFERSTOP_SILENT);
	rc = bp(a);
	sigallowstop(prev_stops);
	return (rc);
}

/*
 * Default VOP_STAT: fill *a_sb from the attributes VOP_GETATTR() reports.
 * The call is bracketed by vop_stat_helper_pre()/vop_stat_helper_post();
 * the value returned on every path past the pre-helper is whatever the
 * post-helper returns.
 *
 * Sets EBADF for an unrecognised va_type and EOVERFLOW when va_size exceeds
 * OFF_MAX.
 */
static int
vop_stdstat(struct vop_stat_args *a)
{
	struct vattr vattr;
	struct vattr *vap;
	struct vnode *vp;
	struct stat *sb;
	int error;
	u_short mode;

	vp = a->a_vp;
	sb = a->a_sb;

	error = vop_stat_helper_pre(a);
	if (error != 0)
		return (error);

	vap = &vattr;

	/*
	 * Initialize defaults for new and unusual fields, so that file
	 * systems which don't support these fields don't need to know
	 * about them.
	 */
	vap->va_birthtime.tv_sec = -1;
	vap->va_birthtime.tv_nsec = 0;
	vap->va_fsid = VNOVAL;
	vap->va_gen = 0;
	vap->va_rdev = NODEV;

	error = VOP_GETATTR(vp, vap, a->a_active_cred);
	if (error)
		goto out;

	/*
	 * Zero the spare stat fields
	 */
	bzero(sb, sizeof *sb);

	/*
	 * Copy from vattr table
	 */
	/* Fall back to the mount's fsid if the filesystem set no va_fsid. */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		error = EBADF;
		goto out;
	}
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	if (vap->va_size > OFF_MAX) {
		error = EOVERFLOW;
		goto out;
	}
	sb->st_size = vap->va_size;
	sb->st_atim.tv_sec = vap->va_atime.tv_sec;
	sb->st_atim.tv_nsec = vap->va_atime.tv_nsec;
	sb->st_mtim.tv_sec = vap->va_mtime.tv_sec;
	sb->st_mtim.tv_nsec = vap->va_mtime.tv_nsec;
	sb->st_ctim.tv_sec = vap->va_ctime.tv_sec;
	sb->st_ctim.tv_nsec = vap->va_ctime.tv_nsec;
	sb->st_birthtim.tv_sec = vap->va_birthtime.tv_sec;
	sb->st_birthtim.tv_nsec = vap->va_birthtime.tv_nsec;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 *   "a filesystem-specific preferred I/O block size for this
	 *    object.  In some filesystem types, this may vary from file
	 *    to file"
	 * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
	 */

	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
	sb->st_flags = vap->va_flags;
	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
	sb->st_gen = vap->va_gen;
out:
	return (vop_stat_helper_post(a, error));
}

/* Default VOP_READ_PGCACHE: always returns EJUSTRETURN. */
static int
vop_stdread_pgcache(struct vop_read_pgcache_args *ap __unused)
{
	return (EJUSTRETURN);
}

/*
 * Default VOP_VPUT_PAIR: vput() the directory vnode, and also vput() *a_vpp
 * when a_vpp is non-NULL, a_unlock_vp is set and *a_vpp is non-NULL.
 */
static int
vop_stdvput_pair(struct vop_vput_pair_args *ap)
{
	struct vnode *dvp, *vp, **vpp;

	dvp = ap->a_dvp;
	vpp = ap->a_vpp;
	vput(dvp);
	if (vpp != NULL && ap->a_unlock_vp && (vp = *vpp) != NULL)
		vput(vp);
	return (0);
}
+/*
+ * Default VOP_GETLOWVNODE: take a new reference on the vnode and return the
+ * vnode itself through *a_vplp.
+ */
+static int
+vop_stdgetlowvnode(struct vop_getlowvnode_args *ap)
+{
+	vref(ap->a_vp);
+	*ap->a_vplp = ap->a_vp;
+	return (0);
+}
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index c4051fdfe327..a2b6a7c8ff9f 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -1,840 +1,847 @@
#-
# Copyright (c) 1992, 1993
# The Regents of the University of California. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Neither the name of the University nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # Above each of the vop descriptors in lines starting with %% # is a specification of the locking protocol used by each vop call. # The first column is the name of the variable, the remaining three # columns are in, out and error respectively. The "in" column defines # the lock state on input, the "out" column defines the state on successful # return, and the "error" column defines the locking state on error exit. # # The locking value can take the following values: # L: locked; not converted to type of lock. # E: locked with exclusive lock for this process. # U: unlocked. # -: not applicable. vnode does not yet (or no longer) exist. # =: the same on input and output, may be either L or U. # # The parameter named "vpp" is assumed to be always used with double # indirection (**vpp) and that name is hard-coded in vnode_if.awk ! # # Lines starting with %! specify a pre or post-condition function # to call before/after the vop call. # # If other such parameters are introduced, they have to be added to # the AWK script at the head of the definition of "add_debug_code()". # vop_islocked { IN struct vnode *vp; }; %% lookup dvp L L L %% lookup vpp - L - # XXX - the lookup locking protocol defies simple description and depends # on the flags and operation fields in the (cnp) structure. Note # especially that *vpp may equal dvp and both may be locked. 
vop_lookup { IN struct vnode *dvp; INOUT struct vnode **vpp; IN struct componentname *cnp; }; %% cachedlookup dvp L L L %% cachedlookup vpp - L - # This must be an exact copy of lookup. See kern/vfs_cache.c for details. vop_cachedlookup { IN struct vnode *dvp; INOUT struct vnode **vpp; IN struct componentname *cnp; }; %% create dvp E E E %% create vpp - L - %! create pre vop_create_pre %! create post vop_create_post vop_create { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% whiteout dvp E E E %! whiteout pre vop_whiteout_pre %! whiteout post vop_whiteout_post vop_whiteout { IN struct vnode *dvp; IN struct componentname *cnp; IN int flags; }; %% mknod dvp E E E %% mknod vpp - L - %! mknod pre vop_mknod_pre %! mknod post vop_mknod_post vop_mknod { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% open vp L L L %! open post vop_open_post vop_open { IN struct vnode *vp; IN int mode; IN struct ucred *cred; IN struct thread *td; IN struct file *fp; }; %% close vp L L L %! close post vop_close_post vop_close { IN struct vnode *vp; IN int fflag; IN struct ucred *cred; IN struct thread *td; }; %% fplookup_vexec vp - - - %! fplookup_vexec debugpre vop_fplookup_vexec_debugpre %! fplookup_vexec debugpost vop_fplookup_vexec_debugpost vop_fplookup_vexec { IN struct vnode *vp; IN struct ucred *cred; }; %% fplookup_symlink vp - - - %! fplookup_symlink debugpre vop_fplookup_symlink_debugpre %! 
fplookup_symlink debugpost vop_fplookup_symlink_debugpost vop_fplookup_symlink { IN struct vnode *vp; IN struct cache_fpl *fpl; }; %% access vp L L L vop_access { IN struct vnode *vp; IN accmode_t accmode; IN struct ucred *cred; IN struct thread *td; }; %% accessx vp L L L vop_accessx { IN struct vnode *vp; IN accmode_t accmode; IN struct ucred *cred; IN struct thread *td; }; %% stat vp L L L vop_stat { IN struct vnode *vp; OUT struct stat *sb; IN struct ucred *active_cred; IN struct ucred *file_cred; }; %% getattr vp L L L vop_getattr { IN struct vnode *vp; OUT struct vattr *vap; IN struct ucred *cred; }; %% setattr vp E E E %! setattr pre vop_setattr_pre %! setattr post vop_setattr_post vop_setattr { IN struct vnode *vp; IN struct vattr *vap; IN struct ucred *cred; }; %% mmapped vp L L L vop_mmapped { IN struct vnode *vp; }; %% read vp L L L %! read post vop_read_post vop_read { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; %% read_pgcache vp - - - %! read_pgcache post vop_read_pgcache_post vop_read_pgcache { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; %% write vp L L L %! write pre VOP_WRITE_PRE %! write post VOP_WRITE_POST vop_write { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; %% ioctl vp U U U vop_ioctl { IN struct vnode *vp; IN u_long command; IN void *data; IN int fflag; IN struct ucred *cred; IN struct thread *td; }; %% poll vp U U U vop_poll { IN struct vnode *vp; IN int events; IN struct ucred *cred; IN struct thread *td; }; %% kqfilter vp U U U vop_kqfilter { IN struct vnode *vp; IN struct knote *kn; }; %% revoke vp L L L vop_revoke { IN struct vnode *vp; IN int flags; }; %% fsync vp - - - %! fsync pre vop_fsync_debugpre %! fsync post vop_fsync_debugpost vop_fsync { IN struct vnode *vp; IN int waitfor; IN struct thread *td; }; %% remove dvp E E E %% remove vp E E E %! remove pre vop_remove_pre %! 
remove post vop_remove_post vop_remove { IN struct vnode *dvp; IN struct vnode *vp; IN struct componentname *cnp; }; %% link tdvp E E E %% link vp E E E %! link pre vop_link_pre %! link post vop_link_post vop_link { IN struct vnode *tdvp; IN struct vnode *vp; IN struct componentname *cnp; }; %! rename pre vop_rename_pre %! rename post vop_rename_post vop_rename { IN WILLRELE struct vnode *fdvp; IN WILLRELE struct vnode *fvp; IN struct componentname *fcnp; IN WILLRELE struct vnode *tdvp; IN WILLRELE struct vnode *tvp; IN struct componentname *tcnp; }; %% mkdir dvp E E E %% mkdir vpp - E - %! mkdir pre vop_mkdir_pre %! mkdir post vop_mkdir_post %! mkdir debugpost vop_mkdir_debugpost vop_mkdir { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% rmdir dvp E E E %% rmdir vp E E E %! rmdir pre vop_rmdir_pre %! rmdir post vop_rmdir_post vop_rmdir { IN struct vnode *dvp; IN struct vnode *vp; IN struct componentname *cnp; }; %% symlink dvp E E E %% symlink vpp - E - %! symlink pre vop_symlink_pre %! symlink post vop_symlink_post vop_symlink { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; IN const char *target; }; %% readdir vp L L L %! readdir post vop_readdir_post vop_readdir { IN struct vnode *vp; INOUT struct uio *uio; IN struct ucred *cred; INOUT int *eofflag; OUT int *ncookies; INOUT uint64_t **cookies; }; %% readlink vp L L L vop_readlink { IN struct vnode *vp; INOUT struct uio *uio; IN struct ucred *cred; }; %% inactive vp E E E vop_inactive { IN struct vnode *vp; }; %! need_inactive debugpre vop_need_inactive_debugpre %! need_inactive debugpost vop_need_inactive_debugpost vop_need_inactive { IN struct vnode *vp; }; %% reclaim vp E E E %! reclaim post vop_reclaim_post vop_reclaim { IN struct vnode *vp; }; %! lock1 debugpre vop_lock_debugpre %! lock1 debugpost vop_lock_debugpost vop_lock1 { IN struct vnode *vp; IN int flags; IN const char *file; IN int line; }; %! 
unlock debugpre vop_unlock_debugpre vop_unlock { IN struct vnode *vp; }; %% bmap vp L L L vop_bmap { IN struct vnode *vp; IN daddr_t bn; OUT struct bufobj **bop; IN daddr_t *bnp; OUT int *runp; OUT int *runb; }; %% strategy vp L L L %! strategy debugpre vop_strategy_debugpre vop_strategy { IN struct vnode *vp; IN struct buf *bp; }; %% getwritemount vp = = = vop_getwritemount { IN struct vnode *vp; OUT struct mount **mpp; }; +%% getlowvnode vp = = = + +vop_getlowvnode { + IN struct vnode *vp; + OUT struct vnode **vplp; + IN int flags; +}; %% print vp - - - vop_print { IN struct vnode *vp; }; %% pathconf vp L L L vop_pathconf { IN struct vnode *vp; IN int name; OUT long *retval; }; %% advlock vp U U U vop_advlock { IN struct vnode *vp; IN void *id; IN int op; IN struct flock *fl; IN int flags; }; %% advlockasync vp U U U vop_advlockasync { IN struct vnode *vp; IN void *id; IN int op; IN struct flock *fl; IN int flags; IN struct task *task; INOUT void **cookiep; }; %% advlockpurge vp E E E vop_advlockpurge { IN struct vnode *vp; }; %% reallocblks vp E E E vop_reallocblks { IN struct vnode *vp; IN struct cluster_save *buflist; }; %% getpages vp L L L vop_getpages { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int *rbehind; IN int *rahead; }; %% getpages_async vp L L L vop_getpages_async { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int *rbehind; IN int *rahead; IN vop_getpages_iodone_t *iodone; IN void *arg; }; %% putpages vp L L L vop_putpages { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int sync; IN int *rtvals; }; %% getacl vp L L L vop_getacl { IN struct vnode *vp; IN acl_type_t type; OUT struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% setacl vp E E E %! setacl pre vop_setacl_pre %!
setacl post vop_setacl_post vop_setacl { IN struct vnode *vp; IN acl_type_t type; IN struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% aclcheck vp = = = vop_aclcheck { IN struct vnode *vp; IN acl_type_t type; IN struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% closeextattr vp L L L vop_closeextattr { IN struct vnode *vp; IN int commit; IN struct ucred *cred; IN struct thread *td; }; %% getextattr vp L L L vop_getextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; INOUT struct uio *uio; OUT size_t *size; IN struct ucred *cred; IN struct thread *td; }; %% listextattr vp L L L vop_listextattr { IN struct vnode *vp; IN int attrnamespace; INOUT struct uio *uio; OUT size_t *size; IN struct ucred *cred; IN struct thread *td; }; %% openextattr vp L L L vop_openextattr { IN struct vnode *vp; IN struct ucred *cred; IN struct thread *td; }; %% deleteextattr vp E E E %! deleteextattr pre vop_deleteextattr_pre %! deleteextattr post vop_deleteextattr_post vop_deleteextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; IN struct ucred *cred; IN struct thread *td; }; %% setextattr vp E E E %! setextattr pre vop_setextattr_pre %! 
setextattr post vop_setextattr_post vop_setextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; INOUT struct uio *uio; IN struct ucred *cred; IN struct thread *td; }; %% setlabel vp E E E vop_setlabel { IN struct vnode *vp; IN struct label *label; IN struct ucred *cred; IN struct thread *td; }; %% vptofh vp = = = vop_vptofh { IN struct vnode *vp; IN struct fid *fhp; }; %% vptocnp vp L L L %% vptocnp vpp - U - vop_vptocnp { IN struct vnode *vp; OUT struct vnode **vpp; INOUT char *buf; INOUT size_t *buflen; }; %% allocate vp E E E vop_allocate { IN struct vnode *vp; INOUT off_t *offset; INOUT off_t *len; IN int ioflag; IN struct ucred *cred; }; %% advise vp U U U vop_advise { IN struct vnode *vp; IN off_t start; IN off_t end; IN int advice; }; %% unp_bind vp E E E vop_unp_bind { IN struct vnode *vp; IN struct unpcb *unpcb; }; %% unp_connect vp L L L vop_unp_connect { IN struct vnode *vp; OUT struct unpcb **unpcb; }; %% unp_detach vp = = = vop_unp_detach { IN struct vnode *vp; }; %% is_text vp L L L vop_is_text { IN struct vnode *vp; }; %% set_text vp = = = vop_set_text { IN struct vnode *vp; }; %% vop_unset_text vp L L L vop_unset_text { IN struct vnode *vp; }; %% add_writecount vp L L L vop_add_writecount { IN struct vnode *vp; IN int inc; }; %% fdatasync vp - - - %! fdatasync pre vop_fdatasync_debugpre %! 
fdatasync post vop_fdatasync_debugpost vop_fdatasync { IN struct vnode *vp; IN struct thread *td; }; %% copy_file_range invp U U U %% copy_file_range outvp U U U vop_copy_file_range { IN struct vnode *invp; INOUT off_t *inoffp; IN struct vnode *outvp; INOUT off_t *outoffp; INOUT size_t *lenp; IN unsigned int flags; IN struct ucred *incred; IN struct ucred *outcred; IN struct thread *fsizetd; }; %% vput_pair dvp E - - vop_vput_pair { IN struct vnode *dvp; INOUT struct vnode **vpp; IN bool unlock_vp; }; %% deallocate vp L L L vop_deallocate { IN struct vnode *vp; INOUT off_t *offset; INOUT off_t *len; IN int flags; IN int ioflag; IN struct ucred *cred; }; # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares. When merging a new VOP to a stable branch, # the new VOP should replace one of the spares. vop_spare1 { IN struct vnode *vp; }; vop_spare2 { IN struct vnode *vp; }; vop_spare3 { IN struct vnode *vp; }; vop_spare4 { IN struct vnode *vp; }; vop_spare5 { IN struct vnode *vp; };