Index: head/sys/conf/files =================================================================== --- head/sys/conf/files +++ head/sys/conf/files @@ -3479,6 +3479,7 @@ fs/fuse/fuse_node.c optional fusefs fs/fuse/fuse_vfsops.c optional fusefs fs/fuse/fuse_vnops.c optional fusefs +fs/mntfs/mntfs_vnops.c standard fs/msdosfs/msdosfs_conv.c optional msdosfs fs/msdosfs/msdosfs_denode.c optional msdosfs fs/msdosfs/msdosfs_fat.c optional msdosfs Index: head/sys/fs/mntfs/mntfs_vnops.c =================================================================== --- head/sys/fs/mntfs/mntfs_vnops.c +++ head/sys/fs/mntfs/mntfs_vnops.c @@ -0,0 +1,95 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Netflix, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR + * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+
+/*
+ * The "mntfs" VCHR vnodes implemented here provide a safe way for file systems
+ * to access their disk devices. Using the normal devfs vnode has the problem
+ * that if the device disappears, the devfs vnode is vgone'd as part of
+ * removing it from the application-visible namespace, and some file systems
+ * (notably FFS with softdep) get very unhappy if their dirty buffers are
+ * invalidated out from under them. By using a separate, private vnode,
+ * file systems are able to clean up their buffer state in a controlled fashion
+ * when the underlying device disappears.
+ */
+
+static int
+mntfs_reclaim(struct vop_reclaim_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+
+	dev_rel(vp->v_rdev);	/* drop the ref taken in mntfs_allocvp() */
+	return (0);
+}
+
+struct vop_vector mntfs_vnodeops = {
+	.vop_default =	&default_vnodeops,
+
+	.vop_fsync =	vop_stdfsync,
+	.vop_strategy =	VOP_PANIC,	/* I/O goes through bo_ops instead */
+	.vop_reclaim =	mntfs_reclaim,	/* releases the cdev reference */
+};
+VFS_VOP_VECTOR_REGISTER(mntfs_vnodeops);
+
+/*
+ * Allocate a private VCHR vnode for use by a mounted fs.
+ * The underlying device will be the same as for the given vnode.
+ * This mntfs vnode must be freed with mntfs_freevp() rather than just
+ * releasing the reference.
+ */
+struct vnode *
+mntfs_allocvp(struct mount *mp, struct vnode *ovp)
+{
+	struct vnode *vp;
+	struct cdev *dev;
+
+	ASSERT_VOP_ELOCKED(ovp, __func__);
+
+	dev = ovp->v_rdev;
+
+	getnewvnode("mntfs", mp, &mntfs_vnodeops, &vp);
+	vp->v_type = VCHR;
+	vp->v_data = NULL;
+	dev_ref(dev);		/* dropped again by mntfs_reclaim() */
+	vp->v_rdev = dev;
+
+	return (vp);
+}
+
+void
+mntfs_freevp(struct vnode *vp)
+{
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* vgone() needs it locked */
+	vgone(vp);	/* reclaim (see mntfs_reclaim()) */
+	vput(vp);	/* unlock and drop the caller's reference */
+}
Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c
+++ head/sys/kern/vfs_subr.c
@@ -2289,6 +2289,8 @@
 	int error;
 
 	ASSERT_BO_WLOCKED(bo);
+	KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
+	    ("buf_vlist_add: bo %p does not allow bufs", bo));
 	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
 	    ("dead bo %p", bo));
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
Index: head/sys/sys/bufobj.h
===================================================================
--- head/sys/sys/bufobj.h
+++ head/sys/sys/bufobj.h
@@ -117,6 +117,7 @@
 #define	BO_ONWORKLST	(1 << 0)	/* On syncer work-list */
 #define	BO_WWAIT	(1 << 1)	/* Wait for output to complete */
 #define	BO_DEAD		(1 << 2)	/* Dead; only with INVARIANTS */
+#define	BO_NOBUFS	(1 << 3)	/* No bufs allowed */
 
 #define	BO_LOCKPTR(bo)		(&(bo)->bo_lock)
 #define	BO_LOCK(bo)		rw_wlock(BO_LOCKPTR((bo)))
Index: head/sys/sys/mount.h
===================================================================
--- head/sys/sys/mount.h
+++ head/sys/sys/mount.h
@@ -940,6 +940,8 @@
 #define	vfsconf_unlock()	sx_xunlock(&vfsconf_sx)
 #define	vfsconf_slock()		sx_slock(&vfsconf_sx)
 #define	vfsconf_sunlock()	sx_sunlock(&vfsconf_sx)
+struct vnode *mntfs_allocvp(struct mount *, struct vnode *);
+void mntfs_freevp(struct vnode *);
 
 /*
  * Declarations for these vfs default operations are located in
Index: head/sys/ufs/ffs/ffs_alloc.c
===================================================================
--- head/sys/ufs/ffs/ffs_alloc.c
+++ head/sys/ufs/ffs/ffs_alloc.c
@@ -3594,6 +3594,7
@@ struct inode *ip; struct buf *bp; struct fs *fs; + struct ufsmount *ump; struct filedesc *fdp; int error; daddr_t lbn; @@ -3622,10 +3623,12 @@ return (EINVAL); } ip = VTOI(vp); - if (ITODEVVP(ip) != devvp) { + ump = ip->i_ump; + if (ump->um_odevvp != devvp) { vput(vp); return (EINVAL); } + devvp = ump->um_devvp; fs = ITOFS(ip); vput(vp); foffset_lock_uio(fp, uio, flags); Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c +++ head/sys/ufs/ffs/ffs_vfsops.c @@ -151,7 +151,7 @@ static int ffs_mount(struct mount *mp) { - struct vnode *devvp; + struct vnode *devvp, *odevvp; struct thread *td; struct ufsmount *ump = NULL; struct fs *fs; @@ -246,6 +246,7 @@ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; + odevvp = ump->um_odevvp; devvp = ump->um_devvp; if (fsckpid == -1 && ump->um_fsckpid > 0) { if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || @@ -337,16 +338,15 @@ * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ - vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); - error = VOP_ACCESS(devvp, VREAD | VWRITE, + vn_lock(odevvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_ACCESS(odevvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); + VOP_UNLOCK(odevvp); if (error) { - VOP_UNLOCK(devvp); return (error); } - VOP_UNLOCK(devvp); fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; @@ -782,8 +782,8 @@ * Common code for mount and mountroot */ static int -ffs_mountfs(devvp, mp, td) - struct vnode *devvp; +ffs_mountfs(odevvp, mp, td) + struct vnode *odevvp; struct mount *mp; struct thread *td; { @@ -794,6 +794,7 @@ struct ucred *cred; struct g_consumer *cp; struct mount *nmp; + struct vnode *devvp; int candelete, canspeedup; off_t loc; @@ -802,11 +803,13 @@ cred = td ? 
td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + devvp = mntfs_allocvp(mp, odevvp); + VOP_UNLOCK(odevvp); KASSERT(devvp->v_type == VCHR, ("reclaimed devvp")); dev = devvp->v_rdev; if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { - VOP_UNLOCK(devvp); + mntfs_freevp(devvp); return (EBUSY); } g_topology_lock(); @@ -814,12 +817,14 @@ g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); - VOP_UNLOCK(devvp); + mntfs_freevp(devvp); return (error); } dev_ref(dev); devvp->v_bufobj.bo_ops = &ffs_ops; - VOP_UNLOCK(devvp); + BO_LOCK(&odevvp->v_bufobj); + odevvp->v_bufobj.bo_flag |= BO_NOBUFS; + BO_UNLOCK(&odevvp->v_bufobj); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) @@ -1020,6 +1025,7 @@ ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; + ump->um_odevvp = odevvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; @@ -1099,7 +1105,11 @@ free(ump, M_UFSMNT); mp->mnt_data = NULL; } + BO_LOCK(&odevvp->v_bufobj); + odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; + BO_UNLOCK(&odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); + mntfs_freevp(devvp); dev_rel(dev); return (error); } @@ -1304,8 +1314,12 @@ } g_vfs_close(ump->um_cp); g_topology_unlock(); + BO_LOCK(&ump->um_odevvp->v_bufobj); + ump->um_odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; + BO_UNLOCK(&ump->um_odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0); - vrele(ump->um_devvp); + mntfs_freevp(ump->um_devvp); + vrele(ump->um_odevvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { @@ -2293,7 +2307,19 @@ struct buf *tbp; int error, nocopy; + /* + * This is the bufobj strategy for the private VCHR vnodes + * used by FFS to access the underlying storage device. 
+ * We override the default bufobj strategy and thus bypass + * VOP_STRATEGY() for these vnodes. + */ vp = bo2vnode(bo); + KASSERT(bp->b_vp == NULL || bp->b_vp->v_type != VCHR || + bp->b_vp->v_rdev == NULL || + bp->b_vp->v_rdev->si_mountpt == NULL || + VFSTOUFS(bp->b_vp->v_rdev->si_mountpt) == NULL || + vp == VFSTOUFS(bp->b_vp->v_rdev->si_mountpt)->um_devvp, + ("ffs_geom_strategy() with wrong vp")); if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && Index: head/sys/ufs/ufs/ufsmount.h =================================================================== --- head/sys/ufs/ufs/ufsmount.h +++ head/sys/ufs/ufs/ufsmount.h @@ -83,7 +83,8 @@ struct cdev *um_dev; /* (r) device mounted */ struct g_consumer *um_cp; /* (r) GEOM access point */ struct bufobj *um_bo; /* (r) Buffer cache object */ - struct vnode *um_devvp; /* (r) blk dev mounted vnode */ + struct vnode *um_odevvp; /* (r) devfs dev vnode */ + struct vnode *um_devvp; /* (r) mntfs private vnode */ u_long um_fstype; /* (c) type of filesystem */ struct fs *um_fs; /* (r) pointer to superblock */ struct ufs_extattr_per_mount um_extattr; /* (c) extended attrs */