diff --git a/sys/fs/cd9660/cd9660_vfsops.c b/sys/fs/cd9660/cd9660_vfsops.c index 21d3c3e13a8f..5d475bec93b8 100644 --- a/sys/fs/cd9660/cd9660_vfsops.c +++ b/sys/fs/cd9660/cd9660_vfsops.c @@ -1,852 +1,851 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cd9660_vfsops.c 8.18 (Berkeley) 5/22/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ISOFSMNT, "isofs_mount", "ISOFS mount structure"); MALLOC_DEFINE(M_ISOFSNODE, "isofs_node", "ISOFS vnode private part"); struct iconv_functions *cd9660_iconv = NULL; static vfs_mount_t cd9660_mount; static vfs_cmount_t cd9660_cmount; static vfs_unmount_t cd9660_unmount; static vfs_root_t cd9660_root; static vfs_statfs_t cd9660_statfs; static vfs_vget_t cd9660_vget; static vfs_fhtovp_t cd9660_fhtovp; static struct vfsops cd9660_vfsops = { .vfs_fhtovp = cd9660_fhtovp, .vfs_mount = cd9660_mount, .vfs_cmount = cd9660_cmount, .vfs_root = cd9660_root, .vfs_statfs = cd9660_statfs, .vfs_unmount = cd9660_unmount, .vfs_vget = cd9660_vget, }; VFS_SET(cd9660_vfsops, cd9660, VFCF_READONLY); MODULE_VERSION(cd9660, 1); static int cd9660_vfs_hash_cmp(struct vnode *vp, void *pino); static int iso_mountfs(struct vnode *devvp, struct mount *mp); /* * VFS Operations. */ static int cd9660_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct iso_args args; int error; error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof(args.export)); ma = mount_argsu(ma, "cs_disk", args.cs_disk, 64); ma = mount_argsu(ma, "cs_local", args.cs_local, 64); ma = mount_argf(ma, "ssector", "%u", args.ssector); ma = mount_argb(ma, !(args.flags & ISOFSMNT_NORRIP), "norrip"); ma = mount_argb(ma, args.flags & ISOFSMNT_GENS, "nogens"); ma = mount_argb(ma, args.flags & ISOFSMNT_EXTATT, "noextatt"); ma = mount_argb(ma, !(args.flags & ISOFSMNT_NOJOLIET), "nojoliet"); ma = mount_argb(ma, args.flags & ISOFSMNT_BROKENJOLIET, "nobrokenjoliet"); ma = mount_argb(ma, args.flags & ISOFSMNT_KICONV, "nokiconv"); error = kernel_mount(ma, flags); return (error); } static int cd9660_mount(struct mount *mp) { struct vnode *devvp; struct thread *td; char *fspec; int error; accmode_t accmode; struct nameidata ndp; struct iso_mnt *imp = NULL; td = curthread; /* * Unconditionally mount as read-only. */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); imp = VFSTOISOFS(mp); if (mp->mnt_flag & MNT_UPDATE) { if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) return (0); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(&ndp))) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk_error(devvp, &error)) { vput(devvp); return (error); } /* * Verify that user has necessary permissions on the device, * or has superuser abilities */ accmode = VREAD; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = iso_mountfs(devvp, mp); if (error) vrele(devvp); } else { if (devvp != imp->im_devvp) error = EINVAL; /* needs translation */ vput(devvp); } if (error) return (error); vfs_mountedfrom(mp, fspec); return (0); } /* * Common code for mount and mountroot */ static int iso_mountfs(devvp, mp) struct vnode *devvp; struct mount *mp; { struct iso_mnt *isomp = NULL; struct buf *bp = NULL; struct buf *pribp = NULL, *supbp = NULL; struct cdev *dev; int error = EINVAL; int high_sierra = 0; int iso_bsize; int iso_blknum; int joliet_level; int isverified = 0; struct iso_volume_descriptor *vdp = NULL; struct iso_primary_descriptor *pri = NULL; struct iso_sierra_primary_descriptor *pri_sierra = NULL; struct iso_supplementary_descriptor *sup = NULL; struct iso_directory_record *rootp; int logical_block_size, ssector; struct g_consumer *cp; struct bufobj *bo; char *cs_local, *cs_disk; dev = devvp->v_rdev; dev_ref(dev); g_topology_lock(); error = g_vfs_open(devvp, &cp, "cd9660", 0); if (error == 0) g_getattr("MNT::verified", cp, &isverified); g_topology_unlock(); VOP_UNLOCK(devvp); if (error) goto out; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > maxphys) mp->mnt_iosize_max = maxphys; bo = &devvp->v_bufobj; /* This is the "logical sector size". The standard says this * should be 2048 or the physical sector size on the device, * whichever is greater. */ if ((ISO_DEFAULT_BLOCK_SIZE % cp->provider->sectorsize) != 0) { error = EINVAL; goto out; } iso_bsize = cp->provider->sectorsize; joliet_level = 0; if (1 != vfs_scanopt(mp->mnt_optnew, "ssector", "%d", &ssector)) ssector = 0; for (iso_blknum = 16 + ssector; iso_blknum < 100 + ssector; iso_blknum++) { if ((error = bread(devvp, iso_blknum * btodb(ISO_DEFAULT_BLOCK_SIZE), iso_bsize, NOCRED, &bp)) != 0) goto out; vdp = (struct iso_volume_descriptor *)bp->b_data; if (bcmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) != 0) { if (bcmp (vdp->id_sierra, ISO_SIERRA_ID, sizeof vdp->id_sierra) != 0) { error = EINVAL; goto out; } else high_sierra = 1; } switch (isonum_711 (high_sierra? vdp->type_sierra: vdp->type)){ case ISO_VD_PRIMARY: if (pribp == NULL) { pribp = bp; bp = NULL; pri = (struct iso_primary_descriptor *)vdp; pri_sierra = (struct iso_sierra_primary_descriptor *)vdp; } break; case ISO_VD_SUPPLEMENTARY: if (supbp == NULL) { supbp = bp; bp = NULL; sup = (struct iso_supplementary_descriptor *)vdp; if (!vfs_flagopt(mp->mnt_optnew, "nojoliet", NULL, 0)) { if (bcmp(sup->escape, "%/@", 3) == 0) joliet_level = 1; if (bcmp(sup->escape, "%/C", 3) == 0) joliet_level = 2; if (bcmp(sup->escape, "%/E", 3) == 0) joliet_level = 3; if ((isonum_711 (sup->flags) & 1) && !vfs_flagopt(mp->mnt_optnew, "brokenjoliet", NULL, 0)) joliet_level = 0; } } break; case ISO_VD_END: goto vd_end; default: break; } if (bp != NULL) { brelse(bp); bp = NULL; } } vd_end: if (bp != NULL) { brelse(bp); bp = NULL; } if (pri == NULL) { error = EINVAL; goto out; } logical_block_size = isonum_723 (high_sierra? pri_sierra->logical_block_size: pri->logical_block_size); if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE || (logical_block_size & (logical_block_size - 1)) != 0) { error = EINVAL; goto out; } rootp = (struct iso_directory_record *) (high_sierra? pri_sierra->root_directory_record: pri->root_directory_record); isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK | M_ZERO); isomp->im_cp = cp; isomp->im_bo = bo; isomp->logical_block_size = logical_block_size; isomp->volume_space_size = isonum_733 (high_sierra? pri_sierra->volume_space_size: pri->volume_space_size); isomp->joliet_level = 0; /* * Since an ISO9660 multi-session CD can also access previous * sessions, we have to include them into the space consider- * ations. This doesn't yield a very accurate number since * parts of the old sessions might be inaccessible now, but we * can't do much better. This is also important for the NFS * filehandle validation. */ isomp->volume_space_size += ssector; memcpy(isomp->root, rootp, sizeof isomp->root); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); isomp->im_bmask = logical_block_size - 1; isomp->im_bshift = ffs(logical_block_size) - 1; pribp->b_flags |= B_AGE; brelse(pribp); pribp = NULL; rootp = NULL; pri = NULL; pri_sierra = NULL; mp->mnt_data = isomp; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; - mp->mnt_maxsymlinklen = 0; MNT_ILOCK(mp); if (isverified) mp->mnt_flag |= MNT_VERIFIED; mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED; MNT_IUNLOCK(mp); isomp->im_mountp = mp; isomp->im_dev = dev; isomp->im_devvp = devvp; vfs_flagopt(mp->mnt_optnew, "norrip", &isomp->im_flags, ISOFSMNT_NORRIP); vfs_flagopt(mp->mnt_optnew, "gens", &isomp->im_flags, ISOFSMNT_GENS); vfs_flagopt(mp->mnt_optnew, "extatt", &isomp->im_flags, ISOFSMNT_EXTATT); vfs_flagopt(mp->mnt_optnew, "nojoliet", &isomp->im_flags, ISOFSMNT_NOJOLIET); vfs_flagopt(mp->mnt_optnew, "kiconv", &isomp->im_flags, ISOFSMNT_KICONV); /* Check the Rock Ridge Extension support */ if (!(isomp->im_flags & ISOFSMNT_NORRIP)) { if ((error = bread(isomp->im_devvp, (isomp->root_extent + isonum_711(((struct iso_directory_record *)isomp->root)-> ext_attr_length)) << (isomp->im_bshift - DEV_BSHIFT), isomp->logical_block_size, NOCRED, &bp)) != 0) goto out; rootp = (struct iso_directory_record *)bp->b_data; if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) { isomp->im_flags |= ISOFSMNT_NORRIP; } else { isomp->im_flags &= ~ISOFSMNT_GENS; } /* * The contents are valid, * but they will get reread as part of another vnode, so... */ bp->b_flags |= B_AGE; brelse(bp); bp = NULL; rootp = NULL; } if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) { cs_local = vfs_getopts(mp->mnt_optnew, "cs_local", &error); if (error) goto out; cs_disk = vfs_getopts(mp->mnt_optnew, "cs_disk", &error); if (error) goto out; cd9660_iconv->open(cs_local, cs_disk, &isomp->im_d2l); cd9660_iconv->open(cs_disk, cs_local, &isomp->im_l2d); } else { isomp->im_d2l = NULL; isomp->im_l2d = NULL; } if (high_sierra) { /* this effectively ignores all the mount flags */ if (bootverbose) log(LOG_INFO, "cd9660: High Sierra Format\n"); isomp->iso_ftype = ISO_FTYPE_HIGH_SIERRA; } else switch (isomp->im_flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS)) { default: isomp->iso_ftype = ISO_FTYPE_DEFAULT; break; case ISOFSMNT_GENS|ISOFSMNT_NORRIP: isomp->iso_ftype = ISO_FTYPE_9660; break; case 0: if (bootverbose) log(LOG_INFO, "cd9660: RockRidge Extension\n"); isomp->iso_ftype = ISO_FTYPE_RRIP; break; } /* Decide whether to use the Joliet descriptor */ if (isomp->iso_ftype != ISO_FTYPE_RRIP && joliet_level) { if (bootverbose) log(LOG_INFO, "cd9660: Joliet Extension (Level %d)\n", joliet_level); rootp = (struct iso_directory_record *) sup->root_directory_record; memcpy(isomp->root, rootp, sizeof isomp->root); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); isomp->joliet_level = joliet_level; supbp->b_flags |= B_AGE; } if (supbp) { brelse(supbp); supbp = NULL; sup = NULL; } return 0; out: if (bp != NULL) brelse(bp); if (pribp != NULL) brelse(pribp); if (supbp != NULL) brelse(supbp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (isomp) { free(isomp, M_ISOFSMNT); mp->mnt_data = NULL; } dev_rel(dev); return error; } /* * unmount system call */ static int cd9660_unmount(mp, mntflags) struct mount *mp; int mntflags; { struct iso_mnt *isomp; int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if ((error = vflush(mp, 0, flags, curthread))) return (error); isomp = VFSTOISOFS(mp); if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) { if (isomp->im_d2l) cd9660_iconv->close(isomp->im_d2l); if (isomp->im_l2d) cd9660_iconv->close(isomp->im_l2d); } g_topology_lock(); g_vfs_close(isomp->im_cp); g_topology_unlock(); vrele(isomp->im_devvp); dev_rel(isomp->im_dev); free(isomp, M_ISOFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } /* * Return root of a filesystem */ static int cd9660_root(mp, flags, vpp) struct mount *mp; int flags; struct vnode **vpp; { struct iso_mnt *imp = VFSTOISOFS(mp); struct iso_directory_record *dp = (struct iso_directory_record *)imp->root; cd_ino_t ino = isodirino(dp, imp); /* * With RRIP we must use the `.' entry of the root directory. * Simply tell vget, that it's a relocated directory. */ return (cd9660_vget_internal(mp, ino, flags, vpp, imp->iso_ftype == ISO_FTYPE_RRIP, dp)); } /* * Get filesystem statistics. */ static int cd9660_statfs(mp, sbp) struct mount *mp; struct statfs *sbp; { struct iso_mnt *isomp; isomp = VFSTOISOFS(mp); sbp->f_bsize = isomp->logical_block_size; sbp->f_iosize = sbp->f_bsize; /* XXX */ sbp->f_blocks = isomp->volume_space_size; sbp->f_bfree = 0; /* total free blocks */ sbp->f_bavail = 0; /* blocks free for non superuser */ sbp->f_files = 0; /* total files */ sbp->f_ffree = 0; /* free file nodes */ return 0; } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is in range * - call iget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the generation number matches */ /* ARGSUSED */ static int cd9660_fhtovp(mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { struct ifid ifh; struct iso_node *ip; struct vnode *nvp; int error; memcpy(&ifh, fhp, sizeof(ifh)); #ifdef ISOFS_DBG printf("fhtovp: ino %d, start %ld\n", ifh.ifid_ino, ifh.ifid_start); #endif if ((error = VFS_VGET(mp, ifh.ifid_ino, LK_EXCLUSIVE, &nvp)) != 0) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->inode.iso_mode == 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, ip->i_size, curthread); return (0); } /* * Conform to standard VFS interface; can't vget arbitrary inodes beyond 4GB * into media with current inode scheme and 32-bit ino_t. This shouldn't be * needed for anything other than nfsd, and who exports a mounted DVD over NFS? */ static int cd9660_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { /* * XXXX * It would be nice if we didn't always set the `relocated' flag * and force the extra read, but I don't want to think about fixing * that right now. */ return (cd9660_vget_internal(mp, ino, flags, vpp, #if 0 VFSTOISOFS(mp)->iso_ftype == ISO_FTYPE_RRIP, #else 0, #endif (struct iso_directory_record *)0)); } /* Use special comparator for full 64-bit ino comparison. */ static int cd9660_vfs_hash_cmp(vp, pino) struct vnode *vp; void *pino; { struct iso_node *ip; cd_ino_t ino; ip = VTOI(vp); ino = *(cd_ino_t *)pino; return (ip->i_number != ino); } int cd9660_vget_internal(mp, ino, flags, vpp, relocated, isodir) struct mount *mp; cd_ino_t ino; int flags; struct vnode **vpp; int relocated; struct iso_directory_record *isodir; { struct iso_mnt *imp; struct iso_node *ip; struct buf *bp; struct vnode *vp; int error; struct thread *td; td = curthread; error = vfs_hash_get(mp, ino, flags, td, vpp, cd9660_vfs_hash_cmp, &ino); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ imp = VFSTOISOFS(mp); /* Allocate a new vnode/iso_node. */ if ((error = getnewvnode("isofs", mp, &cd9660_vnodeops, &vp)) != 0) { *vpp = NULLVP; return (error); } ip = malloc(sizeof(struct iso_node), M_ISOFSNODE, M_WAITOK | M_ZERO); vp->v_data = ip; ip->i_vnode = vp; ip->i_number = ino; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); error = insmntque(vp, mp); if (error != 0) { free(ip, M_ISOFSNODE); *vpp = NULLVP; return (error); } error = vfs_hash_insert(vp, ino, flags, td, vpp, cd9660_vfs_hash_cmp, &ino); if (error || *vpp != NULL) return (error); if (isodir == NULL) { int lbn, off; lbn = lblkno(imp, ino); if (lbn >= imp->volume_space_size) { vput(vp); printf("fhtovp: lbn exceed volume space %d\n", lbn); return (ESTALE); } off = blkoff(imp, ino); if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) { vput(vp); printf("fhtovp: crosses block boundary %d\n", off + ISO_DIRECTORY_RECORD_SIZE); return (ESTALE); } error = bread(imp->im_devvp, lbn << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { vput(vp); printf("fhtovp: bread error %d\n",error); return (error); } isodir = (struct iso_directory_record *)(bp->b_data + off); if (off + isonum_711(isodir->length) > imp->logical_block_size) { vput(vp); brelse(bp); printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n", off +isonum_711(isodir->length), off, isonum_711(isodir->length)); return (ESTALE); } #if 0 if (isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length) != ifhp->ifid_start) { brelse(bp); printf("fhtovp: file start miss %d vs %d\n", isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length), ifhp->ifid_start); return (ESTALE); } #endif } else bp = NULL; ip->i_mnt = imp; if (relocated) { /* * On relocated directories we must * read the `.' entry out of a dir. */ ip->iso_start = ino >> imp->im_bshift; if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vp, (off_t)0, NULL, &bp)) != 0) { vput(vp); return (error); } isodir = (struct iso_directory_record *)bp->b_data; } ip->iso_extent = isonum_733(isodir->extent); ip->i_size = isonum_733(isodir->size); ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent; /* * Setup time stamp, attribute */ vp->v_type = VNON; switch (imp->iso_ftype) { default: /* ISO_FTYPE_9660 */ { struct buf *bp2; int off; if ((imp->im_flags & ISOFSMNT_EXTATT) && (off = isonum_711(isodir->ext_attr_length))) cd9660_blkatoff(vp, (off_t)-(off << imp->im_bshift), NULL, &bp2); else bp2 = NULL; cd9660_defattr(isodir, ip, bp2, ISO_FTYPE_9660); cd9660_deftstamp(isodir, ip, bp2, ISO_FTYPE_9660); if (bp2) brelse(bp2); break; } case ISO_FTYPE_RRIP: cd9660_rrip_analyze(isodir, ip, imp); break; } brelse(bp); /* * Initialize the associated vnode */ switch (vp->v_type = IFTOVT(ip->inode.iso_mode)) { case VFIFO: vp->v_op = &cd9660_fifoops; break; default: VN_LOCK_ASHARE(vp); break; } if (ip->iso_extent == imp->root_extent) vp->v_vflag |= VV_ROOT; /* * XXX need generation number? */ *vpp = vp; return (0); } diff --git a/sys/fs/ext2fs/ext2_inode.c b/sys/fs/ext2fs/ext2_inode.c index 1e7584ebaa66..090caf783a3b 100644 --- a/sys/fs/ext2fs/ext2_inode.c +++ b/sys/fs/ext2fs/ext2_inode.c @@ -1,644 +1,644 @@ /*- * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_inode.c 8.5 (Berkeley) 12/30/93 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Update the access, modified, and inode change times as specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode * to disk if the IN_MODIFIED flag is set (it may be set initially, or by * the timestamp update). The IN_LAZYMOD flag is set to force a write * later if not now. If we write now, then clear both IN_MODIFIED and * IN_LAZYMOD to reflect the presumably successful write, and if waitfor is * set, then wait for the write to complete. */ int ext2_update(struct vnode *vp, int waitfor) { struct m_ext2fs *fs; struct buf *bp; struct inode *ip; int error; ASSERT_VOP_ELOCKED(vp, "ext2_update"); ext2_itimes(vp); ip = VTOI(vp); if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0) return (0); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); fs = ip->i_e2fs; if (fs->e2fs_ronly) return (0); if ((error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) { brelse(bp); return (error); } error = ext2_i2ei(ip, (struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number))); if (error) { brelse(bp); return (error); } if (waitfor && !DOINGASYNC(vp)) return (bwrite(bp)); else { bdwrite(bp); return (0); } } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * NB: triple indirect blocks are untested. */ static int ext2_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn, int level, e4fs_daddr_t *countp) { struct buf *bp; struct m_ext2fs *fs = ip->i_e2fs; struct vnode *vp; e2fs_daddr_t *bap, *copy; int i, nblocks, error = 0, allerror = 0; e2fs_lbn_t nb, nlbn, last; e4fs_daddr_t blkcount, factor, blocksreleased = 0; /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = 1; for (i = SINGLE; i < level; i++) factor *= NINDIR(fs); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->e2fs_bsize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have * the on disk address, so we have to set the b_blkno field * explicitly instead of letting bread do everything for us. */ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->e2fs_bsize, 0, 0, 0); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { bp->b_iocmd = BIO_READ; if (bp->b_bcount > bp->b_bufsize) panic("ext2_indirtrunc: bad buffer size"); bp->b_blkno = dbn; vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); error = bufwait(bp); } if (error) { brelse(bp); *countp = 0; return (error); } bap = (e2fs_daddr_t *)bp->b_data; copy = malloc(fs->e2fs_bsize, M_TEMP, M_WAITOK); bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->e2fs_bsize); bzero((caddr_t)&bap[last + 1], (NINDIR(fs) - (last + 1)) * sizeof(e2fs_daddr_t)); if (last == -1) bp->b_flags |= B_INVAL; if (DOINGASYNC(vp)) { bdwrite(bp); } else { error = bwrite(bp); if (error) allerror = error; } bap = copy; /* * Recursively free totally unused blocks. */ for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = le32toh(bap[i]); if (nb == 0) continue; if (level > SINGLE) { if ((error = ext2_indirtrunc(ip, nlbn, fsbtodb(fs, nb), (int32_t)-1, level - 1, &blkcount)) != 0) allerror = error; blocksreleased += blkcount; } ext2_blkfree(ip, nb, fs->e2fs_bsize); blocksreleased += nblocks; } /* * Recursively free last partial block. */ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = le32toh(bap[i]); if (nb != 0) { if ((error = ext2_indirtrunc(ip, nlbn, fsbtodb(fs, nb), last, level - 1, &blkcount)) != 0) allerror = error; blocksreleased += blkcount; } } free(copy, M_TEMP); *countp = blocksreleased; return (allerror); } /* * Truncate the inode oip to at most length size, freeing the * disk blocks. */ static int ext2_ind_truncate(struct vnode *vp, off_t length, int flags, struct ucred *cred, struct thread *td) { struct vnode *ovp = vp; e4fs_daddr_t lastblock; struct inode *oip; e4fs_daddr_t bn, lbn, lastiblock[EXT2_NIADDR], indir_lbn[EXT2_NIADDR]; uint32_t oldblks[EXT2_NDADDR + EXT2_NIADDR]; uint32_t newblks[EXT2_NDADDR + EXT2_NIADDR]; struct m_ext2fs *fs; struct buf *bp; int offset, size, level; e4fs_daddr_t count, nblocks, blocksreleased = 0; int error, i, allerror; off_t osize; #ifdef INVARIANTS struct bufobj *bo; #endif oip = VTOI(ovp); #ifdef INVARIANTS bo = &ovp->v_bufobj; #endif fs = oip->i_e2fs; osize = oip->i_size; /* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1. */ if (osize < length) { if (length > oip->i_e2fs->e2fs_maxfilesize) return (EFBIG); vnode_pager_setsize(ovp, length); offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, flags); if (error) { vnode_pager_setsize(vp, osize); return (error); } oip->i_size = length; if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(ovp, !DOINGASYNC(ovp))); } /* * Shorten the size of the file. If the file is not being * truncated to a block boundary, the contents of the * partial block following the end of the file must be * zero'ed in case it ever become accessible again because * of subsequent file growth. */ /* I don't understand the comment above */ offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset, cred, &bp, flags); if (error) return (error); oip->i_size = length; size = blksize(fs, oip, lbn); bzero((char *)bp->b_data + offset, (u_int)(size - offset)); allocbuf(bp, size); if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->e2fs_bsize - 1) - 1; lastiblock[SINGLE] = lastblock - EXT2_NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->e2fs_bsize); /* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ext2_indirtrunc below. */ for (level = TRIPLE; level >= SINGLE; level--) { oldblks[EXT2_NDADDR + level] = oip->i_ib[level]; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; lastiblock[level] = -1; } } for (i = 0; i < EXT2_NDADDR; i++) { oldblks[i] = oip->i_db[i]; if (i > lastblock) oip->i_db[i] = 0; } oip->i_flag |= IN_CHANGE | IN_UPDATE; allerror = ext2_update(ovp, !DOINGASYNC(ovp)); /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ for (i = 0; i < EXT2_NDADDR; i++) { newblks[i] = oip->i_db[i]; oip->i_db[i] = oldblks[i]; } for (i = 0; i < EXT2_NIADDR; i++) { newblks[EXT2_NDADDR + i] = oip->i_ib[i]; oip->i_ib[i] = oldblks[EXT2_NDADDR + i]; } oip->i_size = osize; error = vtruncbuf(ovp, length, (int)fs->e2fs_bsize); if (error && (allerror == 0)) allerror = error; vnode_pager_setsize(ovp, length); /* * Indirect blocks first. */ indir_lbn[SINGLE] = -EXT2_NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = oip->i_ib[level]; if (bn != 0) { error = ext2_indirtrunc(oip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; ext2_blkfree(oip, bn, fs->e2fs_fsize); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ for (i = EXT2_NDADDR - 1; i > lastblock; i--) { long bsize; bn = oip->i_db[i]; if (bn == 0) continue; oip->i_db[i] = 0; bsize = blksize(fs, oip, i); ext2_blkfree(oip, bn, bsize); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = oip->i_db[lastblock]; if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, oip, lastblock); oip->i_size = length; newspace = blksize(fs, oip, lastblock); if (newspace == 0) panic("ext2_truncate: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. */ bn += numfrags(fs, newspace); ext2_blkfree(oip, bn, oldspace - newspace); blocksreleased += btodb(oldspace - newspace); } } done: #ifdef INVARIANTS for (level = SINGLE; level <= TRIPLE; level++) if (newblks[EXT2_NDADDR + level] != oip->i_ib[level]) panic("itrunc1"); for (i = 0; i < EXT2_NDADDR; i++) if (newblks[i] != oip->i_db[i]) panic("itrunc2"); BO_LOCK(bo); if (length == 0 && (bo->bo_dirty.bv_cnt != 0 || bo->bo_clean.bv_cnt != 0)) panic("itrunc3"); BO_UNLOCK(bo); #endif /* INVARIANTS */ /* * Put back the real size. */ oip->i_size = length; if (oip->i_blocks >= blocksreleased) oip->i_blocks -= blocksreleased; else /* sanity */ oip->i_blocks = 0; oip->i_flag |= IN_CHANGE; vnode_pager_setsize(ovp, length); return (allerror); } static int ext2_ext_truncate(struct vnode *vp, off_t length, int flags, struct ucred *cred, struct thread *td) { struct vnode *ovp = vp; int32_t lastblock; struct m_ext2fs *fs; struct inode *oip; struct buf *bp; uint32_t lbn, offset; int error, size; off_t osize; oip = VTOI(ovp); fs = oip->i_e2fs; osize = oip->i_size; if (osize < length) { if (length > oip->i_e2fs->e2fs_maxfilesize) { return (EFBIG); } vnode_pager_setsize(ovp, length); offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, flags); if (error) { vnode_pager_setsize(vp, osize); return (error); } oip->i_size = length; if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(ovp, !DOINGASYNC(ovp))); } lastblock = (length + fs->e2fs_bsize - 1) / fs->e2fs_bsize; error = ext4_ext_remove_space(oip, lastblock, flags, cred, td); if (error) return (error); offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset, cred, &bp, flags); if (error) { return (error); } oip->i_size = length; size = blksize(fs, oip, lbn); bzero((char *)bp->b_data + offset, (u_int)(size - offset)); allocbuf(bp, size); if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); } oip->i_size = osize; error = vtruncbuf(ovp, length, (int)fs->e2fs_bsize); if (error) return (error); vnode_pager_setsize(ovp, length); oip->i_size = length; oip->i_flag |= IN_CHANGE | IN_UPDATE; error = ext2_update(ovp, !DOINGASYNC(ovp)); return (error); } /* * Truncate the inode ip to at most length size, freeing the * disk blocks. */ int ext2_truncate(struct vnode *vp, off_t length, int flags, struct ucred *cred, struct thread *td) { struct inode *ip; int error; ASSERT_VOP_LOCKED(vp, "ext2_truncate"); if (length < 0) return (EINVAL); ip = VTOI(vp); if (vp->v_type == VLNK && - ip->i_size < vp->v_mount->mnt_maxsymlinklen) { + ip->i_size < VFSTOEXT2(vp->v_mount)->um_e2fs->e2fs_maxsymlinklen) { #ifdef INVARIANTS if (length != 0) panic("ext2_truncate: partial truncate of symlink"); #endif bzero((char *)&ip->i_shortlink, (u_int)ip->i_size); ip->i_size = 0; ip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(vp, 1)); } if (ip->i_size == length) { ip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(vp, 0)); } if (ip->i_flag & IN_E4EXTENTS) error = ext2_ext_truncate(vp, length, flags, cred, td); else error = ext2_ind_truncate(vp, length, flags, cred, td); return (error); } /* * discard preallocated blocks */ int ext2_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct thread *td = curthread; int mode, error = 0; /* * Ignore inodes related to stale file handles. */ if (ip->i_mode == 0) goto out; if (ip->i_nlink <= 0) { ext2_extattr_free(ip); error = ext2_truncate(vp, (off_t)0, 0, NOCRED, td); if (!(ip->i_flag & IN_E4EXTENTS)) ip->i_rdev = 0; mode = ip->i_mode; ip->i_mode = 0; ip->i_flag |= IN_CHANGE | IN_UPDATE; ext2_vfree(vp, ip->i_number, mode); } if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) ext2_update(vp, 0); out: /* * If we are done with the inode, reclaim it * so that it can be reused immediately. */ if (ip->i_mode == 0) vrecycle(vp); return (error); } /* * Reclaim an inode so that it can be used for other purposes. */ int ext2_reclaim(struct vop_reclaim_args *ap) { struct inode *ip; struct vnode *vp = ap->a_vp; ip = VTOI(vp); if (ip->i_flag & IN_LAZYMOD) { ip->i_flag |= IN_MODIFIED; ext2_update(vp, 0); } vfs_hash_remove(vp); free(vp->v_data, M_EXT2NODE); vp->v_data = 0; return (0); } diff --git a/sys/fs/ext2fs/ext2_vfsops.c b/sys/fs/ext2fs/ext2_vfsops.c index a64e28186769..3da8ca6331bc 100644 --- a/sys/fs/ext2fs/ext2_vfsops.c +++ b/sys/fs/ext2fs/ext2_vfsops.c @@ -1,1443 +1,1443 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DECLARE(ext2fs); /* * ext2fs trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(ext2fs, , vfsops, trace, "int", "char*"); SDT_PROBE_DEFINE2(ext2fs, , vfsops, ext2_cg_validate_error, "char*", "int"); SDT_PROBE_DEFINE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "char*"); static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td); static int ext2_mountfs(struct vnode *, struct mount *); static int ext2_reload(struct mount *mp, struct thread *td); static int ext2_sbupdate(struct ext2mount *, int); static int ext2_cgupdate(struct ext2mount *, int); static vfs_unmount_t ext2_unmount; static vfs_root_t ext2_root; static vfs_statfs_t ext2_statfs; static vfs_sync_t ext2_sync; static vfs_vget_t ext2_vget; static vfs_fhtovp_t ext2_fhtovp; static vfs_mount_t ext2_mount; MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part"); static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure"); static struct vfsops ext2fs_vfsops = { .vfs_fhtovp = ext2_fhtovp, .vfs_mount = ext2_mount, .vfs_root = ext2_root, /* root inode via vget */ .vfs_statfs = ext2_statfs, .vfs_sync = ext2_sync, .vfs_unmount = ext2_unmount, .vfs_vget = ext2_vget, }; VFS_SET(ext2fs_vfsops, ext2fs, 0); static int ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly); static int ext2_compute_sb_data(struct vnode * devvp, struct ext2fs * es, struct m_ext2fs * fs); static const char *ext2_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "multilabel", "suiddir", "nosymfollow", "sync", "union", NULL }; /* * VFS Operations. * * mount system call */ static int ext2_mount(struct mount *mp) { struct vfsoptlist *opts; struct vnode *devvp; struct thread *td; struct ext2mount *ump = NULL; struct m_ext2fs *fs; struct nameidata nd, *ndp = &nd; accmode_t accmode; char *path, *fspec; int error, flags, len; td = curthread; opts = mp->mnt_optnew; if (vfs_filteropt(opts, ext2_opts)) return (EINVAL); vfs_getopt(opts, "fspath", (void **)&path, NULL); /* Double-check the length of path.. */ if (strlen(path) >= MAXMNTLEN) return (ENAMETOOLONG); fspec = NULL; error = vfs_getopt(opts, "from", (void **)&fspec, &len); if (!error && fspec[len - 1] != '\0') return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOEXT2(mp); fs = ump->um_e2fs; error = 0; if (fs->e2fs_ronly == 0 && vfs_flagopt(opts, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = ext2_flushfiles(mp, flags, td); if (error == 0 && fs->e2fs_wasvalid && ext2_cgupdate(ump, MNT_WAIT) == 0) { fs->e2fs->e2fs_state = htole16((le16toh(fs->e2fs->e2fs_state) | E2FS_ISCLEAN)); ext2_sbupdate(ump, MNT_WAIT); } fs->e2fs_ronly = 1; vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); } if (!error && (mp->mnt_flag & MNT_RELOAD)) error = ext2_reload(mp, td); if (error) return (error); devvp = ump->um_devvp; if (fs->e2fs_ronly && !vfs_flagopt(opts, "ro", NULL, 0)) { if (ext2_check_sb_compat(fs->e2fs, devvp->v_rdev, 0)) return (EPERM); /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp); return (error); } VOP_UNLOCK(devvp); g_topology_lock(); error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) return (error); if ((le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN) == 0 || (le16toh(fs->e2fs->e2fs_state) & E2FS_ERRORS)) { if (mp->mnt_flag & MNT_FORCE) { printf( "WARNING: %s was not properly dismounted\n", fs->e2fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->e2fs_fsmnt); return (EPERM); } } fs->e2fs->e2fs_state = htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN); (void)ext2_cgupdate(ump, MNT_WAIT); fs->e2fs_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); } if (vfs_flagopt(opts, "export", NULL, 0)) { /* Process export requests in vfs_mount.c. */ return (error); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (fspec == NULL) return (EINVAL); NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(ndp)) != 0) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (!vn_isdisk_error(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. * * XXXRW: VOP_ACCESS() enough? */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = ext2_mountfs(devvp, mp); } else { if (devvp != ump->um_devvp) { vput(devvp); return (EINVAL); /* needs translation */ } else vput(devvp); } if (error) { vrele(devvp); return (error); } ump = VFSTOEXT2(mp); fs = ump->um_e2fs; /* * Note that this strncpy() is ok because of a check at the start * of ext2_mount(). */ strncpy(fs->e2fs_fsmnt, path, MAXMNTLEN); fs->e2fs_fsmnt[MAXMNTLEN - 1] = '\0'; vfs_mountedfrom(mp, fspec); return (0); } static int ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly) { uint32_t i, mask; if (le16toh(es->e2fs_magic) != E2FS_MAGIC) { printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n", devtoname(dev), le16toh(es->e2fs_magic), E2FS_MAGIC); return (1); } if (le32toh(es->e2fs_rev) > E2FS_REV0) { mask = le32toh(es->e2fs_features_incompat) & ~(EXT2F_INCOMPAT_SUPP); if (mask) { printf("WARNING: mount of %s denied due to " "unsupported optional features:\n", devtoname(dev)); for (i = 0; i < sizeof(incompat)/sizeof(struct ext2_feature); i++) if (mask & incompat[i].mask) printf("%s ", incompat[i].name); printf("\n"); return (1); } mask = le32toh(es->e2fs_features_rocompat) & ~EXT2F_ROCOMPAT_SUPP; if (!ronly && mask) { printf("WARNING: R/W mount of %s denied due to " "unsupported optional features:\n", devtoname(dev)); for (i = 0; i < sizeof(ro_compat)/sizeof(struct ext2_feature); i++) if (mask & ro_compat[i].mask) printf("%s ", ro_compat[i].name); printf("\n"); return (1); } } return (0); } static e4fs_daddr_t ext2_cg_location(struct m_ext2fs *fs, int number) { int cg, descpb, logical_sb, has_super = 0; /* * Adjust logical superblock block number. * Godmar thinks: if the blocksize is greater than 1024, then * the superblock is logically part of block zero. */ logical_sb = fs->e2fs_bsize > SBSIZE ? 0 : 1; if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_META_BG) || number < le32toh(fs->e2fs->e3fs_first_meta_bg)) return (logical_sb + number + 1); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) descpb = fs->e2fs_bsize / sizeof(struct ext2_gd); else descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE; cg = descpb * number; if (ext2_cg_has_sb(fs, cg)) has_super = 1; return (has_super + cg * (e4fs_daddr_t)EXT2_BLOCKS_PER_GROUP(fs) + le32toh(fs->e2fs->e2fs_first_dblock)); } static int ext2_cg_validate(struct m_ext2fs *fs) { uint64_t b_bitmap; uint64_t i_bitmap; uint64_t i_tables; uint64_t first_block, last_block, last_cg_block; struct ext2_gd *gd; unsigned int i, cg_count; first_block = le32toh(fs->e2fs->e2fs_first_dblock); last_cg_block = ext2_cg_number_gdb(fs, 0); cg_count = fs->e2fs_gcount; for (i = 0; i < fs->e2fs_gcount; i++) { gd = &fs->e2fs_gd[i]; if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG) || i == fs->e2fs_gcount - 1) { last_block = fs->e2fs_bcount - 1; } else { last_block = first_block + (EXT2_BLOCKS_PER_GROUP(fs) - 1); } if ((cg_count == fs->e2fs_gcount) && !(le16toh(gd->ext4bgd_flags) & EXT2_BG_INODE_ZEROED)) cg_count = i; b_bitmap = e2fs_gd_get_b_bitmap(gd); if (b_bitmap == 0) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "block bitmap is zero", i); return (EINVAL); } if (b_bitmap <= last_cg_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "block bitmap overlaps gds", i); return (EINVAL); } if (b_bitmap < first_block || b_bitmap > last_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "block bitmap not in group", i); return (EINVAL); } i_bitmap = e2fs_gd_get_i_bitmap(gd); if (i_bitmap == 0) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode bitmap is zero", i); return (EINVAL); } if (i_bitmap <= last_cg_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode bitmap overlaps gds", i); return (EINVAL); } if (i_bitmap < first_block || i_bitmap > last_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode bitmap not in group blk", i); return (EINVAL); } i_tables = e2fs_gd_get_i_tables(gd); if (i_tables == 0) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode table is zero", i); return (EINVAL); } if (i_tables <= last_cg_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode talbes overlaps gds", i); return (EINVAL); } if (i_tables < first_block || i_tables + fs->e2fs_itpg - 1 > last_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode tables not in group blk", i); return (EINVAL); } if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG)) first_block += EXT2_BLOCKS_PER_GROUP(fs); } return (0); } /* * This computes the fields of the m_ext2fs structure from the * data in the ext2fs structure read in. */ static int ext2_compute_sb_data(struct vnode *devvp, struct ext2fs *es, struct m_ext2fs *fs) { struct buf *bp; uint32_t e2fs_descpb, e2fs_gdbcount_alloc; int i, j; int g_count = 0; int error; /* Check checksum features */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) && EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "incorrect checksum features combination"); return (EINVAL); } /* Precompute checksum seed for all metadata */ ext2_sb_csum_set_seed(fs); /* Verify sb csum if possible */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { error = ext2_sb_csum_verify(fs); if (error) { return (error); } } /* Check for block size = 1K|2K|4K */ if (le32toh(es->e2fs_log_bsize) > 2) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "bad block size"); return (EINVAL); } fs->e2fs_bshift = EXT2_MIN_BLOCK_LOG_SIZE + le32toh(es->e2fs_log_bsize); fs->e2fs_bsize = 1U << fs->e2fs_bshift; fs->e2fs_fsbtodb = le32toh(es->e2fs_log_bsize) + 1; fs->e2fs_qbmask = fs->e2fs_bsize - 1; /* Check for fragment size */ if (le32toh(es->e2fs_log_fsize) > (EXT2_MAX_FRAG_LOG_SIZE - EXT2_MIN_BLOCK_LOG_SIZE)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid log cluster size"); return (EINVAL); } fs->e2fs_fsize = EXT2_MIN_FRAG_SIZE << le32toh(es->e2fs_log_fsize); if (fs->e2fs_fsize != fs->e2fs_bsize) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "fragment size != block size"); return (EINVAL); } fs->e2fs_fpb = fs->e2fs_bsize / fs->e2fs_fsize; /* Check reserved gdt blocks for future filesystem expansion */ if (le16toh(es->e2fs_reserved_ngdb) > (fs->e2fs_bsize / 4)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "number of reserved GDT blocks too large"); return (EINVAL); } if (le32toh(es->e2fs_rev) == E2FS_REV0) { fs->e2fs_isize = E2FS_REV0_INODE_SIZE; } else { fs->e2fs_isize = le16toh(es->e2fs_inode_size); /* * Check first ino. */ if (le32toh(es->e2fs_first_ino) < EXT2_FIRSTINO) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid first ino"); return (EINVAL); } /* * Simple sanity check for superblock inode size value. */ if (EXT2_INODE_SIZE(fs) < E2FS_REV0_INODE_SIZE || EXT2_INODE_SIZE(fs) > fs->e2fs_bsize || (fs->e2fs_isize & (fs->e2fs_isize - 1)) != 0) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid inode size"); return (EINVAL); } } /* Check group descriptors */ if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT) && le16toh(es->e3fs_desc_size) != E2FS_64BIT_GD_SIZE) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "unsupported 64bit descriptor size"); return (EINVAL); } fs->e2fs_bpg = le32toh(es->e2fs_bpg); fs->e2fs_fpg = le32toh(es->e2fs_fpg); if (fs->e2fs_bpg == 0 || fs->e2fs_fpg == 0) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "zero blocks/fragments per group"); return (EINVAL); } else if (fs->e2fs_bpg != fs->e2fs_fpg) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "blocks per group not equal fragments per group"); return (EINVAL); } if (fs->e2fs_bpg != fs->e2fs_bsize * 8) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "non-standard group size unsupported"); return (EINVAL); } fs->e2fs_ipb = fs->e2fs_bsize / EXT2_INODE_SIZE(fs); if (fs->e2fs_ipb == 0 || fs->e2fs_ipb > fs->e2fs_bsize / E2FS_REV0_INODE_SIZE) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "bad inodes per block size"); return (EINVAL); } fs->e2fs_ipg = le32toh(es->e2fs_ipg); if (fs->e2fs_ipg < fs->e2fs_ipb || fs->e2fs_ipg > fs->e2fs_bsize * 8) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid inodes per group"); return (EINVAL); } fs->e2fs_itpg = fs->e2fs_ipg / fs->e2fs_ipb; fs->e2fs_bcount = le32toh(es->e2fs_bcount); fs->e2fs_rbcount = le32toh(es->e2fs_rbcount); fs->e2fs_fbcount = le32toh(es->e2fs_fbcount); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { fs->e2fs_bcount |= (uint64_t)(le32toh(es->e4fs_bcount_hi)) << 32; fs->e2fs_rbcount |= (uint64_t)(le32toh(es->e4fs_rbcount_hi)) << 32; fs->e2fs_fbcount |= (uint64_t)(le32toh(es->e4fs_fbcount_hi)) << 32; } if (fs->e2fs_rbcount > fs->e2fs_bcount || fs->e2fs_fbcount > fs->e2fs_bcount) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid block count"); return (EINVAL); } fs->e2fs_ficount = le32toh(es->e2fs_ficount); if (fs->e2fs_ficount > le32toh(es->e2fs_icount)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid number of free inodes"); return (EINVAL); } if (le32toh(es->e2fs_first_dblock) >= fs->e2fs_bcount) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "first data block out of range"); return (EINVAL); } fs->e2fs_gcount = howmany(fs->e2fs_bcount - le32toh(es->e2fs_first_dblock), EXT2_BLOCKS_PER_GROUP(fs)); if (fs->e2fs_gcount > ((uint64_t)1 << 32) - EXT2_DESCS_PER_BLOCK(fs)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "groups count too large"); return (EINVAL); } /* Check for extra isize in big inodes. */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_EXTRA_ISIZE) && EXT2_INODE_SIZE(fs) < sizeof(struct ext2fs_dinode)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "no space for extra inode timestamps"); return (EINVAL); } /* s_resuid / s_resgid ? */ if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { e2fs_descpb = fs->e2fs_bsize / E2FS_64BIT_GD_SIZE; e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount, e2fs_descpb); } else { e2fs_descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE; e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount, fs->e2fs_bsize / sizeof(struct ext2_gd)); } fs->e2fs_gdbcount = howmany(fs->e2fs_gcount, e2fs_descpb); fs->e2fs_gd = malloc(e2fs_gdbcount_alloc * fs->e2fs_bsize, M_EXT2MNT, M_WAITOK | M_ZERO); fs->e2fs_contigdirs = malloc(fs->e2fs_gcount * sizeof(*fs->e2fs_contigdirs), M_EXT2MNT, M_WAITOK | M_ZERO); for (i = 0; i < fs->e2fs_gdbcount; i++) { error = bread(devvp, fsbtodb(fs, ext2_cg_location(fs, i)), fs->e2fs_bsize, NOCRED, &bp); if (error) { /* * fs->e2fs_gd and fs->e2fs_contigdirs * will be freed later by the caller, * because this function could be called from * MNT_UPDATE path. */ return (error); } if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { memcpy(&fs->e2fs_gd[ i * fs->e2fs_bsize / sizeof(struct ext2_gd)], bp->b_data, fs->e2fs_bsize); } else { for (j = 0; j < e2fs_descpb && g_count < fs->e2fs_gcount; j++, g_count++) memcpy(&fs->e2fs_gd[g_count], bp->b_data + j * E2FS_REV0_GD_SIZE, E2FS_REV0_GD_SIZE); } brelse(bp); bp = NULL; } /* Validate cgs consistency */ error = ext2_cg_validate(fs); if (error) return (error); /* Verfy cgs csum */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) || EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { error = ext2_gd_csum_verify(fs, devvp->v_rdev); if (error) return (error); } /* Initialization for the ext2 Orlov allocator variant. */ fs->e2fs_total_dir = 0; for (i = 0; i < fs->e2fs_gcount; i++) fs->e2fs_total_dir += e2fs_gd_get_ndirs(&fs->e2fs_gd[i]); if (le32toh(es->e2fs_rev) == E2FS_REV0 || !EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_LARGEFILE)) fs->e2fs_maxfilesize = 0x7fffffff; else { fs->e2fs_maxfilesize = 0xffffffffffff; if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_HUGE_FILE)) fs->e2fs_maxfilesize = 0x7fffffffffffffff; } if (le32toh(es->e4fs_flags) & E2FS_UNSIGNED_HASH) { fs->e2fs_uhash = 3; } else if ((le32toh(es->e4fs_flags) & E2FS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_UNSIGNED_HASH); fs->e2fs_uhash = 3; #else es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_SIGNED_HASH); #endif } if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) error = ext2_sb_csum_verify(fs); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) invalidate all cluster summary information. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. * XXX we are missing some steps, in particular # 3, this has to be reviewed. */ static int ext2_reload(struct mount *mp, struct thread *td) { struct vnode *vp, *mvp, *devvp; struct inode *ip; struct buf *bp; struct ext2fs *es; struct m_ext2fs *fs; struct csum *sump; int error, i; int32_t *lp; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOEXT2(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ext2_reload: dirty1"); VOP_UNLOCK(devvp); /* * Step 2: re-read superblock from disk. * constants have been adjusted for ext2 */ if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) return (error); es = (struct ext2fs *)bp->b_data; if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) { brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOEXT2(mp)->um_e2fs; bcopy(bp->b_data, fs->e2fs, sizeof(struct ext2fs)); if ((error = ext2_compute_sb_data(devvp, es, fs)) != 0) { brelse(bp); return (error); } #ifdef UNKLAR if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; #endif brelse(bp); /* * Step 3: invalidate all cluster summary information. */ if (fs->e2fs_contigsumsize > 0) { lp = fs->e2fs_maxcluster; sump = fs->e2fs_clustersum; for (i = 0; i < fs->e2fs_gcount; i++, sump++) { *lp++ = fs->e2fs_contigsumsize; sump->cs_init = 0; bzero(sump->cs_sum, fs->e2fs_contigsumsize + 1); } } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ext2_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->e2fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp); vrele(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number)), ip); brelse(bp); VOP_UNLOCK(vp); vrele(vp); if (error) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } } return (0); } /* * Common code for mount and mountroot. */ static int ext2_mountfs(struct vnode *devvp, struct mount *mp) { struct ext2mount *ump; struct buf *bp; struct m_ext2fs *fs; struct ext2fs *es; struct cdev *dev = devvp->v_rdev; struct g_consumer *cp; struct bufobj *bo; struct csum *sump; int error; int ronly; int i; u_long size; int32_t *lp; int32_t e2fs_maxcontig; ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0); /* XXX: use VOP_ACESS to check FS perms */ g_topology_lock(); error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1); g_topology_unlock(); VOP_UNLOCK(devvp); if (error) return (error); /* XXX: should we check for some sectorsize or 512 instead? */ if (((SBSIZE % cp->provider->sectorsize) != 0) || (SBSIZE < cp->provider->sectorsize)) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); return (EINVAL); } bo = &devvp->v_bufobj; bo->bo_private = cp; bo->bo_ops = g_vfs_bufops; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > maxphys) mp->mnt_iosize_max = maxphys; bp = NULL; ump = NULL; if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) goto out; es = (struct ext2fs *)bp->b_data; if (ext2_check_sb_compat(es, dev, ronly) != 0) { error = EINVAL; /* XXX needs translation */ goto out; } if ((le16toh(es->e2fs_state) & E2FS_ISCLEAN) == 0 || (le16toh(es->e2fs_state) & E2FS_ERRORS)) { if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf( "WARNING: Filesystem was not properly dismounted\n"); } else { printf( "WARNING: R/W mount denied. Filesystem is not clean - run fsck\n"); error = EPERM; goto out; } } ump = malloc(sizeof(*ump), M_EXT2MNT, M_WAITOK | M_ZERO); /* * I don't know whether this is the right strategy. Note that * we dynamically allocate both an m_ext2fs and an ext2fs * while Linux keeps the super block in a locked buffer. */ ump->um_e2fs = malloc(sizeof(struct m_ext2fs), M_EXT2MNT, M_WAITOK | M_ZERO); ump->um_e2fs->e2fs = malloc(sizeof(struct ext2fs), M_EXT2MNT, M_WAITOK); mtx_init(EXT2_MTX(ump), "EXT2FS", "EXT2FS Lock", MTX_DEF); bcopy(es, ump->um_e2fs->e2fs, (u_int)sizeof(struct ext2fs)); if ((error = ext2_compute_sb_data(devvp, ump->um_e2fs->e2fs, ump->um_e2fs))) goto out; /* * Calculate the maximum contiguous blocks and size of cluster summary * array. In FFS this is done by newfs; however, the superblock * in ext2fs doesn't have these variables, so we can calculate * them here. */ e2fs_maxcontig = MAX(1, maxphys / ump->um_e2fs->e2fs_bsize); ump->um_e2fs->e2fs_contigsumsize = MIN(e2fs_maxcontig, EXT2_MAXCONTIG); + ump->um_e2fs->e2fs_maxsymlinklen = EXT2_MAXSYMLINKLEN; if (ump->um_e2fs->e2fs_contigsumsize > 0) { size = ump->um_e2fs->e2fs_gcount * sizeof(int32_t); ump->um_e2fs->e2fs_maxcluster = malloc(size, M_EXT2MNT, M_WAITOK); size = ump->um_e2fs->e2fs_gcount * sizeof(struct csum); ump->um_e2fs->e2fs_clustersum = malloc(size, M_EXT2MNT, M_WAITOK); lp = ump->um_e2fs->e2fs_maxcluster; sump = ump->um_e2fs->e2fs_clustersum; for (i = 0; i < ump->um_e2fs->e2fs_gcount; i++, sump++) { *lp++ = ump->um_e2fs->e2fs_contigsumsize; sump->cs_init = 0; sump->cs_sum = malloc((ump->um_e2fs->e2fs_contigsumsize + 1) * sizeof(int32_t), M_EXT2MNT, M_WAITOK | M_ZERO); } } brelse(bp); bp = NULL; fs = ump->um_e2fs; fs->e2fs_ronly = ronly; /* ronly is set according to mnt_flags */ /* * If the fs is not mounted read-only, make sure the super block is * always written back on a sync(). */ fs->e2fs_wasvalid = le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN ? 1 : 0; if (ronly == 0) { fs->e2fs_fmod = 1; /* mark it modified and set fs invalid */ fs->e2fs->e2fs_state = htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN); } mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; - mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_bo = &devvp->v_bufobj; ump->um_cp = cp; /* * Setting those two parameters allowed us to use * ufs_bmap w/o changse! */ ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs); ump->um_bptrtodb = le32toh(fs->e2fs->e2fs_log_bsize) + 1; ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs); if (ronly == 0) ext2_sbupdate(ump, MNT_WAIT); /* * Initialize filesystem stat information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); return (0); out: if (bp) brelse(bp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (ump) { mtx_destroy(EXT2_MTX(ump)); free(ump->um_e2fs->e2fs_gd, M_EXT2MNT); free(ump->um_e2fs->e2fs_contigdirs, M_EXT2MNT); free(ump->um_e2fs->e2fs, M_EXT2MNT); free(ump->um_e2fs, M_EXT2MNT); free(ump, M_EXT2MNT); mp->mnt_data = NULL; } return (error); } /* * Unmount system call. */ static int ext2_unmount(struct mount *mp, int mntflags) { struct ext2mount *ump; struct m_ext2fs *fs; struct csum *sump; int error, flags, i, ronly; flags = 0; if (mntflags & MNT_FORCE) { if (mp->mnt_flag & MNT_ROOTFS) return (EINVAL); flags |= FORCECLOSE; } if ((error = ext2_flushfiles(mp, flags, curthread)) != 0) return (error); ump = VFSTOEXT2(mp); fs = ump->um_e2fs; ronly = fs->e2fs_ronly; if (ronly == 0 && ext2_cgupdate(ump, MNT_WAIT) == 0) { if (fs->e2fs_wasvalid) fs->e2fs->e2fs_state = htole16(le16toh(fs->e2fs->e2fs_state) | E2FS_ISCLEAN); ext2_sbupdate(ump, MNT_WAIT); } g_topology_lock(); g_vfs_close(ump->um_cp); g_topology_unlock(); vrele(ump->um_devvp); sump = fs->e2fs_clustersum; for (i = 0; i < fs->e2fs_gcount; i++, sump++) free(sump->cs_sum, M_EXT2MNT); free(fs->e2fs_clustersum, M_EXT2MNT); free(fs->e2fs_maxcluster, M_EXT2MNT); free(fs->e2fs_gd, M_EXT2MNT); free(fs->e2fs_contigdirs, M_EXT2MNT); free(fs->e2fs, M_EXT2MNT); free(fs, M_EXT2MNT); free(ump, M_EXT2MNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } /* * Flush out all the files in a filesystem. */ static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td) { int error; error = vflush(mp, 0, flags, td); return (error); } /* * Get filesystem statistics. */ int ext2_statfs(struct mount *mp, struct statfs *sbp) { struct ext2mount *ump; struct m_ext2fs *fs; uint32_t overhead, overhead_per_group, ngdb; int i, ngroups; ump = VFSTOEXT2(mp); fs = ump->um_e2fs; if (le16toh(fs->e2fs->e2fs_magic) != E2FS_MAGIC) panic("ext2_statfs"); /* * Compute the overhead (FS structures) */ overhead_per_group = 1 /* block bitmap */ + 1 /* inode bitmap */ + fs->e2fs_itpg; overhead = le32toh(fs->e2fs->e2fs_first_dblock) + fs->e2fs_gcount * overhead_per_group; if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 && le32toh(fs->e2fs->e2fs_features_rocompat) & EXT2F_ROCOMPAT_SPARSESUPER) { for (i = 0, ngroups = 0; i < fs->e2fs_gcount; i++) { if (ext2_cg_has_sb(fs, i)) ngroups++; } } else { ngroups = fs->e2fs_gcount; } ngdb = fs->e2fs_gdbcount; if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 && le32toh(fs->e2fs->e2fs_features_compat) & EXT2F_COMPAT_RESIZE) ngdb += le16toh(fs->e2fs->e2fs_reserved_ngdb); overhead += ngroups * (1 /* superblock */ + ngdb); sbp->f_bsize = EXT2_FRAG_SIZE(fs); sbp->f_iosize = EXT2_BLOCK_SIZE(fs); sbp->f_blocks = fs->e2fs_bcount - overhead; sbp->f_bfree = fs->e2fs_fbcount; sbp->f_bavail = sbp->f_bfree - fs->e2fs_rbcount; sbp->f_files = le32toh(fs->e2fs->e2fs_icount); sbp->f_ffree = fs->e2fs_ficount; return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ext2_sync(struct mount *mp, int waitfor) { struct vnode *mvp, *vp; struct thread *td; struct inode *ip; struct ext2mount *ump = VFSTOEXT2(mp); struct m_ext2fs *fs; int error, allerror = 0; td = curthread; fs = ump->um_e2fs; if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) { /* XXX */ panic("ext2_sync: rofs mod fs=%s", fs->e2fs_fsmnt); } /* * Write back each (modified) inode. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && (vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY)) { VI_UNLOCK(vp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK); if (error) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } if ((error = VOP_FSYNC(vp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(vp); vrele(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor != MNT_LAZY) { vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(ump->um_devvp); } /* * Write back modified superblock. */ if (fs->e2fs_fmod != 0) { fs->e2fs_fmod = 0; fs->e2fs->e2fs_wtime = htole32(time_second); if ((error = ext2_cgupdate(ump, waitfor)) != 0) allerror = error; } return (allerror); } /* * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it * in from disk. If it is in core, wait for the lock bit to clear, then * return the inode locked. Detection and handling of mount points must be * done by the calling routine. */ static int ext2_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) { struct m_ext2fs *fs; struct inode *ip; struct ext2mount *ump; struct buf *bp; struct vnode *vp; struct thread *td; unsigned int i, used_blocks; int error; td = curthread; error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); ump = VFSTOEXT2(mp); ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) { *vpp = NULL; free(ip, M_EXT2NODE); return (error); } vp->v_data = ip; ip->i_vnode = vp; ip->i_e2fs = fs = ump->um_e2fs; ip->i_ump = ump; ip->i_number = ino; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); error = insmntque(vp, mp); if (error != 0) { free(ip, M_EXT2NODE); *vpp = NULL; return (error); } error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } /* convert ext2 inode to dinode */ error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ino)), ip); if (error) { brelse(bp); vput(vp); *vpp = NULL; return (error); } ip->i_block_group = ino_to_cg(fs, ino); ip->i_next_alloc_block = 0; ip->i_next_alloc_goal = 0; /* * Now we want to make sure that block pointers for unused * blocks are zeroed out - ext2_balloc depends on this * although for regular files and directories only * * If IN_E4EXTENTS is enabled, unused blocks are not zeroed * out because we could corrupt the extent tree. */ if (!(ip->i_flag & IN_E4EXTENTS) && (S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode))) { used_blocks = howmany(ip->i_size, fs->e2fs_bsize); for (i = used_blocks; i < EXT2_NDIR_BLOCKS; i++) ip->i_db[i] = 0; } #ifdef EXT2FS_PRINT_EXTENTS ext2_print_inode(ip); ext4_ext_print_extent_tree_status(ip); #endif bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ext2_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ext2_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct inode *ip; struct ufid *ufhp; struct vnode *nvp; struct m_ext2fs *fs; int error; ufhp = (struct ufid *)fhp; fs = VFSTOEXT2(mp)->um_e2fs; if (ufhp->ufid_ino < EXT2_ROOTINO || ufhp->ufid_ino > fs->e2fs_gcount * fs->e2fs_ipg) return (ESTALE); error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp); if (error) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen || ip->i_nlink <= 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, 0, curthread); return (0); } /* * Write a superblock and associated information back to disk. */ static int ext2_sbupdate(struct ext2mount *mp, int waitfor) { struct m_ext2fs *fs = mp->um_e2fs; struct ext2fs *es = fs->e2fs; struct buf *bp; int error = 0; es->e2fs_bcount = htole32(fs->e2fs_bcount & 0xffffffff); es->e2fs_rbcount = htole32(fs->e2fs_rbcount & 0xffffffff); es->e2fs_fbcount = htole32(fs->e2fs_fbcount & 0xffffffff); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { es->e4fs_bcount_hi = htole32(fs->e2fs_bcount >> 32); es->e4fs_rbcount_hi = htole32(fs->e2fs_rbcount >> 32); es->e4fs_fbcount_hi = htole32(fs->e2fs_fbcount >> 32); } es->e2fs_ficount = htole32(fs->e2fs_ficount); if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) ext2_sb_csum_set(fs); bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0, 0); bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2fs)); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); /* * The buffers for group descriptors, inode bitmaps and block bitmaps * are not busy at this point and are (hopefully) written by the * usual sync mechanism. No need to write them here. */ return (error); } int ext2_cgupdate(struct ext2mount *mp, int waitfor) { struct m_ext2fs *fs = mp->um_e2fs; struct buf *bp; int i, j, g_count = 0, error = 0, allerror = 0; allerror = ext2_sbupdate(mp, waitfor); /* Update gd csums */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) || EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) ext2_gd_csum_set(fs); for (i = 0; i < fs->e2fs_gdbcount; i++) { bp = getblk(mp->um_devvp, fsbtodb(fs, ext2_cg_location(fs, i)), fs->e2fs_bsize, 0, 0, 0); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { memcpy(bp->b_data, &fs->e2fs_gd[ i * fs->e2fs_bsize / sizeof(struct ext2_gd)], fs->e2fs_bsize); } else { for (j = 0; j < fs->e2fs_bsize / E2FS_REV0_GD_SIZE && g_count < fs->e2fs_gcount; j++, g_count++) memcpy(bp->b_data + j * E2FS_REV0_GD_SIZE, &fs->e2fs_gd[g_count], E2FS_REV0_GD_SIZE); } if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); } if (!allerror && error) allerror = error; return (allerror); } /* * Return the root of a filesystem. */ static int ext2_root(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *nvp; int error; error = VFS_VGET(mp, EXT2_ROOTINO, LK_EXCLUSIVE, &nvp); if (error) return (error); *vpp = nvp; return (0); } diff --git a/sys/fs/ext2fs/ext2_vnops.c b/sys/fs/ext2fs/ext2_vnops.c index e52db1416890..d26044e11929 100644 --- a/sys/fs/ext2fs/ext2_vnops.c +++ b/sys/fs/ext2fs/ext2_vnops.c @@ -1,2354 +1,2355 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.7 (Berkeley) 2/3/94 * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 * $FreeBSD$ */ #include "opt_suiddir.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DECLARE(ext2fs); /* * ext2fs trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(ext2fs, , vnops, trace, "int", "char*"); static int ext2_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *); static void ext2_itimes_locked(struct vnode *); static vop_access_t ext2_access; static int ext2_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ext2_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ext2_close; static vop_create_t ext2_create; static vop_fsync_t ext2_fsync; static vop_getattr_t ext2_getattr; static vop_ioctl_t ext2_ioctl; static vop_link_t ext2_link; static vop_mkdir_t ext2_mkdir; static vop_mknod_t ext2_mknod; static vop_open_t ext2_open; static vop_pathconf_t ext2_pathconf; static vop_print_t ext2_print; static vop_read_t ext2_read; static vop_readlink_t ext2_readlink; static vop_remove_t ext2_remove; static vop_rename_t ext2_rename; static vop_rmdir_t ext2_rmdir; static vop_setattr_t ext2_setattr; static vop_strategy_t ext2_strategy; static vop_symlink_t ext2_symlink; static vop_write_t ext2_write; static vop_deleteextattr_t ext2_deleteextattr; static vop_getextattr_t ext2_getextattr; static vop_listextattr_t ext2_listextattr; static vop_setextattr_t ext2_setextattr; static vop_vptofh_t ext2_vptofh; static vop_close_t ext2fifo_close; static vop_kqfilter_t ext2fifo_kqfilter; /* Global vfs data structures for ext2. */ struct vop_vector ext2_vnodeops = { .vop_default = &default_vnodeops, .vop_access = ext2_access, .vop_bmap = ext2_bmap, .vop_cachedlookup = ext2_lookup, .vop_close = ext2_close, .vop_create = ext2_create, .vop_fsync = ext2_fsync, .vop_getpages = vnode_pager_local_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_getattr = ext2_getattr, .vop_inactive = ext2_inactive, .vop_ioctl = ext2_ioctl, .vop_link = ext2_link, .vop_lookup = vfs_cache_lookup, .vop_mkdir = ext2_mkdir, .vop_mknod = ext2_mknod, .vop_open = ext2_open, .vop_pathconf = ext2_pathconf, .vop_poll = vop_stdpoll, .vop_print = ext2_print, .vop_read = ext2_read, .vop_readdir = ext2_readdir, .vop_readlink = ext2_readlink, .vop_reallocblks = ext2_reallocblks, .vop_reclaim = ext2_reclaim, .vop_remove = ext2_remove, .vop_rename = ext2_rename, .vop_rmdir = ext2_rmdir, .vop_setattr = ext2_setattr, .vop_strategy = ext2_strategy, .vop_symlink = ext2_symlink, .vop_write = ext2_write, .vop_deleteextattr = ext2_deleteextattr, .vop_getextattr = ext2_getextattr, .vop_listextattr = ext2_listextattr, .vop_setextattr = ext2_setextattr, #ifdef UFS_ACL .vop_getacl = ext2_getacl, .vop_setacl = ext2_setacl, .vop_aclcheck = ext2_aclcheck, #endif /* UFS_ACL */ .vop_vptofh = ext2_vptofh, }; VFS_VOP_VECTOR_REGISTER(ext2_vnodeops); struct vop_vector ext2_fifoops = { .vop_default = &fifo_specops, .vop_access = ext2_access, .vop_close = ext2fifo_close, .vop_fsync = ext2_fsync, .vop_getattr = ext2_getattr, .vop_inactive = ext2_inactive, .vop_kqfilter = ext2fifo_kqfilter, .vop_pathconf = ext2_pathconf, .vop_print = ext2_print, .vop_read = VOP_PANIC, .vop_reclaim = ext2_reclaim, .vop_setattr = ext2_setattr, .vop_write = VOP_PANIC, .vop_vptofh = ext2_vptofh, }; VFS_VOP_VECTOR_REGISTER(ext2_fifoops); /* * A virgin directory (no blushing please). * Note that the type and namlen fields are reversed relative to ext2. * Also, we don't use `struct odirtemplate', since it would just cause * endianness problems. */ static struct dirtemplate mastertemplate = { 0, htole16(12), 1, EXT2_FT_DIR, ".", 0, htole16(DIRBLKSIZ - 12), 2, EXT2_FT_DIR, ".." }; static struct dirtemplate omastertemplate = { 0, htole16(12), 1, EXT2_FT_UNKNOWN, ".", 0, htole16(DIRBLKSIZ - 12), 2, EXT2_FT_UNKNOWN, ".." }; static void ext2_itimes_locked(struct vnode *vp) { struct inode *ip; struct timespec ts; ASSERT_VI_LOCKED(vp, __func__); ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_type == VBLK || vp->v_type == VCHR)) ip->i_flag |= IN_LAZYMOD; else ip->i_flag |= IN_MODIFIED; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { vfs_timestamp(&ts); if (ip->i_flag & IN_ACCESS) { ip->i_atime = ts.tv_sec; ip->i_atimensec = ts.tv_nsec; } if (ip->i_flag & IN_UPDATE) { ip->i_mtime = ts.tv_sec; ip->i_mtimensec = ts.tv_nsec; ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) { ip->i_ctime = ts.tv_sec; ip->i_ctimensec = ts.tv_nsec; } } ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } void ext2_itimes(struct vnode *vp) { VI_LOCK(vp); ext2_itimes_locked(vp); VI_UNLOCK(vp); } /* * Create a regular file */ static int ext2_create(struct vop_create_args *ap) { int error; error = ext2_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); if (error != 0) return (error); if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp); return (0); } static int ext2_open(struct vop_open_args *ap) { if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); vnode_create_vobject(ap->a_vp, VTOI(ap->a_vp)->i_size, ap->a_td); return (0); } /* * Close called. * * Update the times on the inode. */ static int ext2_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; VI_LOCK(vp); if (vp->v_usecount > 1) ext2_itimes_locked(vp); VI_UNLOCK(vp); return (0); } static int ext2_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; if (vp->v_type == VBLK || vp->v_type == VCHR) return (EOPNOTSUPP); /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } /* If immutable bit set, nobody gets to write it. */ if ((accmode & VWRITE) && (ip->i_flags & (SF_IMMUTABLE | SF_SNAPSHOT))) return (EPERM); error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_accmode, ap->a_cred); return (error); } static int ext2_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct vattr *vap = ap->a_vap; ext2_itimes(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ip->i_devvp->v_rdev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = ip->i_rdev; vap->va_size = ip->i_size; vap->va_atime.tv_sec = ip->i_atime; vap->va_atime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_atimensec : 0; vap->va_mtime.tv_sec = ip->i_mtime; vap->va_mtime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_mtimensec : 0; vap->va_ctime.tv_sec = ip->i_ctime; vap->va_ctime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_ctimensec : 0; if E2DI_HAS_XTIME(ip) { vap->va_birthtime.tv_sec = ip->i_birthtime; vap->va_birthtime.tv_nsec = ip->i_birthnsec; } vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_bytes = dbtob((u_quad_t)ip->i_blocks); vap->va_type = IFTOVT(ip->i_mode); vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ static int ext2_setattr(struct vop_setattr_args *ap) { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct thread *td = curthread; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { /* Disallow flags not supported by ext2fs. */ if (vap->va_flags & ~(SF_APPEND | SF_IMMUTABLE | UF_NODUMP)) return (EOPNOTSUPP); if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Unprivileged processes and privileged processes in * jail() are not permitted to unset system flags, or * modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } } else { if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND) || ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE)) return (EPERM); } ip->i_flags = vap->va_flags; ip->i_flag |= IN_CHANGE; if (ip->i_flags & (IMMUTABLE | APPEND)) return (0); } if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ext2_chown(vp, vap->va_uid, vap->va_gid, cred, td)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } if ((error = ext2_truncate(vp, vap->va_size, 0, cred, td)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * From utimes(2): * If times is NULL, ... The caller must be the owner of * the file, have permission to write the file, or be the * super-user. * If times is non-NULL, ... The caller must be the owner of * the file or be the super-user. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred, td)))) return (error); ip->i_flag |= IN_CHANGE | IN_MODIFIED; if (vap->va_atime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_ACCESS; ip->i_atime = vap->va_atime.tv_sec; ip->i_atimensec = vap->va_atime.tv_nsec; } if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_UPDATE; ip->i_mtime = vap->va_mtime.tv_sec; ip->i_mtimensec = vap->va_mtime.tv_nsec; } ip->i_birthtime = vap->va_birthtime.tv_sec; ip->i_birthnsec = vap->va_birthtime.tv_nsec; error = ext2_update(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = ext2_chmod(vp, (int)vap->va_mode, cred, td); } return (error); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ext2_chmod(struct vnode *vp, int mode, struct ucred *cred, struct thread *td) { struct inode *ip = VTOI(vp); int error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { error = priv_check_cred(cred, PRIV_VFS_STICKYFILE); if (error) return (EFTYPE); } if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID); if (error) return (error); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ext2_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct thread *td) { struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * To change the owner of a file, or change the group of a file * to a group of which we are not a member, the caller must * have privilege. */ if (uid != ip->i_uid || (gid != ip->i_gid && !groupmember(gid, cred))) { error = priv_check_cred(cred, PRIV_VFS_CHOWN); if (error) return (error); } ogid = ip->i_gid; ouid = ip->i_uid; ip->i_gid = gid; ip->i_uid = uid; ip->i_flag |= IN_CHANGE; if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID) != 0) ip->i_mode &= ~(ISUID | ISGID); } return (0); } /* * Synch an open file. */ /* ARGSUSED */ static int ext2_fsync(struct vop_fsync_args *ap) { /* * Flush all dirty buffers associated with a vnode. */ vop_stdfsync(ap); return (ext2_update(ap->a_vp, ap->a_waitfor == MNT_WAIT)); } /* * Mknod vnode call */ /* ARGSUSED */ static int ext2_mknod(struct vop_mknod_args *ap) { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; error = ext2_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ if (!(ip->i_flag & IN_E4EXTENTS)) ip->i_rdev = vap->va_rdev; } /* * Remove inode, then reload it through VFS_VGET so it is * checked to see if it is an alias of an existing entry in * the inode cache. XXX I don't believe this is necessary now. */ (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); vput(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { *vpp = NULL; return (error); } return (0); } static int ext2_remove(struct vop_remove_args *ap) { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } error = ext2_dirremove(dvp, ap->a_cnp); if (error == 0) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: return (error); } /* * link vnode call */ static int ext2_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; int error; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_link: no name"); #endif ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= EXT4_LINK_MAX) { error = EMLINK; goto out; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } ip->i_nlink++; ip->i_flag |= IN_CHANGE; error = ext2_update(vp, !DOINGASYNC(vp)); if (!error) error = ext2_direnter(ip, tdvp, cnp); if (error) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: return (error); } static int ext2_inc_nlink(struct inode *ip) { ip->i_nlink++; if (S_ISDIR(ip->i_mode) && EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK) && ip->i_nlink > 1) { if (ip->i_nlink >= EXT4_LINK_MAX || ip->i_nlink == 2) ip->i_nlink = 1; } else if (ip->i_nlink > EXT4_LINK_MAX) { ip->i_nlink--; return (EMLINK); } return (0); } static void ext2_dec_nlink(struct inode *ip) { if (!S_ISDIR(ip->i_mode) || ip->i_nlink > 2) ip->i_nlink--; } /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ static int ext2_rename(struct vop_rename_args *ap) { struct vnode *tvp = ap->a_tvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct inode *ip, *xp, *dp; struct dirtemplate *dirbuf; int doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0; u_char namlen; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ext2_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto abortit; } /* * Renaming a file to itself has no effect. The upper layers should * not call us in that case. Temporarily just warn if they do. */ if (fvp == tvp) { SDT_PROBE2(ext2fs, , vnops, trace, 1, "rename: fvp == tvp (can't happen)"); error = 0; goto abortit; } if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto abortit; dp = VTOI(fdvp); ip = VTOI(fvp); if (ip->i_nlink >= EXT4_LINK_MAX && !EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) { VOP_UNLOCK(fvp); error = EMLINK; goto abortit; } if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { VOP_UNLOCK(fvp); error = EPERM; goto abortit; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || (ip->i_flag & IN_RENAME)) { VOP_UNLOCK(fvp); error = EINVAL; goto abortit; } ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory++; } vrele(fdvp); /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ ext2_inc_nlink(ip); ip->i_flag |= IN_CHANGE; if ((error = ext2_update(fvp, !DOINGASYNC(fvp))) != 0) { VOP_UNLOCK(fvp); goto bad; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to checkpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); VOP_UNLOCK(fvp); if (oldparent != dp->i_number) newparent = dp->i_number; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); error = ext2_checkpath(ip, dp, tcnp->cn_cred); if (error) goto out; VREF(tdvp); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; vrele(tdvp); dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); } /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (xp == NULL) { if (dp->i_devvp != ip->i_devvp) panic("ext2_rename: EXDEV"); /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. */ if (doingdirectory && newparent) { error = ext2_inc_nlink(dp); if (error) goto bad; dp->i_flag |= IN_CHANGE; error = ext2_update(tdvp, !DOINGASYNC(tdvp)); if (error) goto bad; } error = ext2_direnter(ip, tdvp, tcnp); if (error) { if (doingdirectory && newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; (void)ext2_update(tdvp, 1); } goto bad; } vput(tdvp); } else { if (xp->i_devvp != dp->i_devvp || xp->i_devvp != ip->i_devvp) panic("ext2_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (xp->i_number == ip->i_number) panic("ext2_rename: same file"); /* * If the parent directory is "sticky", then the user must * own the parent directory, or the destination of the rename, * otherwise the destination may not be changed (except by * root). This implements append-only directories. */ if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 && tcnp->cn_cred->cr_uid != dp->i_uid && xp->i_uid != tcnp->cn_cred->cr_uid) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode & IFMT) == IFDIR) { if (!ext2_dirempty(xp, dp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = ext2_dirrewrite(dp, ip, tcnp); if (error) goto bad; /* * If the target directory is in the same * directory as the source directory, * decrement the link count on the parent * of the target directory. */ if (doingdirectory && !newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; } vput(tdvp); /* * Adjust the link count of the target to * reflect the dirrewrite above. If this is * a directory it is empty and there are * no links to it, so we can squash the inode and * any space associated with it. We disallowed * renaming over top of a directory with links to * it above, as the remaining link would point to * a directory without "." or ".." entries. */ ext2_dec_nlink(xp); if (doingdirectory) { if (xp->i_nlink > 2) panic("ext2_rename: linked directory"); error = ext2_truncate(tvp, (off_t)0, IO_SYNC, tcnp->cn_cred, tcnp->cn_thread); xp->i_nlink = 0; } xp->i_flag |= IN_CHANGE; vput(tvp); xp = NULL; } /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; VREF(fdvp); error = relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); } else { /* * From name has disappeared. IN_RENAME is not sufficient * to protect against directory races due to timing windows, * so we can't panic here. */ vrele(ap->a_fvp); return (0); } /* * Ensure that the directory entry still exists and has not * changed while the new name has been entered. If the source is * a file then the entry may have been unlinked or renamed. In * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; its link * count of three would cause a rmdir to fail with ENOTEMPTY. * The IN_RENAME flag ensures that it cannot be moved by another * rename. */ if (xp != ip) { /* * From name resolves to a different inode. IN_RENAME is * not sufficient protection against timing window races * so we can't panic here. */ } else { /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; dirbuf = malloc(dp->i_e2fs->e2fs_bsize, M_TEMP, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, fvp, (caddr_t)dirbuf, ip->i_e2fs->e2fs_bsize, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, tcnp->cn_cred, NOCRED, NULL, NULL); if (error == 0) { /* Like ufs little-endian: */ namlen = dirbuf->dotdot_type; if (namlen != 2 || dirbuf->dotdot_name[0] != '.' || dirbuf->dotdot_name[1] != '.') { ext2_dirbad(xp, (doff_t)12, "rename: mangled dir"); } else { dirbuf->dotdot_ino = htole32(newparent); /* * dirblock 0 could be htree root, * try both csum update functions. */ ext2_dirent_csum_set(ip, (struct ext2fs_direct_2 *)dirbuf); ext2_dx_csum_set(ip, (struct ext2fs_direct_2 *)dirbuf); (void)vn_rdwr(UIO_WRITE, fvp, (caddr_t)dirbuf, ip->i_e2fs->e2fs_bsize, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, tcnp->cn_cred, NOCRED, NULL, NULL); cache_purge(fdvp); } } free(dirbuf, M_TEMP); } error = ext2_dirremove(fdvp, fcnp); if (!error) { ext2_dec_nlink(xp); xp->i_flag |= IN_CHANGE; } xp->i_flag &= ~IN_RENAME; } if (dp) vput(fdvp); if (xp) vput(fvp); vrele(ap->a_fvp); return (error); bad: if (xp) vput(ITOV(xp)); vput(ITOV(dp)); out: if (doingdirectory) ip->i_flag &= ~IN_RENAME; if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { ext2_dec_nlink(ip); ip->i_flag |= IN_CHANGE; ip->i_flag &= ~IN_RENAME; vput(fvp); } else vrele(fvp); return (error); } #ifdef UFS_ACL static int ext2_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, mode_t dmode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *dacl, *acl; acl = acl_alloc(M_WAITOK); dacl = acl_alloc(M_WAITOK); /* * Retrieve default ACL from parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. If the ACL is empty, fall through to * the "not defined or available" case. */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); ip->i_mode = dmode; *dacl = *acl; ext2_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = dmode; error = 0; goto out; default: goto out; } error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); if (error == 0) error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above * was supposed to free acl. */ #ifdef DEBUG printf("ext2_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); #endif /* DEBUG */ break; default: goto out; } out: acl_free(acl); acl_free(dacl); return (error); } static int ext2_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, mode_t mode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *acl; acl = acl_alloc(M_WAITOK); /* * Retrieve default ACL for parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. */ if (acl->acl_cnt != 0) { /* * Two possible ways for default ACL to not * be present. First, the EA can be * undefined, or second, the default ACL can * be blank. If it's blank, fall through to * the it's not defined case. */ mode = acl_posix1e_newfilemode(mode, acl); ip->i_mode = mode; ext2_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = mode; error = 0; goto out; default: goto out; } error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above was * supposed to free acl. */ printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()\n"); /* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); return (error); } #endif /* UFS_ACL */ /* * Mkdir system call */ static int ext2_mkdir(struct vop_mkdir_args *ap) { struct m_ext2fs *fs; struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; struct vnode *tvp; struct dirtemplate dirtemplate, *dtp; char *buf = NULL; int error, dmode; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_mkdir: no name"); #endif dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= EXT4_LINK_MAX && !EXT2_HAS_RO_COMPAT_FEATURE(dp->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ext2_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = ext2_valloc(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); fs = ip->i_e2fs; ip->i_gid = dp->i_gid; #ifdef SUIDDIR { /* * if we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; } else { ip->i_uid = cnp->cn_cred->cr_uid; } } #else ip->i_uid = cnp->cn_cred->cr_uid; #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_nlink = 2; if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; error = ext2_update(tvp, 1); /* * Bump link count in parent directory * to reflect work done below. Should * be done before reference is created * so reparation is possible if we crash. */ ext2_inc_nlink(dp); dp->i_flag |= IN_CHANGE; error = ext2_update(dvp, !DOINGASYNC(dvp)); if (error) goto bad; /* Initialize directory with "." and ".." from static template. */ if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs, EXT2F_INCOMPAT_FTYPE)) dtp = &mastertemplate; else dtp = &omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = htole32(ip->i_number); dirtemplate.dotdot_ino = htole32(dp->i_number); /* * note that in ext2 DIRBLKSIZ == blocksize, not DEV_BSIZE so let's * just redefine it - for this function only */ #undef DIRBLKSIZ #define DIRBLKSIZ VTOI(dvp)->i_e2fs->e2fs_bsize dirtemplate.dotdot_reclen = htole16(DIRBLKSIZ - 12); buf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK | M_ZERO); if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { dirtemplate.dotdot_reclen = htole16(le16toh(dirtemplate.dotdot_reclen) - sizeof(struct ext2fs_direct_tail)); ext2_init_dirent_tail(EXT2_DIRENT_TAIL(buf, DIRBLKSIZ)); } memcpy(buf, &dirtemplate, sizeof(dirtemplate)); ext2_dirent_csum_set(ip, (struct ext2fs_direct_2 *)buf); error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)buf, DIRBLKSIZ, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, cnp->cn_cred, NOCRED, NULL, NULL); if (error) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; goto bad; } if (DIRBLKSIZ > VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) /* XXX should grow with balloc() */ panic("ext2_mkdir: blksize"); else { ip->i_size = DIRBLKSIZ; ip->i_flag |= IN_CHANGE; } #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ext2_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* UFS_ACL */ /* Directory set up, now install its entry in the parent directory. */ error = ext2_direnter(ip, dvp, cnp); if (error) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; } bad: /* * No need to do an explicit VOP_TRUNCATE here, vrele will do this * for us because we set the link count to 0. */ if (error) { ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); } else *ap->a_vpp = tvp; out: free(buf, M_TEMP); return (error); #undef DIRBLKSIZ #define DIRBLKSIZ DEV_BSIZE } /* * Rmdir system call. */ static int ext2_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error; ip = VTOI(vp); dp = VTOI(dvp); /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ if (!ext2_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ error = ext2_dirremove(dvp, cnp); if (error) goto out; ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; cache_purge(dvp); VOP_UNLOCK(dvp); /* * Truncate inode. The only stuff left * in the directory is "." and "..". */ ip->i_nlink = 0; error = ext2_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred, cnp->cn_thread); cache_purge(ITOV(ip)); if (vn_lock(dvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(vp); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } out: return (error); } /* * symlink -- make a symbolic link */ static int ext2_symlink(struct vop_symlink_args *ap) { struct vnode *vp, **vpp = ap->a_vpp; struct inode *ip; int len, error; error = ext2_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); - if (len < vp->v_mount->mnt_maxsymlinklen) { + if (len < VFSTOEXT2(vp->v_mount)->um_e2fs->e2fs_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, (char *)ip->i_shortlink, len); ip->i_size = len; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else error = vn_rdwr(UIO_WRITE, vp, __DECONST(void *, ap->a_target), len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, ap->a_cnp->cn_cred, NOCRED, NULL, NULL); if (error) vput(vp); return (error); } /* * Return target name of a symbolic link */ static int ext2_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); int isize; isize = ip->i_size; - if (isize < vp->v_mount->mnt_maxsymlinklen) { + if (isize < VFSTOEXT2(vp->v_mount)->um_e2fs->e2fs_maxsymlinklen) { uiomove((char *)ip->i_shortlink, isize, ap->a_uio); return (0); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the ext2_bmaparray() operation may not * deadlock on memory. See ext2_bmap() for details. */ static int ext2_strategy(struct vop_strategy_args *ap) { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct bufobj *bo; daddr_t blkno; int error; if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ext2_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { if (VTOI(ap->a_vp)->i_flag & IN_E4EXTENTS) error = ext4_bmapext(vp, bp->b_lblkno, &blkno, NULL, NULL); else error = ext2_bmaparray(vp, bp->b_lblkno, &blkno, NULL, NULL); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); bo = VFSTOEXT2(vp->v_mount)->um_bo; BO_STRATEGY(bo, bp); return (0); } /* * Print out the contents of an inode. */ static int ext2_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); vn_printf(ip->i_devvp, "\tino %ju", (uintmax_t)ip->i_number); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ static int ext2fifo_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; VI_LOCK(vp); if (vp->v_usecount > 1) ext2_itimes_locked(vp); VI_UNLOCK(vp); return (fifo_specops.vop_close(ap)); } /* * Kqfilter wrapper for fifos. * * Fall through to ext2 kqfilter routines if needed */ static int ext2fifo_kqfilter(struct vop_kqfilter_args *ap) { int error; error = fifo_specops.vop_kqfilter(ap); if (error) error = vfs_kqfilter(ap); return (error); } /* * Return POSIX pathconf information applicable to ext2 filesystems. */ static int ext2_pathconf(struct vop_pathconf_args *ap) { int error = 0; switch (ap->a_name) { case _PC_LINK_MAX: if (EXT2_HAS_RO_COMPAT_FEATURE(VTOI(ap->a_vp)->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) *ap->a_retval = INT_MAX; else *ap->a_retval = EXT4_LINK_MAX; break; case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; break; case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) *ap->a_retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; break; case _PC_NO_TRUNC: *ap->a_retval = 1; break; #ifdef UFS_ACL case _PC_ACL_EXTENDED: if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; break; case _PC_ACL_PATH_MAX: if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; break; #endif /* UFS_ACL */ case _PC_MIN_HOLE_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_PRIO_IO: *ap->a_retval = 0; break; case _PC_SYNC_IO: *ap->a_retval = 0; break; case _PC_ALLOC_SIZE_MIN: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize; break; case _PC_FILESIZEBITS: *ap->a_retval = 64; break; case _PC_REC_INCR_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_MAX_XFER_SIZE: *ap->a_retval = -1; /* means ``unlimited'' */ break; case _PC_REC_MIN_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_XFER_ALIGN: *ap->a_retval = PAGE_SIZE; break; case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; break; default: error = vop_stdpathconf(ap); break; } return (error); } /* * Vnode operation to remove a named attribute. */ static int ext2_deleteextattr(struct vop_deleteextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) return (error); error = ENOATTR; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_delete(ip, ap->a_attrnamespace, ap->a_name); if (error != ENOATTR) return (error); } if (ip->i_facl) error = ext2_extattr_block_delete(ip, ap->a_attrnamespace, ap->a_name); return (error); } /* * Vnode operation to retrieve a named extended attribute. */ static int ext2_getextattr(struct vop_getextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); if (ap->a_size != NULL) *ap->a_size = 0; error = ENOATTR; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_get(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size); if (error != ENOATTR) return (error); } if (ip->i_facl) error = ext2_extattr_block_get(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int ext2_listextattr(struct vop_listextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); if (ap->a_size != NULL) *ap->a_size = 0; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_list(ip, ap->a_attrnamespace, ap->a_uio, ap->a_size); if (error) return (error); } if (ip->i_facl) error = ext2_extattr_block_list(ip, ap->a_attrnamespace, ap->a_uio, ap->a_size); return (error); } /* * Vnode operation to set a named attribute. */ static int ext2_setextattr(struct vop_setextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) return (error); error = ext2_extattr_valid_attrname(ap->a_attrnamespace, ap->a_name); if (error) return (error); if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_set(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio); if (error != ENOSPC) return (error); } error = ext2_extattr_block_set(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio); return (error); } /* * Vnode pointer to File handle */ /* ARGSUSED */ static int ext2_vptofh(struct vop_vptofh_args *ap) { struct inode *ip; struct ufid *ufhp; ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ext2_vinit(struct mount *mntp, struct vop_vector *fifoops, struct vnode **vpp) { struct inode *ip; struct vnode *vp; vp = *vpp; ip = VTOI(vp); vp->v_type = IFTOVT(ip->i_mode); /* * Only unallocated inodes should be of type VNON. */ if (ip->i_mode != 0 && vp->v_type == VNON) return (EINVAL); if (vp->v_type == VFIFO) vp->v_op = fifoops; if (ip->i_number == EXT2_ROOTINO) vp->v_vflag |= VV_ROOT; ip->i_modrev = init_va_filerev(); *vpp = vp; return (0); } /* * Allocate a new inode. */ static int ext2_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct inode *ip, *pdir; struct vnode *tvp; int error; pdir = VTOI(dvp); #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_makeinode: no name"); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; error = ext2_valloc(dvp, mode, cnp->cn_cred, &tvp); if (error) { return (error); } ip = VTOI(tvp); ip->i_gid = pdir->i_gid; #ifdef SUIDDIR { /* * if we are * not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * Note that this drops off the execute bits for security. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; mode &= ~07111; } else { ip->i_uid = cnp->cn_cred->cr_uid; } } #else ip->i_uid = cnp->cn_cred->cr_uid; #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_nlink = 1; if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred)) { if (priv_check_cred(cnp->cn_cred, PRIV_VFS_RETAINSUGID)) ip->i_mode &= ~ISGID; } if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; /* * Make sure inode goes to disk before directory entry. */ error = ext2_update(tvp, !DOINGASYNC(tvp)); if (error) goto bad; #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ext2_do_posix1e_acl_inheritance_file(dvp, tvp, mode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* UFS_ACL */ error = ext2_direnter(ip, dvp, cnp); if (error) goto bad; *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); return (error); } /* * Vnode op for reading. */ static int ext2_read(struct vop_read_args *ap) { struct vnode *vp; struct inode *ip; struct uio *uio; struct m_ext2fs *fs; struct buf *bp; daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error, orig_resid, seqcount; int ioflag; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_READ) panic("%s: mode", "ext2_read"); if (vp->v_type == VLNK) { - if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) + if ((int)ip->i_size < + VFSTOEXT2(vp->v_mount)->um_e2fs->e2fs_maxsymlinklen) panic("%s: short symlink", "ext2_read"); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("%s: type %d", "ext2_read", vp->v_type); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ext2_read: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ext2_read: uio->uio_offset < 0")); fs = ip->i_e2fs; if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->e2fs_maxfilesize) return (EOVERFLOW); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = blksize(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->e2fs_fsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= ip->i_size) error = bread(vp, lbn, size, NOCRED, &bp); else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, blkoffset + uio->uio_resid, seqcount, 0, &bp); } else if (seqcount > 1) { u_int nextsize = blksize(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); if (error) { brelse(bp); bp = NULL; break; } /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; vfs_bio_brelse(bp, ioflag); } /* * This can only happen in the case of an error because the loop * above resets bp to NULL on each iteration and on normal * completion has not set a new value into it. so it must have come * from a 'break' statement */ if (bp != NULL) vfs_bio_brelse(bp, ioflag); if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) ip->i_flag |= IN_ACCESS; return (error); } static int ext2_ioctl(struct vop_ioctl_args *ap) { struct vnode *vp; int error; vp = ap->a_vp; switch (ap->a_command) { case FIOSEEKDATA: if (!(VTOI(vp)->i_flag & IN_E4EXTENTS)) { error = vn_lock(vp, LK_SHARED); if (error == 0) { error = ext2_bmap_seekdata(vp, (off_t *)ap->a_data); VOP_UNLOCK(vp); } else error = EBADF; return (error); } case FIOSEEKHOLE: return (vn_bmap_seekhole(vp, ap->a_command, (off_t *)ap->a_data, ap->a_cred)); default: return (ENOTTY); } } /* * Vnode op for writing. */ static int ext2_write(struct vop_write_args *ap) { struct vnode *vp; struct uio *uio; struct inode *ip; struct m_ext2fs *fs; struct buf *bp; daddr_t lbn; off_t osize; int blkoffset, error, flags, ioflag, resid, size, seqcount, xfersize; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; seqcount = ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE) panic("%s: mode", "ext2_write"); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: /* XXX differs from ffs -- this is called from ext2_mkdir(). */ if ((ioflag & IO_SYNC) == 0) panic("ext2_write: nonsync dir write"); break; default: panic("ext2_write: type %p %d (%jd,%jd)", (void *)vp, vp->v_type, (intmax_t)uio->uio_offset, (intmax_t)uio->uio_resid); } KASSERT(uio->uio_resid >= 0, ("ext2_write: uio->uio_resid < 0")); KASSERT(uio->uio_offset >= 0, ("ext2_write: uio->uio_offset < 0")); fs = ip->i_e2fs; if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->e2fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); resid = uio->uio_resid; osize = ip->i_size; if (seqcount > BA_SEQMAX) flags = BA_SEQMAX << BA_SEQSHIFT; else flags = seqcount << BA_SEQSHIFT; if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->e2fs_fsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->e2fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; error = ext2_balloc(ip, lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); if (error != 0) break; if ((ioflag & (IO_SYNC | IO_INVAL)) == (IO_SYNC | IO_INVAL)) bp->b_flags |= B_NOCACHE; if (uio->uio_offset + xfersize > ip->i_size) ip->i_size = uio->uio_offset + xfersize; size = blksize(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland mmap. * * Note that we need only clear buffers with a transfer size * equal to the block size because buffers with a shorter * transfer size were cleared above by the call to ext2_balloc() * with the BA_CLRBUF flag set. * * If the source region for uiomove identically mmaps the * buffer, uiomove() performed the NOP copy, and the buffer * content remains valid because the page fault handler * validated the pages. */ if (error != 0 && (bp->b_flags & B_CACHE) == 0 && fs->e2fs_bsize == xfersize) vfs_bio_clrbuf(bp); vfs_bio_set_flags(bp, ioflag); /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else if (xfersize + blkoffset == fs->e2fs_fsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; cluster_write(vp, bp, ip->i_size, seqcount, 0); } else { bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } if (error || xfersize == 0) break; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ap->a_cred) { if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) ip->i_mode &= ~(ISUID | ISGID); } if (error) { if (ioflag & IO_UNIT) { (void)ext2_truncate(vp, osize, ioflag & IO_SYNC, ap->a_cred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } if (uio->uio_resid != resid) { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (ioflag & IO_SYNC) error = ext2_update(vp, 1); } return (error); } diff --git a/sys/fs/ext2fs/ext2fs.h b/sys/fs/ext2fs/ext2fs.h index 81ff6838f16f..f43ebcfebb92 100644 --- a/sys/fs/ext2fs/ext2fs.h +++ b/sys/fs/ext2fs/ext2fs.h @@ -1,432 +1,433 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science * * $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009 Aditya Sarawgi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * */ #ifndef _FS_EXT2FS_EXT2FS_H_ #define _FS_EXT2FS_EXT2FS_H_ #include /* * Super block for an ext2fs file system. */ struct ext2fs { uint32_t e2fs_icount; /* Inode count */ uint32_t e2fs_bcount; /* blocks count */ uint32_t e2fs_rbcount; /* reserved blocks count */ uint32_t e2fs_fbcount; /* free blocks count */ uint32_t e2fs_ficount; /* free inodes count */ uint32_t e2fs_first_dblock; /* first data block */ uint32_t e2fs_log_bsize; /* block size = 1024*(2^e2fs_log_bsize) */ uint32_t e2fs_log_fsize; /* fragment size */ uint32_t e2fs_bpg; /* blocks per group */ uint32_t e2fs_fpg; /* frags per group */ uint32_t e2fs_ipg; /* inodes per group */ uint32_t e2fs_mtime; /* mount time */ uint32_t e2fs_wtime; /* write time */ uint16_t e2fs_mnt_count; /* mount count */ uint16_t e2fs_max_mnt_count; /* max mount count */ uint16_t e2fs_magic; /* magic number */ uint16_t e2fs_state; /* file system state */ uint16_t e2fs_beh; /* behavior on errors */ uint16_t e2fs_minrev; /* minor revision level */ uint32_t e2fs_lastfsck; /* time of last fsck */ uint32_t e2fs_fsckintv; /* max time between fscks */ uint32_t e2fs_creator; /* creator OS */ uint32_t e2fs_rev; /* revision level */ uint16_t e2fs_ruid; /* default uid for reserved blocks */ uint16_t e2fs_rgid; /* default gid for reserved blocks */ /* EXT2_DYNAMIC_REV superblocks */ uint32_t e2fs_first_ino; /* first non-reserved inode */ uint16_t e2fs_inode_size; /* size of inode structure */ uint16_t e2fs_block_group_nr; /* block grp number of this sblk*/ uint32_t e2fs_features_compat; /* compatible feature set */ uint32_t e2fs_features_incompat; /* incompatible feature set */ uint32_t e2fs_features_rocompat; /* RO-compatible feature set */ uint8_t e2fs_uuid[16]; /* 128-bit uuid for volume */ char e2fs_vname[16]; /* volume name */ char e2fs_fsmnt[64]; /* name mounted on */ uint32_t e2fs_algo; /* For compression */ uint8_t e2fs_prealloc; /* # of blocks for old prealloc */ uint8_t e2fs_dir_prealloc; /* # of blocks for old prealloc dirs */ uint16_t e2fs_reserved_ngdb; /* # of reserved gd blocks for resize */ char e3fs_journal_uuid[16]; /* uuid of journal superblock */ uint32_t e3fs_journal_inum; /* inode number of journal file */ uint32_t e3fs_journal_dev; /* device number of journal file */ uint32_t e3fs_last_orphan; /* start of list of inodes to delete */ uint32_t e3fs_hash_seed[4]; /* HTREE hash seed */ char e3fs_def_hash_version;/* Default hash version to use */ char e3fs_jnl_backup_type; uint16_t e3fs_desc_size; /* size of group descriptor */ uint32_t e3fs_default_mount_opts; uint32_t e3fs_first_meta_bg; /* First metablock block group */ uint32_t e3fs_mkfs_time; /* when the fs was created */ uint32_t e3fs_jnl_blks[17]; /* backup of the journal inode */ uint32_t e4fs_bcount_hi; /* high bits of blocks count */ uint32_t e4fs_rbcount_hi; /* high bits of reserved blocks count */ uint32_t e4fs_fbcount_hi; /* high bits of free blocks count */ uint16_t e4fs_min_extra_isize; /* all inodes have some bytes */ uint16_t e4fs_want_extra_isize;/* inodes must reserve some bytes */ uint32_t e4fs_flags; /* miscellaneous flags */ uint16_t e4fs_raid_stride; /* RAID stride */ uint16_t e4fs_mmpintv; /* seconds to wait in MMP checking */ uint64_t e4fs_mmpblk; /* block for multi-mount protection */ uint32_t e4fs_raid_stripe_wid; /* blocks on data disks (N * stride) */ uint8_t e4fs_log_gpf; /* FLEX_BG group size */ uint8_t e4fs_chksum_type; /* metadata checksum algorithm used */ uint8_t e4fs_encrypt; /* versioning level for encryption */ uint8_t e4fs_reserved_pad; uint64_t e4fs_kbytes_written; /* number of lifetime kilobytes */ uint32_t e4fs_snapinum; /* inode number of active snapshot */ uint32_t e4fs_snapid; /* sequential ID of active snapshot */ uint64_t e4fs_snaprbcount; /* reserved blocks for active snapshot */ uint32_t e4fs_snaplist; /* inode number for on-disk snapshot */ uint32_t e4fs_errcount; /* number of file system errors */ uint32_t e4fs_first_errtime; /* first time an error happened */ uint32_t e4fs_first_errino; /* inode involved in first error */ uint64_t e4fs_first_errblk; /* block involved of first error */ uint8_t e4fs_first_errfunc[32];/* function where error happened */ uint32_t e4fs_first_errline; /* line number where error happened */ uint32_t e4fs_last_errtime; /* most recent time of an error */ uint32_t e4fs_last_errino; /* inode involved in last error */ uint32_t e4fs_last_errline; /* line number where error happened */ uint64_t e4fs_last_errblk; /* block involved of last error */ uint8_t e4fs_last_errfunc[32]; /* function where error happened */ uint8_t e4fs_mount_opts[64]; uint32_t e4fs_usrquota_inum; /* inode for tracking user quota */ uint32_t e4fs_grpquota_inum; /* inode for tracking group quota */ uint32_t e4fs_overhead_clusters;/* overhead blocks/clusters */ uint32_t e4fs_backup_bgs[2]; /* groups with sparse_super2 SBs */ uint8_t e4fs_encrypt_algos[4];/* encryption algorithms in use */ uint8_t e4fs_encrypt_pw_salt[16];/* salt used for string2key */ uint32_t e4fs_lpf_ino; /* location of the lost+found inode */ uint32_t e4fs_proj_quota_inum; /* inode for tracking project quota */ uint32_t e4fs_chksum_seed; /* checksum seed */ uint32_t e4fs_reserved[98]; /* padding to the end of the block */ uint32_t e4fs_sbchksum; /* superblock checksum */ }; /* * The path name on which the file system is mounted is maintained * in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in * the super block for this name. */ #define MAXMNTLEN 512 /* * In-Memory Superblock */ struct m_ext2fs { struct ext2fs * e2fs; char e2fs_fsmnt[MAXMNTLEN];/* name mounted on */ char e2fs_ronly; /* mounted read-only flag */ char e2fs_fmod; /* super block modified flag */ uint64_t e2fs_bcount; /* blocks count */ uint64_t e2fs_rbcount; /* reserved blocks count */ uint64_t e2fs_fbcount; /* free blocks count */ uint32_t e2fs_ficount; /* free inodes count */ uint32_t e2fs_bsize; /* Block size */ uint32_t e2fs_bshift; /* calc of logical block no */ uint32_t e2fs_bpg; /* Number of blocks per group */ int64_t e2fs_qbmask; /* = s_blocksize -1 */ uint32_t e2fs_fsbtodb; /* Shift to get disk block */ uint32_t e2fs_ipg; /* Number of inodes per group */ uint32_t e2fs_ipb; /* Number of inodes per block */ uint32_t e2fs_itpg; /* Number of inode table per group */ uint32_t e2fs_fsize; /* Size of fragments per block */ uint32_t e2fs_fpb; /* Number of fragments per block */ uint32_t e2fs_fpg; /* Number of fragments per group */ uint32_t e2fs_gdbcount; /* Number of group descriptors */ uint32_t e2fs_gcount; /* Number of groups */ uint32_t e2fs_isize; /* Size of inode */ uint32_t e2fs_total_dir; /* Total number of directories */ uint8_t *e2fs_contigdirs; /* (u) # of contig. allocated dirs */ char e2fs_wasvalid; /* valid at mount time */ off_t e2fs_maxfilesize; struct ext2_gd *e2fs_gd; /* Group Descriptors */ int32_t e2fs_contigsumsize; /* size of cluster summary array */ int32_t *e2fs_maxcluster; /* max cluster in each cyl group */ struct csum *e2fs_clustersum; /* cluster summary in each cyl group */ int32_t e2fs_uhash; /* 3 if hash should be signed, 0 if not */ uint32_t e2fs_csum_seed; /* sb checksum seed */ + uint64_t e2fs_maxsymlinklen; /* max size of short symlink */ }; /* cluster summary information */ struct csum { int8_t cs_init; /* cluster summary has been initialized */ int32_t *cs_sum; /* cluster summary array */ }; /* * The second extended file system magic number */ #define E2FS_MAGIC 0xEF53 /* * Revision levels */ #define E2FS_REV0 0 /* The good old (original) format */ #define E2FS_REV1 1 /* V2 format w/ dynamic inode sizes */ #define E2FS_REV0_INODE_SIZE 128 /* * Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 /* * compatible/incompatible features */ #define EXT2F_COMPAT_PREALLOC 0x0001 #define EXT2F_COMPAT_IMAGIC_INODES 0x0002 #define EXT2F_COMPAT_HASJOURNAL 0x0004 #define EXT2F_COMPAT_EXT_ATTR 0x0008 #define EXT2F_COMPAT_RESIZE 0x0010 #define EXT2F_COMPAT_DIRHASHINDEX 0x0020 #define EXT2F_COMPAT_LAZY_BG 0x0040 #define EXT2F_COMPAT_EXCLUDE_BITMAP 0x0100 #define EXT2F_COMPAT_SPARSESUPER2 0x0200 #define EXT2F_ROCOMPAT_SPARSESUPER 0x0001 #define EXT2F_ROCOMPAT_LARGEFILE 0x0002 #define EXT2F_ROCOMPAT_BTREE_DIR 0x0004 #define EXT2F_ROCOMPAT_HUGE_FILE 0x0008 #define EXT2F_ROCOMPAT_GDT_CSUM 0x0010 #define EXT2F_ROCOMPAT_DIR_NLINK 0x0020 #define EXT2F_ROCOMPAT_EXTRA_ISIZE 0x0040 #define EXT2F_ROCOMPAT_HAS_SNAPSHOT 0x0080 #define EXT2F_ROCOMPAT_QUOTA 0x0100 #define EXT2F_ROCOMPAT_BIGALLOC 0x0200 #define EXT2F_ROCOMPAT_METADATA_CKSUM 0x0400 #define EXT2F_ROCOMPAT_REPLICA 0x0800 #define EXT2F_ROCOMPAT_READONLY 0x1000 #define EXT2F_ROCOMPAT_PROJECT 0x2000 #define EXT2F_INCOMPAT_COMP 0x0001 #define EXT2F_INCOMPAT_FTYPE 0x0002 #define EXT2F_INCOMPAT_RECOVER 0x0004 #define EXT2F_INCOMPAT_JOURNAL_DEV 0x0008 #define EXT2F_INCOMPAT_META_BG 0x0010 #define EXT2F_INCOMPAT_EXTENTS 0x0040 #define EXT2F_INCOMPAT_64BIT 0x0080 #define EXT2F_INCOMPAT_MMP 0x0100 #define EXT2F_INCOMPAT_FLEX_BG 0x0200 #define EXT2F_INCOMPAT_EA_INODE 0x0400 #define EXT2F_INCOMPAT_DIRDATA 0x1000 #define EXT2F_INCOMPAT_CSUM_SEED 0x2000 #define EXT2F_INCOMPAT_LARGEDIR 0x4000 #define EXT2F_INCOMPAT_INLINE_DATA 0x8000 #define EXT2F_INCOMPAT_ENCRYPT 0x10000 struct ext2_feature { int mask; const char *name; }; static const struct ext2_feature compat[] = { { EXT2F_COMPAT_PREALLOC, "dir_prealloc" }, { EXT2F_COMPAT_IMAGIC_INODES, "imagic_inodes" }, { EXT2F_COMPAT_HASJOURNAL, "has_journal" }, { EXT2F_COMPAT_EXT_ATTR, "ext_attr" }, { EXT2F_COMPAT_RESIZE, "resize_inode" }, { EXT2F_COMPAT_DIRHASHINDEX, "dir_index" }, { EXT2F_COMPAT_EXCLUDE_BITMAP, "snapshot_bitmap" }, { EXT2F_COMPAT_SPARSESUPER2, "sparse_super2" } }; static const struct ext2_feature ro_compat[] = { { EXT2F_ROCOMPAT_SPARSESUPER, "sparse_super" }, { EXT2F_ROCOMPAT_LARGEFILE, "large_file" }, { EXT2F_ROCOMPAT_BTREE_DIR, "btree_dir" }, { EXT2F_ROCOMPAT_HUGE_FILE, "huge_file" }, { EXT2F_ROCOMPAT_GDT_CSUM, "uninit_groups" }, { EXT2F_ROCOMPAT_DIR_NLINK, "dir_nlink" }, { EXT2F_ROCOMPAT_EXTRA_ISIZE, "extra_isize" }, { EXT2F_ROCOMPAT_HAS_SNAPSHOT, "snapshot" }, { EXT2F_ROCOMPAT_QUOTA, "quota" }, { EXT2F_ROCOMPAT_BIGALLOC, "bigalloc" }, { EXT2F_ROCOMPAT_METADATA_CKSUM, "metadata_csum" }, { EXT2F_ROCOMPAT_REPLICA, "replica" }, { EXT2F_ROCOMPAT_READONLY, "ro" }, { EXT2F_ROCOMPAT_PROJECT, "project" } }; static const struct ext2_feature incompat[] = { { EXT2F_INCOMPAT_COMP, "compression" }, { EXT2F_INCOMPAT_FTYPE, "filetype" }, { EXT2F_INCOMPAT_RECOVER, "needs_recovery" }, { EXT2F_INCOMPAT_JOURNAL_DEV, "journal_dev" }, { EXT2F_INCOMPAT_META_BG, "meta_bg" }, { EXT2F_INCOMPAT_EXTENTS, "extents" }, { EXT2F_INCOMPAT_64BIT, "64bit" }, { EXT2F_INCOMPAT_MMP, "mmp" }, { EXT2F_INCOMPAT_FLEX_BG, "flex_bg" }, { EXT2F_INCOMPAT_EA_INODE, "ea_inode" }, { EXT2F_INCOMPAT_DIRDATA, "dirdata" }, { EXT2F_INCOMPAT_CSUM_SEED, "metadata_csum_seed" }, { EXT2F_INCOMPAT_LARGEDIR, "large_dir" }, { EXT2F_INCOMPAT_INLINE_DATA, "inline_data" }, { EXT2F_INCOMPAT_ENCRYPT, "encrypt" } }; /* * Features supported in this implementation * * We support the following REV1 features: * - EXT2F_ROCOMPAT_SPARSESUPER * - EXT2F_ROCOMPAT_LARGEFILE * - EXT2F_ROCOMPAT_EXTRA_ISIZE * - EXT2F_INCOMPAT_FTYPE * * We partially (read-only) support the following EXT4 features: * - EXT2F_ROCOMPAT_HUGE_FILE * - EXT2F_INCOMPAT_EXTENTS * */ #define EXT2F_COMPAT_SUPP EXT2F_COMPAT_DIRHASHINDEX #define EXT2F_ROCOMPAT_SUPP (EXT2F_ROCOMPAT_SPARSESUPER | \ EXT2F_ROCOMPAT_LARGEFILE | \ EXT2F_ROCOMPAT_GDT_CSUM | \ EXT2F_ROCOMPAT_METADATA_CKSUM | \ EXT2F_ROCOMPAT_DIR_NLINK | \ EXT2F_ROCOMPAT_HUGE_FILE | \ EXT2F_ROCOMPAT_EXTRA_ISIZE) #define EXT2F_INCOMPAT_SUPP (EXT2F_INCOMPAT_FTYPE | \ EXT2F_INCOMPAT_META_BG | \ EXT2F_INCOMPAT_EXTENTS | \ EXT2F_INCOMPAT_64BIT | \ EXT2F_INCOMPAT_FLEX_BG | \ EXT2F_INCOMPAT_CSUM_SEED) /* Assume that user mode programs are passing in an ext2fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test * macros from user land. */ #define EXT2_SB(sb) (sb) /* * Feature set definitions */ #define EXT2_HAS_COMPAT_FEATURE(sb,mask) \ ( le32toh(EXT2_SB(sb)->e2fs->e2fs_features_compat) & mask) #define EXT2_HAS_RO_COMPAT_FEATURE(sb,mask) \ ( le32toh(EXT2_SB(sb)->e2fs->e2fs_features_rocompat) & mask) #define EXT2_HAS_INCOMPAT_FEATURE(sb,mask) \ ( le32toh(EXT2_SB(sb)->e2fs->e2fs_features_incompat) & mask) /* * File clean flags */ #define E2FS_ISCLEAN 0x0001 /* Unmounted cleanly */ #define E2FS_ERRORS 0x0002 /* Errors detected */ /* * Filesystem miscellaneous flags */ #define E2FS_SIGNED_HASH 0x0001 #define E2FS_UNSIGNED_HASH 0x0002 #define EXT2_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ #define EXT2_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT2_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ /* ext2 file system block group descriptor */ struct ext2_gd { uint32_t ext2bgd_b_bitmap; /* blocks bitmap block */ uint32_t ext2bgd_i_bitmap; /* inodes bitmap block */ uint32_t ext2bgd_i_tables; /* inodes table block */ uint16_t ext2bgd_nbfree; /* number of free blocks */ uint16_t ext2bgd_nifree; /* number of free inodes */ uint16_t ext2bgd_ndirs; /* number of directories */ uint16_t ext4bgd_flags; /* block group flags */ uint32_t ext4bgd_x_bitmap; /* snapshot exclusion bitmap loc. */ uint16_t ext4bgd_b_bmap_csum; /* block bitmap checksum */ uint16_t ext4bgd_i_bmap_csum; /* inode bitmap checksum */ uint16_t ext4bgd_i_unused; /* unused inode count */ uint16_t ext4bgd_csum; /* group descriptor checksum */ uint32_t ext4bgd_b_bitmap_hi; /* high bits of blocks bitmap block */ uint32_t ext4bgd_i_bitmap_hi; /* high bits of inodes bitmap block */ uint32_t ext4bgd_i_tables_hi; /* high bits of inodes table block */ uint16_t ext4bgd_nbfree_hi; /* high bits of number of free blocks */ uint16_t ext4bgd_nifree_hi; /* high bits of number of free inodes */ uint16_t ext4bgd_ndirs_hi; /* high bits of number of directories */ uint16_t ext4bgd_i_unused_hi; /* high bits of unused inode count */ uint32_t ext4bgd_x_bitmap_hi; /* high bits of snapshot exclusion */ uint16_t ext4bgd_b_bmap_csum_hi;/* high bits of block bitmap checksum */ uint16_t ext4bgd_i_bmap_csum_hi;/* high bits of inode bitmap checksum */ uint32_t ext4bgd_reserved; }; #define E2FS_REV0_GD_SIZE (sizeof(struct ext2_gd) / 2) #define E2FS_64BIT_GD_SIZE (sizeof(struct ext2_gd)) /* * Macro-instructions used to manage several block sizes */ #define EXT2_MIN_BLOCK_LOG_SIZE 10 #define EXT2_BLOCK_SIZE(s) ((s)->e2fs_bsize) #define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof(uint32_t)) #define EXT2_INODE_SIZE(s) (EXT2_SB(s)->e2fs_isize) /* * Macro-instructions used to manage fragments */ #define EXT2_MIN_FRAG_SIZE 1024 #define EXT2_MIN_FRAG_LOG_SIZE 10 #define EXT2_MAX_FRAG_LOG_SIZE 30 #define EXT2_FRAG_SIZE(s) (EXT2_SB(s)->e2fs_fsize) #define EXT2_FRAGS_PER_BLOCK(s) (EXT2_SB(s)->e2fs_fpb) /* * Macro-instructions used to manage group descriptors */ #define EXT2_BLOCKS_PER_GROUP(s) (EXT2_SB(s)->e2fs_bpg) #define EXT2_DESCS_PER_BLOCK(s) (EXT2_HAS_INCOMPAT_FEATURE((s), \ EXT2F_INCOMPAT_64BIT) ? ((s)->e2fs_bsize / sizeof(struct ext2_gd)) : \ ((s)->e2fs_bsize / E2FS_REV0_GD_SIZE)) /* * Macro-instructions used to manage inodes */ #define EXT2_FIRST_INO(s) (le32toh((EXT2_SB(s)->e2fs->e2fs_rev) == \ E2FS_REV0) ? EXT2_FIRSTINO : le32toh(EXT2_SB(s)->e2fs->e2fs_first_ino)) #endif /* !_FS_EXT2FS_EXT2FS_H_ */ diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index ba02d0600a6e..5de58f979e97 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,6971 +1,6969 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_init_counters(struct vnode *); static void vn_seqc_init(struct vnode *); static void vn_seqc_write_end_free(struct vnode *vp); static void vgonel(struct vnode *); static bool vhold_recycle_free(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_lock(void *arg, int what); static void destroy_vpollinfo(struct vpollinfo *vi); static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn); static void vnlru_recalc(void); /* * These fences are intended for cases where some synchronization is * needed between access of v_iflags and lockless vnode refcount (v_holdcnt * and v_usecount) updates. Access to v_iflags is generally synchronized * by the interlock, but we have some internal assertions that check vnode * flags without acquiring the lock. Thus, these fences are INVARIANTS-only * for now. */ #ifdef INVARIANTS #define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq() #define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel() #else #define VNODE_REFCOUNT_FENCE_ACQ() #define VNODE_REFCOUNT_FENCE_REL() #endif /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. */ static u_long __exclusive_cache_line numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); static counter_u64_t vnodes_created; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, "Number of vnodes created by getnewvnode"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of allocates vnodes in the system. */ static TAILQ_HEAD(freelst, vnode) vnode_list; static struct vnode *vnode_list_free_marker; static struct vnode *vnode_list_reclaim_marker; /* * "Free" vnode target. Free vnodes are rarely completely free, but are * just ones that are cheap to recycle. Usually they are for files which * have been stat'd but not read; these usually have inode and namecache * data attached to them. This target is the preferred minimum size of a * sub-cache consisting mostly of such files. The system balances the size * of this sub-cache with its complement to try to prevent either from * thrashing while the other is relatively inactive. The targets express * a preference for the best balance. * * "Above" this target there are 2 further targets (watermarks) related * to recyling of free vnodes. In the best-operating case, the cache is * exactly full, the free list has size between vlowat and vhiwat above the * free target, and recycling from it and normal use maintains this state. * Sometimes the free list is below vlowat or even empty, but this state * is even better for immediate use provided the cache is not full. * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free * ones) to reach one of these states. The watermarks are currently hard- * coded as 4% and 9% of the available space higher. These and the default * of 25% for wantfreevnodes are too large if the memory size is large. * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim * whenever vnlru_proc() becomes active. */ static long wantfreevnodes; static long __exclusive_cache_line freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of \"free\" vnodes"); static long freevnodes_old; static counter_u64_t recycles_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, "Number of vnodes recycled to meet vnode cache targets"); static counter_u64_t recycles_free_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, "Number of free vnodes recycled to meet vnode cache targets"); static counter_u64_t deferred_inact; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, "Number of times inactive processing was deferred"); /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_list * numvnodes * freevnodes */ static struct mtx __exclusive_cache_line vnode_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; static smr_t buf_trie_smr; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); __read_frequently smr_t vfs_smr; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); #define VDBATCH_SIZE 8 struct vdbatch { u_int index; long freevnodes; struct mtx lock; struct vnode *tab[VDBATCH_SIZE]; }; DPCPU_DEFINE_STATIC(struct vdbatch, vd); static void vdbatch_dequeue(struct vnode *vp); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* Target for maximum number of vnodes. */ u_long desiredvnodes; static u_long gapvnodes; /* gap between wanted and desired */ static u_long vhiwat; /* enough extras after expansion */ static u_long vlowat; /* minimal extras before expansion */ static u_long vstir; /* nonzero to stir non-free vnodes */ static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static u_long vnlru_read_freevnodes(void); /* * Note that no attempt is made to sanitize these parameters. */ static int sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = desiredvnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == desiredvnodes) return (0); mtx_lock(&vnode_list_mtx); desiredvnodes = val; wantfreevnodes = desiredvnodes / 4; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); /* * XXX There is no protection against multiple threads changing * desiredvnodes at the same time. Locking above only helps vnlru and * getnewvnode. */ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); return (0); } SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, "LU", "Target for maximum number of vnodes"); static int sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = wantfreevnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == wantfreevnodes) return (0); mtx_lock(&vnode_list_mtx); wantfreevnodes = val; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); return (0); } SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, "LU", "Target for minimum number of \"free\" vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); static int sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct vnode *vp; struct nameidata nd; char *buf; unsigned long ndflags; int error; if (req->newptr == NULL) return (EINVAL); if (req->newlen >= PATH_MAX) return (E2BIG); buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) goto out; buf[req->newlen] = '\0'; ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME; NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; if (VN_IS_DOOMED(vp)) { /* * This vnode is being recycled. Return != 0 to let the caller * know that the sysctl had no effect. Return EAGAIN because a * subsequent call will likely succeed (since namei will create * a new vnode if necessary) */ error = EAGAIN; goto putvnode; } counter_u64_add(recycles_count, 1); vgone(vp); putvnode: NDFREE(&nd, 0); out: free(buf, M_TEMP); return (error); } static int sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct vnode *vp; struct file *fp; int error; int fd; if (req->newptr == NULL) return (EBADF); error = sysctl_handle_int(oidp, &fd, 0, req); if (error != 0) return (error); error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) goto drop; counter_u64_add(recycles_count, 1); vgone(vp); VOP_UNLOCK(vp); drop: fdrop(fp, td); return (error); } SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_ftry_reclaim_vnode, "I", "Try to reclaim a vnode by its file descriptor"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ static int vnsz2log; /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree_smr(buf_trie_zone, node); } PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, buf_trie_smr); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of the memory size in KB to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ #endif static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static struct vnode * vn_alloc_marker(struct mount *mp) { struct vnode *vp; vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); vp->v_type = VMARKER; vp->v_mount = mp; return (vp); } static void vn_free_marker(struct vnode *vp) { MPASS(vp->v_type == VMARKER); free(vp, M_VNODE_MARKER); } /* * Initialize a vnode as it first enters the zone. */ static int vnode_init(void *mem, int size, int flags) { struct vnode *vp; vp = mem; bzero(vp, size); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems opt-in. */ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bufobj_init(&vp->v_bufobj, vp); /* * Initialize namecache. */ cache_vnode_init(vp); /* * Initialize rangelocks. */ rangelock_init(&vp->v_rl); vp->v_dbatchcpu = NOCPU; /* * Check vhold_recycle_free for an explanation. */ vp->v_holdcnt = VHOLD_NO_SMR; vp->v_type = VNON; mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (0); } /* * Free a vnode when it is cleared from the zone. */ static void vnode_fini(void *mem, int size) { struct vnode *vp; struct bufobj *bo; vp = mem; vdbatch_dequeue(vp); mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); bo = &vp->v_bufobj; rw_destroy(BO_LOCKPTR(bo)); } /* * Provide the size of NFS nclnode and NFS fh for calculation of the * vnode memory consumption. The size is specified directly to * eliminate dependency on NFS-private header. * * Other filesystems may use bigger or smaller (like UFS and ZFS) * private inode data, but the NFS-based estimation is ample enough. * Still, we care about differences in the size between 64- and 32-bit * platforms. * * Namecache structure size is heuristically * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. */ #ifdef _LP64 #define NFS_NCLNODE_SZ (528 + 64) #define NC_SZ 148 #else #define NFS_NCLNODE_SZ (360 + 32) #define NC_SZ 92 #endif static void vntblinit(void *dummy __unused) { struct vdbatch *vd; int cpu, physvnodes, virtvnodes; u_int i; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to the physical * memory size is 1:16 until desiredvnodes exceeds 98,304. * Thereafter, the * marginal ratio of desiredvnodes to the physical memory size is * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * must not exceed 1/10th of the kernel's heap size. */ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %lu -> %lu\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_list); mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); /* * The lock is taken to appease WITNESS. */ mtx_lock(&vnode_list_mtx); vnlru_recalc(); mtx_unlock(&vnode_list_mtx); vnode_list_free_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); vnode_list_reclaim_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); uma_zone_set_smr(vnode_zone, vfs_smr); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_SMR); buf_trie_smr = uma_zone_get_smr(buf_trie_zone); uma_prealloc(buf_trie_zone, nbuf); vnodes_created = counter_u64_alloc(M_WAITOK); recycles_count = counter_u64_alloc(M_WAITOK); recycles_free_count = counter_u64_alloc(M_WAITOK); deferred_inact = counter_u64_alloc(M_WAITOK); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); for (i = 1; i <= sizeof(struct vnode); i <<= 1) vnsz2log++; vnsz2log--; CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); bzero(vd, sizeof(*vd)); mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * VOP_LOOKUP() obtains B while A is held * vfs_busy() obtains a shared lock on F while A and B are held * vput() releases lock on B * vput() releases lock on A * VFS_ROOT() obtains lock on D while shared lock on F is held * vfs_unbusy() releases shared lock on F * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. */ int vfs_busy(struct mount *mp, int flags) { struct mount_pcpu *mpcpu; MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); vfs_mp_count_add_pcpu(mpcpu, ref, 1); vfs_mp_count_add_pcpu(mpcpu, lockref, 1); vfs_op_thread_exit(mp, mpcpu); if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); return (0); } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REF(mp); /* * If mount point is currently being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that mount mount it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { struct mount_pcpu *mpcpu; int c; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); vfs_mp_count_sub_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_lockref; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if (jailed(td->td_ucred)) { /* * If the jail of the calling thread lacks permission for * this type of file system, deny immediately. */ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) return (EPERM); /* * If the file system was mounted outside the jail of the * calling thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Try to reduce the total number of vnodes. * * This routine (and its user) are buggy in at least the following ways: * - all parameters were picked years ago when RAM sizes were significantly * smaller * - it can pick vnodes based on pages used by the vm object, but filesystems * like ZFS don't use it making the pick broken * - since ZFS has its own aging policy it gets partially combated by this one * - a dedicated method should be provided for filesystems to let them decide * whether the vnode should be recycled * * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desirable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. * * @param reclaim_nc_src Only reclaim directories with outgoing namecache * entries if this argument is strue * @param trigger Only reclaim vnodes with fewer than this many resident * pages. * @param target How many vnodes to reclaim. * @return The number of vnodes that were reclaimed. */ static int vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) { struct vnode *vp, *mvp; struct mount *mp; struct vm_object *object; u_long done; bool retried; mtx_assert(&vnode_list_mtx, MA_OWNED); retried = false; done = 0; mvp = vnode_list_reclaim_marker; restart: vp = mvp; while (done < target) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) break; if (__predict_false(vp->v_type == VMARKER)) continue; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. * Also skip free vnodes. We are trying to make space * to expand the free list, not reduce it. */ if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) goto next_iter; if (vp->v_type == VBAD || vp->v_type == VNON) goto next_iter; object = atomic_load_ptr(&vp->v_object); if (object == NULL || object->resident_page_count > trigger) { goto next_iter; } /* * Handle races against vnode allocation. Filesystems lock the * vnode some time after it gets returned from getnewvnode, * despite type and hold count being manipulated earlier. * Resorting to checking v_mount restores guarantees present * before the global list was reworked to contain all vnodes. */ if (!VI_TRYLOCK(vp)) goto next_iter; if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { VI_UNLOCK(vp); goto next_iter; } if (vp->v_mount == NULL) { VI_UNLOCK(vp); goto next_iter; } vholdl(vp); VI_UNLOCK(vp); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); goto next_iter_unlocked; } if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { vdrop(vp); vn_finished_write(mp); goto next_iter_unlocked; } VI_LOCK(vp); if (vp->v_usecount > 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp); vdropl(vp); vn_finished_write(mp); goto next_iter_unlocked; } counter_u64_add(recycles_count, 1); vgonel(vp); VOP_UNLOCK(vp); vdropl(vp); vn_finished_write(mp); done++; next_iter_unlocked: if (should_yield()) kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; next_iter: MPASS(vp->v_type != VMARKER); if (!should_yield()) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; } if (done == 0 && !retried) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); retried = true; goto restart; } return (done); } static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 0, "limit on vnode free requests per call to the vnlru_free routine"); /* * Attempt to reduce the free list by the requested amount. */ static int vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp) { struct vnode *vp; struct mount *mp; int ocount; mtx_assert(&vnode_list_mtx, MA_OWNED); if (count > max_vnlru_free) count = max_vnlru_free; ocount = count; vp = mvp; for (;;) { if (count == 0) { break; } vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); break; } if (__predict_false(vp->v_type == VMARKER)) continue; if (vp->v_holdcnt > 0) continue; /* * Don't recycle if our vnode is from different type * of mount point. Note that mp is type-safe, the * check does not reach unmapped address even if * vnode is reclaimed. */ if (mnt_op != NULL && (mp = vp->v_mount) != NULL && mp->mnt_op != mnt_op) { continue; } if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { continue; } if (!vhold_recycle_free(vp)) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); if (vtryrecycle(vp) == 0) count--; mtx_lock(&vnode_list_mtx); vp = mvp; } return (ocount - count); } static int vnlru_free_locked(int count) { mtx_assert(&vnode_list_mtx, MA_OWNED); return (vnlru_free_impl(count, NULL, vnode_list_free_marker)); } void vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) { MPASS(mnt_op != NULL); MPASS(mvp != NULL); VNPASS(mvp->v_type == VMARKER, mvp); mtx_lock(&vnode_list_mtx); vnlru_free_impl(count, mnt_op, mvp); mtx_unlock(&vnode_list_mtx); } /* * Temporary binary compat, don't use. Call vnlru_free_vfsops instead. */ void vnlru_free(int count, struct vfsops *mnt_op) { struct vnode *mvp; if (count == 0) return; mtx_lock(&vnode_list_mtx); mvp = vnode_list_free_marker; if (vnlru_free_impl(count, mnt_op, mvp) == 0) { /* * It is possible the marker was moved over eligible vnodes by * callers which filtered by different ops. If so, start from * scratch. */ if (vnlru_read_freevnodes() > 0) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); } vnlru_free_impl(count, mnt_op, mvp); } mtx_unlock(&vnode_list_mtx); } struct vnode * vnlru_alloc_marker(void) { struct vnode *mvp; mvp = vn_alloc_marker(NULL); mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (mvp); } void vnlru_free_marker(struct vnode *mvp) { mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); vn_free_marker(mvp); } static void vnlru_recalc(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ vlowat = vhiwat / 2; } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; /* * The main freevnodes counter is only updated when threads requeue their vnode * batches. CPUs are conditionally walked to compute a more accurate total. * * Limit how much of a slop are we willing to tolerate. Note: the actual value * at any given moment can still exceed slop, but it should not be by significant * margin in practice. */ #define VNLRU_FREEVNODES_SLOP 128 static __inline void vfs_freevnodes_inc(void) { struct vdbatch *vd; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes++; critical_exit(); } static __inline void vfs_freevnodes_dec(void) { struct vdbatch *vd; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes--; critical_exit(); } static u_long vnlru_read_freevnodes(void) { struct vdbatch *vd; long slop; int cpu; mtx_assert(&vnode_list_mtx, MA_OWNED); if (freevnodes > freevnodes_old) slop = freevnodes - freevnodes_old; else slop = freevnodes_old - freevnodes; if (slop < VNLRU_FREEVNODES_SLOP) return (freevnodes >= 0 ? freevnodes : 0); freevnodes_old = freevnodes; CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); freevnodes_old += vd->freevnodes; } return (freevnodes_old >= 0 ? freevnodes_old : 0); } static bool vnlru_under(u_long rnumvnodes, u_long limit) { u_long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = vnlru_read_freevnodes(); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static bool vnlru_under_unlocked(u_long rnumvnodes, u_long limit) { long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = atomic_load_long(&freevnodes); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static void vnlru_kick(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); if (vnlruproc_sig == 0) { vnlruproc_sig = 1; wakeup(vnlruproc); } } static void vnlru_proc(void) { u_long rnumvnodes, rfreevnodes, target; unsigned long onumvnodes; int done, force, trigger, usevnodes; bool reclaim_nc_src, want_reread; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); force = 0; want_reread = false; for (;;) { kproc_suspend_check(vnlruproc); mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (want_reread) { force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; want_reread = false; } /* * If numvnodes is too large (due to desiredvnodes being * adjusted using its sysctl, or emergency growth), first * try to reduce it by discarding from the free list. */ if (rnumvnodes > desiredvnodes) { vnlru_free_locked(rnumvnodes - desiredvnodes); rnumvnodes = atomic_load_long(&numvnodes); } /* * Sleep if the vnode cache is in a good state. This is * when it is not over-full and has space for about a 4% * or 9% expansion (by growing its size or inexcessively * reducing its free list). Otherwise, try to reclaim * space for a 10% expansion. */ if (vstir && force == 0) { force = 1; vstir = 0; } if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } rfreevnodes = vnlru_read_freevnodes(); onumvnodes = rnumvnodes; /* * Calculate parameters for recycling. These are the same * throughout the loop to give some semblance of fairness. * The trigger point is to avoid recycling vnodes with lots * of resident pages. We aren't trying to free memory; we * are trying to recycle or at least free vnodes. */ if (rnumvnodes <= desiredvnodes) usevnodes = rnumvnodes - rfreevnodes; else usevnodes = rnumvnodes; if (usevnodes <= 0) usevnodes = 1; /* * The trigger value is is chosen to give a conservatively * large value to ensure that it alone doesn't prevent * making progress. The value can easily be so large that * it is effectively infinite in some congested and * misconfigured cases, and this is necessary. Normally * it is about 8 to 100 (pages), which is quite large. */ trigger = vm_cnt.v_page_count * 2 / usevnodes; if (force < 2) trigger = vsmalltrigger; reclaim_nc_src = force >= 3; target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); target = target / 10 + 1; done = vlrureclaim(reclaim_nc_src, trigger, target); mtx_unlock(&vnode_list_mtx); if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) uma_reclaim(UMA_RECLAIM_DRAIN); if (done == 0) { if (force == 0 || force == 1) { force = 2; continue; } if (force == 2) { force = 3; continue; } want_reread = true; force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else { want_reread = true; kern_yield(PRI_USER); } } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); vdrop(vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); vdrop(vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp); vdropl(vp); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if (!VN_IS_DOOMED(vp)) { counter_u64_add(recycles_free_count, 1); vgonel(vp); } VOP_UNLOCK(vp); vdropl(vp); vn_finished_write(vnmp); return (0); } /* * Allocate a new vnode. * * The operation never returns an error. Returning an error was disabled * in r145385 (dated 2005) with the following comment: * * XXX Not all VFS_VGET/ffs_vget callers check returns. * * Given the age of this commit (almost 15 years at the time of writing this * comment) restoring the ability to fail requires a significant audit of * all codepaths. * * The routine can try to free a vnode or stall for up to 1 second waiting for * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. */ static u_long vn_alloc_cyclecount; static struct vnode * __noinline vn_alloc_hard(struct mount *mp) { u_long rnumvnodes, rfreevnodes; mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (rnumvnodes + 1 < desiredvnodes) { vn_alloc_cyclecount = 0; goto alloc; } rfreevnodes = vnlru_read_freevnodes(); if (vn_alloc_cyclecount++ >= rfreevnodes) { vn_alloc_cyclecount = 0; vstir = 1; } /* * Grow the vnode cache if it will not be above its target max * after growing. Otherwise, if the free list is nonempty, try * to reclaim 1 item from it before growing the cache (possibly * above its target max if the reclamation failed or is delayed). * Otherwise, wait for some space. In all cases, schedule * vnlru_proc() if we are getting short of space. The watermarks * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ if (vnlru_free_locked(1) > 0) goto alloc; if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { /* * Wait for space for a new vnode. */ vnlru_kick(); msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && vnlru_read_freevnodes() > 1) vnlru_free_locked(1); } alloc: rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (vnlru_under(rnumvnodes, vlowat)) vnlru_kick(); mtx_unlock(&vnode_list_mtx); return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static struct vnode * vn_alloc(struct mount *mp) { u_long rnumvnodes; if (__predict_false(vn_alloc_cyclecount != 0)) return (vn_alloc_hard(mp)); rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { atomic_subtract_long(&numvnodes, 1); return (vn_alloc_hard(mp)); } return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static void vn_free(struct vnode *vp) { atomic_subtract_long(&numvnodes, 1); uma_zfree_smr(vnode_zone, vp); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct thread *td; struct lock_object *lo; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); KASSERT(vops->registered, ("%s: not registered vector op %p\n", __func__, vops)); td = curthread; if (td->td_vp_reserved != NULL) { vp = td->td_vp_reserved; td->td_vp_reserved = NULL; } else { vp = vn_alloc(mp); } counter_u64_add(vnodes_created, 1); /* * Locks are given the generic name "vnode" when created. * Follow the historic practice of using the filesystem * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. * * Locks live in a witness group keyed on their name. Thus, * when a lock is renamed, it must also move from the witness * group of its old name to the witness group of its new name. * * The change only needs to be made when the vnode moves * from one filesystem type to another. We ensure that each * filesystem use a single static name pointer for its tag so * that we can compare pointers rather than doing a strcmp(). */ lo = &vp->v_vnlock->lock_object; #ifdef WITNESS if (lo->lo_name != tag) { #endif lo->lo_name = tag; #ifdef WITNESS WITNESS_DESTROY(lo); WITNESS_INIT(lo, tag); } #endif /* * By default, don't allow shared locks unless filesystems opt-in. */ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. */ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_op = vops; vp->v_irflag = 0; v_init_counters(vp); vn_seqc_init(vp); vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef DIAGNOSTIC if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode(9), tag %s\n", tag); #endif #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); #endif if (mp != NULL) { vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } void getnewvnode_reserve(void) { struct thread *td; td = curthread; MPASS(td->td_vp_reserved == NULL); td->td_vp_reserved = vn_alloc(NULL); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; if (td->td_vp_reserved != NULL) { vn_free(td->td_vp_reserved); td->td_vp_reserved = NULL; } } static void __noinline freevnode(struct vnode *vp) { struct bufobj *bo; /* * The vnode has been marked for destruction, so free it. * * The vnode will be returned to the zone where it will * normally remain until it is needed for another vnode. We * need to cleanup (or verify that the cleanup has already * been done) any residual data left from its current use * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); /* * Paired with vgone. */ vn_seqc_write_end_free(vp); bo = &vp->v_bufobj; VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, ("Leaked inactivation")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { destroy_vpollinfo(vp->v_pollinfo); vp->v_pollinfo = NULL; } vp->v_mountedhere = NULL; vp->v_unpcb = NULL; vp->v_rdev = NULL; vp->v_fifoinfo = NULL; vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; vp->v_iflag = 0; vp->v_vflag = 0; bo->bo_flag = 0; vn_free(vp); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); VI_LOCK(vp); vp->v_mount = NULL; VI_UNLOCK(vp); VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. */ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); do { error = BO_SYNC(bo, MNT_WAIT); } while (error == ERELOOKUP); if (error != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { BO_UNLOCK(bo); vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); BO_LOCK(bo); } } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: flush dirty failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { /* * If we are flushing both V_NORMAL and V_ALT buffers then * do not skip any buffers. If we are flushing only V_NORMAL * buffers then skip buffers marked as BX_ALTDATA. If we are * flushing only V_ALT buffers then skip buffers not marked * as BX_ALTDATA. */ if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { continue; } if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp == NULL) break; nbp = gbincore(bo, lblkno); if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags) break; /* nbp invalid */ } return (retval); } int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) { struct buf *bp; int error; daddr_t lblkno; ASSERT_BO_LOCKED(bo); for (lblkno = startn;;) { again: bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); if (bp == NULL || bp->b_lblkno >= endn || bp->b_lblkno < startn) break; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); if (error != 0) { BO_RLOCK(bo); if (error == ENOLCK) goto again; return (error); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); lblkno = bp->b_lblkno + 1; if ((bp->b_flags & B_MANAGED) == 0) bremfree(bp); bp->b_flags |= B_RELBUF; /* * In the VMIO case, use the B_NOREUSE flag to hint that the * pages backing each buffer in the range are unlikely to be * reused. Dirty buffers will have the hint applied once * they've been written. */ if ((bp->b_flags & B_VMIO) != 0) bp->b_flags |= B_NOREUSE; brelse(bp); BO_RLOCK(bo); } return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; struct bufobj *bo; daddr_t startlbn; CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, vp, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ startlbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); bo = &vp->v_bufobj; restart_unlocked: BO_LOCK(bo); while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) ; if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart_unlocked; VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * Invalidate the cached pages of a file's buffer within the range of block * numbers [startlbn, endlbn). */ void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize) { struct bufobj *bo; off_t start, end; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); start = blksize * startlbn; end = blksize * endlbn; bo = &vp->v_bufobj; BO_LOCK(bo); MPASS(blksize == bo->bo_bsize); while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) ; BO_UNLOCK(bo); vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); } static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn) { struct buf *bp, *nbp; bool anyfreed; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); ASSERT_BO_LOCKED(bo); do { anyfreed = false; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || nbp->b_vp != vp || (nbp->b_flags & B_DELWRI) != 0)) return (EAGAIN); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) return (EAGAIN); } } while (anyfreed); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; b_xflags_t flags; flags = bp->b_xflags; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), ("%s: buffer %p has invalid queue state", __func__, bp)); if ((flags & BX_VNDIRTY) != 0) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_NOBUFS) == 0, ("buf_vlist_add: bo %p does not allow bufs", bo)); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. */ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); } /* * Look up a buf using the buffer tries, without the bufobj lock. This relies * on SMR for safe lookup, and bufs being in a no-free zone to provide type * stability of the result. Like other lockless lookups, the found buf may * already be invalid by the time this function returns. */ struct buf * gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_UNLOCKED(bo); bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); buf_vlist_remove(bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = bo2vnode(*bo); if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining... "); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Move the buffer between the clean and dirty lists of its vnode. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; KASSERT((bp->b_flags & B_PAGING) == 0, ("%s: cannot reassign paging buffer %p", __func__, bp)); CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); BO_LOCK(bo); buf_vlist_remove(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } static void v_init_counters(struct vnode *vp) { VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, vp, ("%s called for an initialized vnode", __FUNCTION__)); ASSERT_VI_UNLOCKED(vp, __FUNCTION__); refcount_init(&vp->v_holdcnt, 1); refcount_init(&vp->v_usecount, 1); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VIRF_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * usecount is manipulated using atomics without holding any locks. * * holdcnt can be manipulated using atomics without holding any locks, * except when transitioning 1<->0, in which case the interlock is held. * * Consumers which don't guarantee liveness of the vnode can use SMR to * try to get a reference. Note this operation can fail since the vnode * may be awaiting getting freed by the time they get to it. */ enum vgetstate vget_prep_smr(struct vnode *vp) { enum vgetstate vs; VFS_SMR_ASSERT_ENTERED(); if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { if (vhold_smr(vp)) vs = VGET_HOLDCNT; else vs = VGET_NONE; } return (vs); } enum vgetstate vget_prep(struct vnode *vp) { enum vgetstate vs; if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { vhold(vp); vs = VGET_HOLDCNT; } return (vs); } void vget_abort(struct vnode *vp, enum vgetstate vs) { switch (vs) { case VGET_USECOUNT: vrele(vp); break; case VGET_HOLDCNT: vdrop(vp); break; default: __assert_unreachable(); } } int vget(struct vnode *vp, int flags) { enum vgetstate vs; vs = vget_prep(vp); return (vget_finish(vp, flags, vs)); } int vget_finish(struct vnode *vp, int flags, enum vgetstate vs) { int error; if ((flags & LK_INTERLOCK) != 0) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); error = vn_lock(vp, flags); if (__predict_false(error != 0)) { vget_abort(vp, vs); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } vget_finish_ref(vp, vs); return (0); } void vget_finish_ref(struct vnode *vp, enum vgetstate vs) { int old; VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); if (vs == VGET_USECOUNT) return; /* * We hold the vnode. If the usecount is 0 it will be utilized to keep * the vnode around. Otherwise someone else lended their hold count and * we have to drop ours. */ old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); if (old != 0) { #ifdef INVARIANTS old = atomic_fetchadd_int(&vp->v_holdcnt, -1); VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); #else refcount_release(&vp->v_holdcnt); #endif } } void vref(struct vnode *vp) { enum vgetstate vs; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vs = vget_prep(vp); vget_finish_ref(vp, vs); } void vrefact(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); #else refcount_acquire(&vp->v_usecount); #endif } void vlazy(struct vnode *vp) { struct mount *mp; VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); if ((vp->v_mflag & VMP_LAZYLIST) != 0) return; /* * We may get here for inactive routines after the vnode got doomed. */ if (VN_IS_DOOMED(vp)) return; mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); if ((vp->v_mflag & VMP_LAZYLIST) == 0) { vp->v_mflag |= VMP_LAZYLIST; TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize++; } mtx_unlock(&mp->mnt_listmtx); } static void vunlazy(struct vnode *vp) { struct mount *mp; ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * Don't remove the vnode from the lazy list if another thread * has increased the hold count. It may have re-enqueued the * vnode to the lazy list and is now responsible for its * removal. */ if (vp->v_holdcnt == 0) { vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; } mtx_unlock(&mp->mnt_listmtx); } /* * This routine is only meant to be called from vgonel prior to dooming * the vnode. */ static void vunlazy_gone(struct vnode *vp) { struct mount *mp; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } } static void vdefer_inactive(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode without hold count", __func__)); if (VN_IS_DOOMED(vp)) { vdropl(vp); return; } if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vdropl(vp); return; } if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; vdropl(vp); return; } vlazy(vp); vp->v_iflag |= VI_DEFINACT; VI_UNLOCK(vp); counter_u64_add(deferred_inact, 1); } static void vdefer_inactive_unlocked(struct vnode *vp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } vdefer_inactive(vp); } enum vput_op { VRELE, VPUT, VUNREF }; /* * Handle ->v_usecount transitioning to 0. * * By releasing the last usecount we take ownership of the hold count which * provides liveness of the vnode, meaning we have to vdrop. * * For all vnodes we may need to perform inactive processing. It requires an * exclusive lock on the vnode, while it is legal to call here with only a * shared lock (or no locks). If locking the vnode in an expected manner fails, * inactive processing gets deferred to the syncer. * * XXX Some filesystems pass in an exclusively locked vnode and strongly depend * on the lock being held all the way until VOP_INACTIVE. This in particular * happens with UFS which adds half-constructed vnodes to the hash, where they * can be found by other code. */ static void vput_final(struct vnode *vp, enum vput_op func) { int error; bool want_unlock; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNPASS(vp->v_holdcnt > 0, vp); VI_LOCK(vp); /* * By the time we got here someone else might have transitioned * the count back to > 0. */ if (vp->v_usecount > 0) goto out; /* * If the vnode is doomed vgone already performed inactive processing * (if needed). */ if (VN_IS_DOOMED(vp)) goto out; if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) goto out; if (vp->v_iflag & VI_DOINGINACT) goto out; /* * Locking operations here will drop the interlock and possibly the * vnode lock, opening a window where the vnode can get doomed all the * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to * perform inactive. */ vp->v_iflag |= VI_OWEINACT; want_unlock = false; error = 0; switch (func) { case VRELE: switch (VOP_ISLOCKED(vp)) { case LK_EXCLUSIVE: break; case LK_EXCLOTHER: case 0: want_unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; default: /* * The lock has at least one sharer, but we have no way * to conclude whether this is us. Play it safe and * defer processing. */ error = EAGAIN; break; } break; case VPUT: want_unlock = true; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (error == 0) { if (func == VUNREF) { VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, ("recursive vunref")); vp->v_vflag |= VV_UNREF; } for (;;) { error = vinactive(vp); if (want_unlock) VOP_UNLOCK(vp); if (error != ERELOOKUP || !want_unlock) break; VOP_LOCK(vp, LK_EXCLUSIVE); } if (func == VUNREF) vp->v_vflag &= ~VV_UNREF; vdropl(vp); } else { vdefer_inactive(vp); } return; out: if (func == VPUT) VOP_UNLOCK(vp); vdropl(vp); } /* * Decrement ->v_usecount for a vnode. * * Releasing the last use count requires additional processing, see vput_final * above for details. * * Comment above each variant denotes lock state on entry and exit. */ /* * in: any * out: same as passed in */ void vrele(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VRELE); } /* * in: locked * out: unlocked */ void vput(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) { VOP_UNLOCK(vp); return; } vput_final(vp, VPUT); } /* * in: locked * out: locked */ void vunref(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VUNREF); } void vhold(struct vnode *vp) { int old; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); if (old == 0) vfs_freevnodes_dec(); } void vholdnz(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); #else atomic_add_int(&vp->v_holdcnt, 1); #endif } /* * Grab a hold count unless the vnode is freed. * * Only use this routine if vfs smr is the only protection you have against * freeing the vnode. * * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag * is not set. After the flag is set the vnode becomes immutable to anyone but * the thread which managed to set the flag. * * It may be tempting to replace the loop with: * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); * if (count & VHOLD_NO_SMR) { * backpedal and error out; * } * * However, while this is more performant, it hinders debugging by eliminating * the previously mentioned invariant. */ bool vhold_smr(struct vnode *vp) { int count; VFS_SMR_ASSERT_ENTERED(); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { if (count == 0) vfs_freevnodes_dec(); return (true); } } } /* * Hold a free vnode for recycling. * * Note: vnode_init references this comment. * * Attempts to recycle only need the global vnode list lock and have no use for * SMR. * * However, vnodes get inserted into the global list before they get fully * initialized and stay there until UMA decides to free the memory. This in * particular means the target can be found before it becomes usable and after * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to * VHOLD_NO_SMR. * * Note: the vnode may gain more references after we transition the count 0->1. */ static bool vhold_recycle_free(struct vnode *vp) { int count; mtx_assert(&vnode_list_mtx, MA_OWNED); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (count > 0) { return (false); } if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { vfs_freevnodes_dec(); return (true); } } } static void __noinline vdbatch_process(struct vdbatch *vd) { struct vnode *vp; int i; mtx_assert(&vd->lock, MA_OWNED); MPASS(curthread->td_pinned > 0); MPASS(vd->index == VDBATCH_SIZE); mtx_lock(&vnode_list_mtx); critical_enter(); freevnodes += vd->freevnodes; for (i = 0; i < VDBATCH_SIZE; i++) { vp = vd->tab[i]; TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); MPASS(vp->v_dbatchcpu != NOCPU); vp->v_dbatchcpu = NOCPU; } mtx_unlock(&vnode_list_mtx); vd->freevnodes = 0; bzero(vd->tab, sizeof(vd->tab)); vd->index = 0; critical_exit(); } static void vdbatch_enqueue(struct vnode *vp) { struct vdbatch *vd; ASSERT_VI_LOCKED(vp, __func__); VNASSERT(!VN_IS_DOOMED(vp), vp, ("%s: deferring requeue of a doomed vnode", __func__)); if (vp->v_dbatchcpu != NOCPU) { VI_UNLOCK(vp); return; } sched_pin(); vd = DPCPU_PTR(vd); mtx_lock(&vd->lock); MPASS(vd->index < VDBATCH_SIZE); MPASS(vd->tab[vd->index] == NULL); /* * A hack: we depend on being pinned so that we know what to put in * ->v_dbatchcpu. */ vp->v_dbatchcpu = curcpu; vd->tab[vd->index] = vp; vd->index++; VI_UNLOCK(vp); if (vd->index == VDBATCH_SIZE) vdbatch_process(vd); mtx_unlock(&vd->lock); sched_unpin(); } /* * This routine must only be called for vnodes which are about to be * deallocated. Supporting dequeue for arbitrary vndoes would require * validating that the locked batch matches. */ static void vdbatch_dequeue(struct vnode *vp) { struct vdbatch *vd; int i; short cpu; VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, ("%s: called for a used vnode\n", __func__)); cpu = vp->v_dbatchcpu; if (cpu == NOCPU) return; vd = DPCPU_ID_PTR(cpu, vd); mtx_lock(&vd->lock); for (i = 0; i < vd->index; i++) { if (vd->tab[i] != vp) continue; vp->v_dbatchcpu = NOCPU; vd->index--; vd->tab[i] = vd->tab[vd->index]; vd->tab[vd->index] = NULL; break; } mtx_unlock(&vd->lock); /* * Either we dequeued the vnode above or the target CPU beat us to it. */ MPASS(vp->v_dbatchcpu == NOCPU); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VIRF_DOOMED) in which case we will free it. * * Because the vnode vm object keeps a hold reference on the vnode if * there is at least one resident non-cached page, the vnode cannot * leave the active list without the page cleanup done. */ static void __noinline vdropl_final(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(VN_IS_DOOMED(vp), vp); /* * Set the VHOLD_NO_SMR flag. * * We may be racing against vhold_smr. If they win we can just pretend * we never got this far, they will vdrop later. */ if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { vfs_freevnodes_inc(); VI_UNLOCK(vp); /* * We lost the aforementioned race. Any subsequent access is * invalid as they might have managed to vdropl on their own. */ return; } /* * Don't bump freevnodes as this one is going away. */ freevnode(vp); } void vdrop(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (refcount_release_if_not_last(&vp->v_holdcnt)) return; VI_LOCK(vp); vdropl(vp); } void vdropl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (!refcount_release(&vp->v_holdcnt)) { VI_UNLOCK(vp); return; } VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); if (VN_IS_DOOMED(vp)) { vdropl_final(vp); return; } vfs_freevnodes_inc(); if (vp->v_mflag & VMP_LAZYLIST) { vunlazy(vp); } /* * Also unlocks the interlock. We can't assert on it as we * released our hold and by now the vnode might have been * freed. */ vdbatch_enqueue(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. */ static int vinactivef(struct vnode *vp) { struct vm_object *obj; int error; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are converted into the vnode's dirty * buffers, since these will no longer be checked once the * vnode is on the inactive list. * * The write-out of the dirty pages is asynchronous. At the * point that VOP_INACTIVE() is called, there could still be * pending I/O and dirty pages in the object. */ if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_WUNLOCK(obj); } error = VOP_INACTIVE(vp); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; return (error); } int vinactive(struct vnode *vp) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if ((vp->v_iflag & VI_OWEINACT) == 0) return (0); if (vp->v_iflag & VI_DOINGINACT) return (0); if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; return (0); } return (vinactivef(vp)); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } do { error = VOP_FSYNC(vp, MNT_WAIT, td); } while (error == ERELOOKUP); if (error != 0) { VOP_UNLOCK(vp); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount <= 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vn_printf(vp, "vflush: busy vnode "); #endif } VOP_UNLOCK(vp); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; VI_LOCK(vp); recycled = vrecyclel(vp); VI_UNLOCK(vp); return (recycled); } /* * vrecycle, with the vp interlock held. */ int vrecyclel(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } static void notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } /* * Notify upper mounts about reclaimed or unlinked vnode. */ void vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops = { .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; mp = vp->v_mount; if (mp == NULL) return; if (TAILQ_EMPTY(&mp->mnt_uppers)) return; mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); mmp->mnt_op = &vgonel_vfsops; mmp->mnt_kern_flag |= MNTK_MARKER; MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_VGONE_UPPER; for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { ump = TAILQ_NEXT(ump, mnt_upper_link); continue; } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump, vp); break; default: KASSERT(0, ("invalid event %d", event)); break; } MNT_ILOCK(mp); ump = TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); } free(mmp, M_TEMP); mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; wakeup(&mp->mnt_uppers); } MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ static void vgonel(struct vnode *vp) { struct thread *td; struct mount *mp; vm_object_t object; bool active, doinginact, oweinact; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (VN_IS_DOOMED(vp)) return; /* * Paired with freevnode. */ vn_seqc_write_begin_locked(vp); vunlazy_gone(vp); vn_irflag_set_locked(vp, VIRF_DOOMED); /* * Check to see if the vnode is in use. If so, we have to * call VOP_CLOSE() and VOP_INACTIVE(). * * It could be that VOP_INACTIVE() requested reclamation, in * which case we should avoid recursion, so check * VI_DOINGINACT. This is not precise but good enough. */ active = vp->v_usecount > 0; oweinact = (vp->v_iflag & VI_OWEINACT) != 0; doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; /* * If we need to do inactive VI_OWEINACT will be set. */ if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vp->v_iflag &= ~VI_DEFINACT; vdropl(vp); } else { VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); VI_UNLOCK(vp); } cache_purge_vgone(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (!doinginact) { do { if (oweinact || active) { VI_LOCK(vp); vinactivef(vp); oweinact = (vp->v_iflag & VI_OWEINACT) != 0; VI_UNLOCK(vp); } } while (oweinact); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); /* * For VMIO bufobj, BO_DEAD is set later, or in * vm_object_terminate() after the object's page queue is * flushed. */ object = vp->v_bufobj.bo_object; if (object == NULL) vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); /* * Handle the VM part. Tmpfs handles v_object on its own (the * OBJT_VNODE check). Nullfs or other bypassing filesystems * should not touch the object borrowed from the lower vnode * (the handle check). */ if (object != NULL && object->type == OBJT_VNODE && object->handle == vp) vnode_destroy_vobject(vp); /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p", vp)); /* * Clear the advisory locks and wake up waiting threads. */ (void)VOP_ADVLOCKPURGE(vp); vp->v_lockf = NULL; /* * Delete from old mount point vnode list. */ delmntque(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_type = VBAD; } /* * Print out a description of a vnode. */ static const char * const typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, "new hold count flag not added to vn_printf"); void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; u_int holdcnt; short irflag; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("type %s\n", typename[vp->v_type]); holdcnt = atomic_load_int(&vp->v_holdcnt); printf(" usecount %d, writecount %d, refcount %d seqc users %d", vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, vp->v_seqc_users); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); break; case VCHR: printf(" rdev %p\n", vp->v_rdev); break; case VSOCK: printf(" socket %p\n", vp->v_unpcb); break; case VFIFO: printf(" fifoinfo %p\n", vp->v_fifoinfo); break; default: printf("\n"); break; } buf[0] = '\0'; buf[1] = '\0'; if (holdcnt & VHOLD_NO_SMR) strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); printf(" hold count flags (%s)\n", buf + 1); buf[0] = '\0'; buf[1] = '\0'; irflag = vn_irflag_read(vp); if (irflag & VIRF_DOOMED) strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); if (irflag & VIRF_PGREAD) strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); if (irflag & VIRF_MOUNTPOINT) strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_VMSIZEVNLOCK) strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); if (vp->v_vflag & VV_READLINK) strlcat(buf, "|VV_READLINK", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_TEXT_REF) strlcat(buf, "|VI_TEXT_REF", sizeof(buf)); if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); if (vp->v_iflag & VI_DEFINACT) strlcat(buf, "|VI_DEFINACT", sizeof(buf)); if (vp->v_iflag & VI_FOPENING) strlcat(buf, "|VI_FOPENING", sizeof(buf)); flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT | VI_OWEINACT | VI_DEFINACT | VI_FOPENING); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_mflag & VMP_LAZYLIST) strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); flags = vp->v_mflag & ~(VMP_LAZYLIST); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); printf("\n"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_clean.bv_cnt, vp->v_bufobj.bo_dirty.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vn_printf(vp, "vnode "); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. */ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_VGONE_UPPER); MNT_KERN_FLAG(MNTK_VGONE_WAITER); MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); MNT_KERN_FLAG(MNTK_MARKER); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_FPLOOKUP); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); MNT_KERN_FLAG(MNTK_NOKNOTE); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_lazyvnodelistsize = %d\n", mp->mnt_lazyvnodelistsize); db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); - db_printf(" mnt_maxsymlinklen = %jd\n", - (uintmax_t)mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif static void unmount_or_warn(struct mount *mp) { int error; error = dounmount(mp, MNT_FORCE, curthread); if (error != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp, *tmp; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); /* * Since this only runs when rebooting, it is not interlocked. */ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { vfs_ref(mp); /* * Forcibly unmounting "/dev" before "/" would prevent clean * unmount of the latter. */ if (mp == rootdevmp) continue; unmount_or_warn(mp); } if (rootdevmp != NULL) unmount_or_warn(rootdevmp); } static void vfs_deferred_inactive(struct vnode *vp, int lkflags) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } if (vn_lock(vp, lkflags) == 0) { VI_LOCK(vp); vinactive(vp); VOP_UNLOCK(vp); vdropl(vp); return; } vdefer_inactive_unlocked(vp); } static int vfs_periodic_inactive_filter(struct vnode *vp, void *arg) { return (vp->v_iflag & VI_DEFINACT); } static void __noinline vfs_periodic_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; int lkflags; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) lkflags |= LK_NOWAIT; MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { if ((vp->v_iflag & VI_DEFINACT) == 0) { VI_UNLOCK(vp); continue; } vp->v_iflag &= ~VI_DEFINACT; vfs_deferred_inactive(vp, lkflags); } } static inline bool vfs_want_msync(struct vnode *vp) { struct vm_object *obj; /* * This test may be performed without any locks held. * We rely on vm_object's type stability. */ if (vp->v_vflag & VV_NOSYNC) return (false); obj = vp->v_object; return (obj != NULL && vm_object_mightbedirty(obj)); } static int vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) { if (vp->v_vflag & VV_NOSYNC) return (false); if (vp->v_iflag & VI_DEFINACT) return (true); return (vfs_want_msync(vp)); } static void __noinline vfs_periodic_msync_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; int lkflags, objflags; bool seen_defer; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) { lkflags |= LK_NOWAIT; objflags = OBJPC_NOSYNC; } else { objflags = OBJPC_SYNC; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { seen_defer = false; if (vp->v_iflag & VI_DEFINACT) { vp->v_iflag &= ~VI_DEFINACT; seen_defer = true; } if (!vfs_want_msync(vp)) { if (seen_defer) vfs_deferred_inactive(vp, lkflags); else VI_UNLOCK(vp); continue; } if (vget(vp, lkflags) == 0) { obj = vp->v_object; if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, objflags); VM_OBJECT_WUNLOCK(obj); } vput(vp); if (seen_defer) vdrop(vp); } else { if (seen_defer) vdefer_inactive_unlocked(vp); } } } void vfs_periodic(struct mount *mp, int flags) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) vfs_periodic_inactive(mp, flags); else vfs_periodic_msync_inactive(mp, flags); } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); free(vi, M_VNODEPOLL); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initialize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_lock); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; VFS_VOP_VECTOR_REGISTER(sync_vnodeops); /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } save = curthread_pflags_set(TDP_SYNCIO); /* * The filesystem at hand may be idle with free vnodes stored in the * batch. Return them instead of letting them stay there indefinitely. */ vfs_periodic(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } int vn_need_pageq_flush(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)); } /* * Check if vnode represents a disk device */ bool vn_isdisk_error(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: *errp = error; return (error == 0); } bool vn_isdisk(struct vnode *vp) { int error; return (vn_isdisk_error(vp, &error)); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) { int error; VFS_SMR_ASSERT_ENTERED(); /* Check the owner. */ if (cred->cr_uid == file_uid) { if (file_mode & S_IXUSR) return (0); goto out_error; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) return (0); goto out_error; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) return (0); out_error: /* * Permission check failed, but it is possible denial will get overwritten * (e.g., when root is traversing through a 700 directory owned by someone * else). * * vaccess() calls priv_check_cred which in turn can descent into MAC * modules overriding this result. It's quite unclear what semantics * are allowed for them to operate, thus for safety we don't call them * from within the SMR section. This also means if any such modules * are present, we have to let the regular lookup decide. */ error = priv_check_cred_vfs_lookup_nomac(cred); switch (error) { case 0: return (0); case EAGAIN: /* * MAC modules present. */ return (EAGAIN); case EPERM: return (EACCES); default: return (error); } } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, and credentials. * Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to suppress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (KERNEL_PANICKED() || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 0, "Print vnode details on lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_vnode) vn_printf(vp, "vnode "); if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (!IGNORE_LOCK(vp)) { locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif /* * It may be tempting to add vn_seqc_write_begin/end calls here and * in vop_rename_post but that's not going to work out since some * filesystems relookup vnodes mid-rename. This is probably a bug. * * For now filesystems are expected to do the relevant calls after they * decide what vnodes to operate on. */ if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_symlink_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_strategy_debugpre(void *ap) { struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. */ if ((bp->b_flags & B_CLUSTER) != 0) return; if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } } void vop_lock_debugpre(void *ap) { struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_lock_debugpost(void *ap, int rc) { struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_unlock_debugpre(void *ap) { struct vop_unlock_args *a = ap; ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); } void vop_need_inactive_debugpre(void *ap) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } void vop_need_inactive_debugpost(void *ap, int rc) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } #endif void vop_create_pre(void *ap) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_create_post(void *ap, int rc) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_whiteout_pre(void *ap) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_whiteout_post(void *ap, int rc) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); } void vop_deleteextattr_pre(void *ap) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_pre(void *ap) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_begin(vp); vn_seqc_write_begin(tdvp); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_end(vp); vn_seqc_write_end(tdvp); if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); } } void vop_mkdir_pre(void *ap) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); } #ifdef DEBUG_VFS_LOCKS void vop_mkdir_debugpost(void *ap, int rc) { struct vop_mkdir_args *a; a = ap; if (!rc) cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); } #endif void vop_mknod_pre(void *ap) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { struct vop_reclaim_args *a; struct vnode *vp; a = ap; vp = a->a_vp; ASSERT_VOP_IN_SEQC(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); } void vop_remove_pre(void *ap) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; long hint; if (!rc) { hint = NOTE_WRITE; if (a->a_fdvp == a->a_tdvp) { if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } else { hint |= NOTE_EXTEND; if (a->a_fvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint &= ~NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_pre(void *ap) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_setattr_pre(void *ap) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_setacl_pre(void *ap) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setacl_post(void *ap, int rc __unused) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); } void vop_setextattr_pre(void *ap) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_symlink_pre(void *ap) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); } void vop_close_post(void *ap, int rc) { struct vop_close_args *a = ap; if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); } } void vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } void vop_read_pgcache_post(void *ap, int rc) { struct vop_read_pgcache_args *a = ap; if (!rc) VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); } void vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp); } static void vfs_knl_assert_lock(void *arg, int what) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; if (what == LA_LOCKED) ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); else ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } /* * Returns whether the directory is empty or not. * If it is empty, the return value is 0; otherwise * the return value is an error value (which may * be ENOTEMPTY). */ int vfs_emptydir(struct vnode *vp) { struct uio uio; struct iovec iov; struct dirent *dirent, *dp, *endp; int error, eof; error = 0; eof = 0; ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); iov.iov_base = dirent; iov.iov_len = sizeof(struct dirent); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = sizeof(struct dirent); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; while (eof == 0 && error == 0) { error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, NULL, NULL); if (error != 0) break; endp = (void *)((uint8_t *)dirent + sizeof(struct dirent) - uio.uio_resid); for (dp = dirent; dp < endp; dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { if (dp->d_type == DT_WHT) continue; if (dp->d_namlen == 0) continue; if (dp->d_type != DT_DIR && dp->d_type != DT_UNKNOWN) { error = ENOTEMPTY; break; } if (dp->d_namlen > 2) { error = ENOTEMPTY; break; } if (dp->d_namlen == 1 && dp->d_name[0] != '.') { error = ENOTEMPTY; break; } if (dp->d_namlen == 2 && dp->d_name[1] != '.') { error = ENOTEMPTY; break; } uio.uio_resid = sizeof(struct dirent); } } free(dirent, M_TEMP); return (error); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; *ap->a_ncookies += 1; return (0); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * Clear out a doomed vnode (if any) and replace it with a new one as long * as the fs is not being unmounted. Return the root vnode to the caller. */ static int __noinline vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; restart: if (mp->mnt_rootvnode != NULL) { MNT_ILOCK(mp); vp = mp->mnt_rootvnode; if (vp != NULL) { if (!VN_IS_DOOMED(vp)) { vrefact(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, flags); if (error == 0) { *vpp = vp; return (0); } vrele(vp); goto restart; } /* * Clear the old one. */ mp->mnt_rootvnode = NULL; } MNT_IUNLOCK(mp); if (vp != NULL) { vfs_op_barrier_wait(mp); vrele(vp); } } error = VFS_CACHEDROOT(mp, flags, vpp); if (error != 0) return (error); if (mp->mnt_vfs_ops == 0) { MNT_ILOCK(mp); if (mp->mnt_vfs_ops != 0) { MNT_IUNLOCK(mp); return (0); } if (mp->mnt_rootvnode == NULL) { vrefact(*vpp); mp->mnt_rootvnode = *vpp; } else { if (mp->mnt_rootvnode != *vpp) { if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { panic("%s: mismatch between vnode returned " " by VFS_CACHEDROOT and the one cached " " (%p != %p)", __func__, *vpp, mp->mnt_rootvnode); } } } MNT_IUNLOCK(mp); } return (0); } int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) { struct mount_pcpu *mpcpu; struct vnode *vp; int error; if (!vfs_op_thread_enter(mp, mpcpu)) return (vfs_cache_root_fallback(mp, flags, vpp)); vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL || VN_IS_DOOMED(vp)) { vfs_op_thread_exit(mp, mpcpu); return (vfs_cache_root_fallback(mp, flags, vpp)); } vrefact(vp); vfs_op_thread_exit(mp, mpcpu); error = vn_lock(vp, flags); if (error != 0) { vrele(vp); return (vfs_cache_root_fallback(mp, flags, vpp)); } *vpp = vp; return (0); } struct vnode * vfs_cache_root_clear(struct mount *mp) { struct vnode *vp; /* * ops > 0 guarantees there is nobody who can see this vnode */ MPASS(mp->mnt_vfs_ops > 0); vp = mp->mnt_rootvnode; if (vp != NULL) vn_seqc_write_begin(vp); mp->mnt_rootvnode = NULL; return (vp); } void vfs_cache_root_set(struct mount *mp, struct vnode *vp) { MPASS(mp->mnt_vfs_ops > 0); vrefact(vp); mp->mnt_rootvnode = vp; } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH. */ struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; vp = TAILQ_NEXT(vp, v_nmntvnodes)) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h */ static void mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * Relock the mp mount vnode list lock with the vp vnode interlock in the * conventional lock order during mnt_vnode_next_lazy iteration. * * On entry, the mount vnode list lock is held and the vnode interlock is not. * The list lock is dropped and reacquired. On success, both locks are held. * On failure, the mount vnode list lock is held but the vnode interlock is * not, and the procedure may have yielded. */ static bool mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, struct vnode *vp) { VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, ("%s: bad marker", __func__)); VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, ("%s: inappropriate vnode", __func__)); ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); /* * Note we may be racing against vdrop which transitioned the hold * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, * if we are the only user after we get the interlock we will just * vdrop. */ vhold(vp); mtx_unlock(&mp->mnt_listmtx); VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); goto out_lost; } VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * There is nothing to do if we are the last user. */ if (!refcount_release_if_not_last(&vp->v_holdcnt)) goto out_lost; mtx_lock(&mp->mnt_listmtx); return (true); out_lost: vdropl(vp); maybe_yield(); mtx_lock(&mp->mnt_listmtx); return (false); } static struct vnode * mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_lazylist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } /* * See if we want to process the vnode. Note we may encounter a * long string of vnodes we don't care about and hog the list * as a result. Check for it and requeue the marker. */ VNPASS(!VN_IS_DOOMED(vp), vp); if (!cb(vp, cbarg)) { if (!should_yield()) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); goto restart; } /* * Try-lock because this is the wrong lock order. */ if (!VI_TRYLOCK(vp) && !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) goto restart; KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the lazy list %p %p", vp, mp)); VNPASS(vp->v_mount == mp, vp); VNPASS(!VN_IS_DOOMED(vp), vp); break; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); /* Check if we are done */ if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); ASSERT_VI_LOCKED(vp, "lazy iter"); return (vp); } struct vnode * __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } struct vnode * __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) return (NULL); *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); mtx_lock(&mp->mnt_listmtx); vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); } int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) { if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; return (0); } return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread)); } /* * Do not use this variant unless you have means other than the hold count * to prevent the vnode from getting freed. */ void vn_seqc_write_begin_unheld_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users >= 0, vp); vp->v_seqc_users++; if (vp->v_seqc_users == 1) seqc_sleepable_write_begin(&vp->v_seqc); } void vn_seqc_write_begin_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_holdcnt > 0, vp); vn_seqc_write_begin_unheld_locked(vp); } void vn_seqc_write_begin(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_begin_unheld(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_unheld_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_end_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users > 0, vp); vp->v_seqc_users--; if (vp->v_seqc_users == 0) seqc_sleepable_write_end(&vp->v_seqc); } void vn_seqc_write_end(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_end_locked(vp); VI_UNLOCK(vp); } /* * Special case handling for allocating and freeing vnodes. * * The counter remains unchanged on free so that a doomed vnode will * keep testing as in modify as long as it is accessible with SMR. */ static void vn_seqc_init(struct vnode *vp) { vp->v_seqc = 0; vp->v_seqc_users = 0; } static void vn_seqc_write_end_free(struct vnode *vp) { VNPASS(seqc_in_modify(vp->v_seqc), vp); VNPASS(vp->v_seqc_users == 1, vp); } void vn_irflag_set_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & toset) == 0, vp, ("%s: some of the passed flags already set (have %d, passed %d)\n", __func__, flags, toset)); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_set_cond_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set_cond(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_cond_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_unset_locked(struct vnode *vp, short tounset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & tounset) == tounset, vp, ("%s: some of the passed flags not set (have %d, passed %d)\n", __func__, flags, tounset)); atomic_store_short(&vp->v_irflag, flags & ~tounset); } void vn_irflag_unset(struct vnode *vp, short tounset) { VI_LOCK(vp); vn_irflag_unset_locked(vp, tounset); VI_UNLOCK(vp); } diff --git a/sys/sys/mount.h b/sys/sys/mount.h index ae12c9fc8197..cd0d0071aa1b 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -1,1160 +1,1160 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mount.h 8.21 (Berkeley) 5/20/95 * $FreeBSD$ */ #ifndef _SYS_MOUNT_H_ #define _SYS_MOUNT_H_ #include #include #ifdef _KERNEL #include #include #include #include #include #endif /* * NOTE: When changing statfs structure, mount structure, MNT_* flags or * MNTK_* flags also update DDB show mount command in vfs_subr.c. */ typedef struct fsid { int32_t val[2]; } fsid_t; /* filesystem id type */ #define fsidcmp(a, b) memcmp((a), (b), sizeof(fsid_t)) /* * File identifier. * These are unique per filesystem on a single machine. * * Note that the offset of fid_data is 4 bytes, so care must be taken to avoid * undefined behavior accessing unaligned fields within an embedded struct. */ #define MAXFIDSZ 16 struct fid { u_short fid_len; /* length of data in bytes */ u_short fid_data0; /* force longword alignment */ char fid_data[MAXFIDSZ]; /* data (variable length) */ }; /* * filesystem statistics */ #define MFSNAMELEN 16 /* length of type name including null */ #define MNAMELEN 1024 /* size of on/from name bufs */ #define STATFS_VERSION 0x20140518 /* current version number */ struct statfs { uint32_t f_version; /* structure version number */ uint32_t f_type; /* type of filesystem */ uint64_t f_flags; /* copy of mount exported flags */ uint64_t f_bsize; /* filesystem fragment size */ uint64_t f_iosize; /* optimal transfer block size */ uint64_t f_blocks; /* total data blocks in filesystem */ uint64_t f_bfree; /* free blocks in filesystem */ int64_t f_bavail; /* free blocks avail to non-superuser */ uint64_t f_files; /* total file nodes in filesystem */ int64_t f_ffree; /* free nodes avail to non-superuser */ uint64_t f_syncwrites; /* count of sync writes since mount */ uint64_t f_asyncwrites; /* count of async writes since mount */ uint64_t f_syncreads; /* count of sync reads since mount */ uint64_t f_asyncreads; /* count of async reads since mount */ uint64_t f_spare[10]; /* unused spare */ uint32_t f_namemax; /* maximum filename length */ uid_t f_owner; /* user that mounted the filesystem */ fsid_t f_fsid; /* filesystem id */ char f_charspare[80]; /* spare string space */ char f_fstypename[MFSNAMELEN]; /* filesystem type name */ char f_mntfromname[MNAMELEN]; /* mounted filesystem */ char f_mntonname[MNAMELEN]; /* directory on which mounted */ }; #if defined(_WANT_FREEBSD11_STATFS) || defined(_KERNEL) #define FREEBSD11_STATFS_VERSION 0x20030518 /* current version number */ struct freebsd11_statfs { uint32_t f_version; /* structure version number */ uint32_t f_type; /* type of filesystem */ uint64_t f_flags; /* copy of mount exported flags */ uint64_t f_bsize; /* filesystem fragment size */ uint64_t f_iosize; /* optimal transfer block size */ uint64_t f_blocks; /* total data blocks in filesystem */ uint64_t f_bfree; /* free blocks in filesystem */ int64_t f_bavail; /* free blocks avail to non-superuser */ uint64_t f_files; /* total file nodes in filesystem */ int64_t f_ffree; /* free nodes avail to non-superuser */ uint64_t f_syncwrites; /* count of sync writes since mount */ uint64_t f_asyncwrites; /* count of async writes since mount */ uint64_t f_syncreads; /* count of sync reads since mount */ uint64_t f_asyncreads; /* count of async reads since mount */ uint64_t f_spare[10]; /* unused spare */ uint32_t f_namemax; /* maximum filename length */ uid_t f_owner; /* user that mounted the filesystem */ fsid_t f_fsid; /* filesystem id */ char f_charspare[80]; /* spare string space */ char f_fstypename[16]; /* filesystem type name */ char f_mntfromname[88]; /* mounted filesystem */ char f_mntonname[88]; /* directory on which mounted */ }; #endif /* _WANT_FREEBSD11_STATFS || _KERNEL */ #ifdef _KERNEL #define OMFSNAMELEN 16 /* length of fs type name, including null */ #define OMNAMELEN (88 - 2 * sizeof(long)) /* size of on/from name bufs */ /* XXX getfsstat.2 is out of date with write and read counter changes here. */ /* XXX statfs.2 is out of date with read counter changes here. */ struct ostatfs { long f_spare2; /* placeholder */ long f_bsize; /* fundamental filesystem block size */ long f_iosize; /* optimal transfer block size */ long f_blocks; /* total data blocks in filesystem */ long f_bfree; /* free blocks in fs */ long f_bavail; /* free blocks avail to non-superuser */ long f_files; /* total file nodes in filesystem */ long f_ffree; /* free file nodes in fs */ fsid_t f_fsid; /* filesystem id */ uid_t f_owner; /* user that mounted the filesystem */ int f_type; /* type of filesystem */ int f_flags; /* copy of mount exported flags */ long f_syncwrites; /* count of sync writes since mount */ long f_asyncwrites; /* count of async writes since mount */ char f_fstypename[OMFSNAMELEN]; /* fs type name */ char f_mntonname[OMNAMELEN]; /* directory on which mounted */ long f_syncreads; /* count of sync reads since mount */ long f_asyncreads; /* count of async reads since mount */ short f_spares1; /* unused spare */ char f_mntfromname[OMNAMELEN];/* mounted filesystem */ short f_spares2; /* unused spare */ /* * XXX on machines where longs are aligned to 8-byte boundaries, there * is an unnamed int32_t here. This spare was after the apparent end * of the struct until we bit off the read counters from f_mntonname. */ long f_spare[2]; /* unused spare */ }; TAILQ_HEAD(vnodelst, vnode); /* Mount options list */ TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; int pos; int seen; }; struct mount_pcpu { int mntp_thread_in_ops; int mntp_ref; int mntp_lockref; int mntp_writeopcount; }; _Static_assert(sizeof(struct mount_pcpu) == 16, "the struct is allocated from pcpu 16 zone"); /* * Structure per mounted filesystem. Each mounted filesystem has an * array of operations and an instance record. The filesystems are * put on a doubly linked list. * * Lock reference: * l - mnt_listmtx * m - mountlist_mtx * i - interlock * i* - interlock of uppers' list head * v - vnode freelist mutex * * Unmarked fields are considered stable as long as a ref is held. * */ struct mount { int mnt_vfs_ops; /* (i) pending vfs ops */ int mnt_kern_flag; /* (i) kernel only flags */ uint64_t mnt_flag; /* (i) flags shared with user */ struct mount_pcpu *mnt_pcpu; /* per-CPU data */ struct vnode *mnt_rootvnode; struct vnode *mnt_vnodecovered; /* vnode we mounted on */ struct vfsops *mnt_op; /* operations on fs */ struct vfsconf *mnt_vfc; /* configuration info */ struct mtx __aligned(CACHE_LINE_SIZE) mnt_mtx; /* mount structure interlock */ int mnt_gen; /* struct mount generation */ #define mnt_startzero mnt_list TAILQ_ENTRY(mount) mnt_list; /* (m) mount list */ struct vnode *mnt_syncer; /* syncer vnode */ int mnt_ref; /* (i) Reference count */ struct vnodelst mnt_nvnodelist; /* (i) list of vnodes */ int mnt_nvnodelistsize; /* (i) # of vnodes */ int mnt_writeopcount; /* (i) write syscalls pending */ struct vfsoptlist *mnt_opt; /* current mount options */ struct vfsoptlist *mnt_optnew; /* new options passed to fs */ - uint64_t mnt_maxsymlinklen; /* max size of short symlink */ + u_int mnt_pad0; /* was mnt_maxsymlinklen */ struct statfs mnt_stat; /* cache of filesystem stats */ struct ucred *mnt_cred; /* credentials of mounter */ void * mnt_data; /* private data */ time_t mnt_time; /* last time written*/ int mnt_iosize_max; /* max size for clusters, etc */ struct netexport *mnt_export; /* export list */ struct label *mnt_label; /* MAC label for the fs */ u_int mnt_hashseed; /* Random seed for vfs_hash */ int mnt_lockref; /* (i) Lock reference count */ int mnt_secondary_writes; /* (i) # of secondary writes */ int mnt_secondary_accwrites;/* (i) secondary wr. starts */ struct thread *mnt_susp_owner; /* (i) thread owning suspension */ #define mnt_endzero mnt_gjprovider char *mnt_gjprovider; /* gjournal provider name */ struct mtx mnt_listmtx; struct vnodelst mnt_lazyvnodelist; /* (l) list of lazy vnodes */ int mnt_lazyvnodelistsize; /* (l) # of lazy vnodes */ struct lock mnt_explock; /* vfs_export walkers lock */ TAILQ_ENTRY(mount) mnt_upper_link; /* (i*) we in the all uppers */ TAILQ_HEAD(, mount) mnt_uppers; /* (i) upper mounts over us */ }; /* * Definitions for MNT_VNODE_FOREACH_ALL. */ struct vnode *__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp); struct vnode *__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp); void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp); #define MNT_VNODE_FOREACH_ALL(vp, mp, mvp) \ for (vp = __mnt_vnode_first_all(&(mvp), (mp)); \ (vp) != NULL; vp = __mnt_vnode_next_all(&(mvp), (mp))) #define MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp) \ do { \ MNT_ILOCK(mp); \ __mnt_vnode_markerfree_all(&(mvp), (mp)); \ /* MNT_IUNLOCK(mp); -- done in above function */ \ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); \ } while (0) /* * Definitions for MNT_VNODE_FOREACH_LAZY. */ typedef int mnt_lazy_cb_t(struct vnode *, void *); struct vnode *__mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg); struct vnode *__mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg); void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp); #define MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, cb, cbarg) \ for (vp = __mnt_vnode_first_lazy(&(mvp), (mp), (cb), (cbarg)); \ (vp) != NULL; \ vp = __mnt_vnode_next_lazy(&(mvp), (mp), (cb), (cbarg))) #define MNT_VNODE_FOREACH_LAZY_ABORT(mp, mvp) \ __mnt_vnode_markerfree_lazy(&(mvp), (mp)) #define MNT_ILOCK(mp) mtx_lock(&(mp)->mnt_mtx) #define MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx) #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) #define MNT_MTX(mp) (&(mp)->mnt_mtx) #define MNT_REF(mp) do { \ mtx_assert(MNT_MTX(mp), MA_OWNED); \ mp->mnt_ref++; \ } while (0) #define MNT_REL(mp) do { \ mtx_assert(MNT_MTX(mp), MA_OWNED); \ (mp)->mnt_ref--; \ if ((mp)->mnt_vfs_ops && (mp)->mnt_ref < 0) \ vfs_dump_mount_counters(mp); \ if ((mp)->mnt_ref == 0 && (mp)->mnt_vfs_ops) \ wakeup((mp)); \ } while (0) #endif /* _KERNEL */ #if defined(_WANT_MNTOPTNAMES) || defined(_KERNEL) struct mntoptnames { uint64_t o_opt; const char *o_name; }; #define MNTOPT_NAMES \ { MNT_ASYNC, "asynchronous" }, \ { MNT_EXPORTED, "NFS exported" }, \ { MNT_LOCAL, "local" }, \ { MNT_NOATIME, "noatime" }, \ { MNT_NOEXEC, "noexec" }, \ { MNT_NOSUID, "nosuid" }, \ { MNT_NOSYMFOLLOW, "nosymfollow" }, \ { MNT_QUOTA, "with quotas" }, \ { MNT_RDONLY, "read-only" }, \ { MNT_SYNCHRONOUS, "synchronous" }, \ { MNT_UNION, "union" }, \ { MNT_NOCLUSTERR, "noclusterr" }, \ { MNT_NOCLUSTERW, "noclusterw" }, \ { MNT_SUIDDIR, "suiddir" }, \ { MNT_SOFTDEP, "soft-updates" }, \ { MNT_SUJ, "journaled soft-updates" }, \ { MNT_MULTILABEL, "multilabel" }, \ { MNT_ACLS, "acls" }, \ { MNT_NFS4ACLS, "nfsv4acls" }, \ { MNT_GJOURNAL, "gjournal" }, \ { MNT_AUTOMOUNTED, "automounted" }, \ { MNT_VERIFIED, "verified" }, \ { MNT_UNTRUSTED, "untrusted" }, \ { MNT_NOCOVER, "nocover" }, \ { MNT_EMPTYDIR, "emptydir" }, \ { MNT_UPDATE, "update" }, \ { MNT_DELEXPORT, "delexport" }, \ { MNT_RELOAD, "reload" }, \ { MNT_FORCE, "force" }, \ { MNT_SNAPSHOT, "snapshot" }, \ { 0, NULL } #endif /* * User specifiable flags, stored in mnt_flag. */ #define MNT_RDONLY 0x0000000000000001ULL /* read only filesystem */ #define MNT_SYNCHRONOUS 0x0000000000000002ULL /* fs written synchronously */ #define MNT_NOEXEC 0x0000000000000004ULL /* can't exec from filesystem */ #define MNT_NOSUID 0x0000000000000008ULL /* don't honor setuid fs bits */ #define MNT_NFS4ACLS 0x0000000000000010ULL /* enable NFS version 4 ACLs */ #define MNT_UNION 0x0000000000000020ULL /* union with underlying fs */ #define MNT_ASYNC 0x0000000000000040ULL /* fs written asynchronously */ #define MNT_SUIDDIR 0x0000000000100000ULL /* special SUID dir handling */ #define MNT_SOFTDEP 0x0000000000200000ULL /* using soft updates */ #define MNT_NOSYMFOLLOW 0x0000000000400000ULL /* do not follow symlinks */ #define MNT_GJOURNAL 0x0000000002000000ULL /* GEOM journal support enabled */ #define MNT_MULTILABEL 0x0000000004000000ULL /* MAC support for objects */ #define MNT_ACLS 0x0000000008000000ULL /* ACL support enabled */ #define MNT_NOATIME 0x0000000010000000ULL /* dont update file access time */ #define MNT_NOCLUSTERR 0x0000000040000000ULL /* disable cluster read */ #define MNT_NOCLUSTERW 0x0000000080000000ULL /* disable cluster write */ #define MNT_SUJ 0x0000000100000000ULL /* using journaled soft updates */ #define MNT_AUTOMOUNTED 0x0000000200000000ULL /* mounted by automountd(8) */ #define MNT_UNTRUSTED 0x0000000800000000ULL /* filesys metadata untrusted */ /* * NFS export related mount flags. */ #define MNT_EXRDONLY 0x0000000000000080ULL /* exported read only */ #define MNT_EXPORTED 0x0000000000000100ULL /* filesystem is exported */ #define MNT_DEFEXPORTED 0x0000000000000200ULL /* exported to the world */ #define MNT_EXPORTANON 0x0000000000000400ULL /* anon uid mapping for all */ #define MNT_EXKERB 0x0000000000000800ULL /* exported with Kerberos */ #define MNT_EXPUBLIC 0x0000000020000000ULL /* public export (WebNFS) */ #define MNT_EXTLS 0x0000004000000000ULL /* require TLS */ #define MNT_EXTLSCERT 0x0000008000000000ULL /* require TLS with client cert */ #define MNT_EXTLSCERTUSER 0x0000010000000000ULL /* require TLS with user cert */ /* * Flags set by internal operations, * but visible to the user. * XXX some of these are not quite right.. (I've never seen the root flag set) */ #define MNT_LOCAL 0x0000000000001000ULL /* filesystem is stored locally */ #define MNT_QUOTA 0x0000000000002000ULL /* quotas are enabled on fs */ #define MNT_ROOTFS 0x0000000000004000ULL /* identifies the root fs */ #define MNT_USER 0x0000000000008000ULL /* mounted by a user */ #define MNT_IGNORE 0x0000000000800000ULL /* do not show entry in df */ #define MNT_VERIFIED 0x0000000400000000ULL /* filesystem is verified */ /* * Mask of flags that are visible to statfs(). * XXX I think that this could now become (~(MNT_CMDFLAGS)) * but the 'mount' program may need changing to handle this. */ #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ MNT_NOSUID | MNT_UNION | MNT_SUJ | \ MNT_ASYNC | MNT_EXRDONLY | MNT_EXPORTED | \ MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ MNT_LOCAL | MNT_USER | MNT_QUOTA | \ MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | \ MNT_NFS4ACLS | MNT_AUTOMOUNTED | MNT_VERIFIED | \ MNT_UNTRUSTED) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | \ MNT_NOATIME | \ MNT_NOSYMFOLLOW | MNT_IGNORE | \ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR | \ MNT_ACLS | MNT_USER | MNT_NFS4ACLS | \ MNT_AUTOMOUNTED | MNT_UNTRUSTED) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. * XXX: These are not STATES and really should be somewhere else. * XXX: MNT_BYFSID and MNT_NONBUSY collide with MNT_ACLS and MNT_MULTILABEL, * but because MNT_ACLS and MNT_MULTILABEL are only used for mount(2), * and MNT_BYFSID and MNT_NONBUSY are only used for unmount(2), * it's harmless. */ #define MNT_UPDATE 0x0000000000010000ULL /* not real mount, just update */ #define MNT_DELEXPORT 0x0000000000020000ULL /* delete export host lists */ #define MNT_RELOAD 0x0000000000040000ULL /* reload filesystem data */ #define MNT_FORCE 0x0000000000080000ULL /* force unmount or readonly */ #define MNT_SNAPSHOT 0x0000000001000000ULL /* snapshot the filesystem */ #define MNT_NONBUSY 0x0000000004000000ULL /* check vnode use counts. */ #define MNT_BYFSID 0x0000000008000000ULL /* specify filesystem by ID. */ #define MNT_NOCOVER 0x0000001000000000ULL /* Do not cover a mount point */ #define MNT_EMPTYDIR 0x0000002000000000ULL /* Only mount on empty dir */ #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_NONBUSY | \ MNT_BYFSID | MNT_NOCOVER | MNT_EMPTYDIR) /* * Internal filesystem control flags stored in mnt_kern_flag. * * MNTK_UNMOUNT locks the mount entry so that name lookup cannot * proceed past the mount point. This keeps the subtree stable during * mounts and unmounts. When non-forced unmount flushes all vnodes * from the mp queue, the MNTK_UNMOUNT flag prevents insmntque() from * queueing new vnodes. * * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while * dounmount() is still waiting to lock the mountpoint. This allows * the filesystem to cancel operations that might otherwise deadlock * with the unmount attempt (used by NFS). */ #define MNTK_UNMOUNTF 0x00000001 /* forced unmount in progress */ #define MNTK_ASYNC 0x00000002 /* filtered async flag */ #define MNTK_SOFTDEP 0x00000004 /* async disabled by softdep */ #define MNTK_NOMSYNC 0x00000008 /* don't do msync */ #define MNTK_DRAINING 0x00000010 /* lock draining is happening */ #define MNTK_REFEXPIRE 0x00000020 /* refcount expiring is happening */ #define MNTK_EXTENDED_SHARED 0x00000040 /* Allow shared locking for more ops */ #define MNTK_SHARED_WRITES 0x00000080 /* Allow shared locking for writes */ #define MNTK_NO_IOPF 0x00000100 /* Disallow page faults during reads and writes. Filesystem shall properly handle i/o state on EFAULT. */ #define MNTK_VGONE_UPPER 0x00000200 #define MNTK_VGONE_WAITER 0x00000400 #define MNTK_LOOKUP_EXCL_DOTDOT 0x00000800 #define MNTK_MARKER 0x00001000 #define MNTK_UNMAPPED_BUFS 0x00002000 #define MNTK_USES_BCACHE 0x00004000 /* FS uses the buffer cache. */ #define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */ #define MNTK_VMSETSIZE_BUG 0x00010000 #define MNTK_UNIONFS 0x00020000 /* A hack for F_ISUNIONSTACK */ #define MNTK_FPLOOKUP 0x00040000 /* fast path lookup is supported */ #define MNTK_SUSPEND_ALL 0x00080000 /* Suspended by all-fs suspension */ #define MNTK_NOASYNC 0x00800000 /* disable async */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ #define MNTK_SUSPEND2 0x04000000 /* block secondary writes */ #define MNTK_SUSPENDED 0x10000000 /* write operations are suspended */ #define MNTK_NULL_NOCACHE 0x20000000 /* auto disable cache for nullfs mounts over this fs */ #define MNTK_LOOKUP_SHARED 0x40000000 /* FS supports shared lock lookups */ #define MNTK_NOKNOTE 0x80000000 /* Don't send KNOTEs from VOP hooks */ #ifdef _KERNEL static inline int MNT_SHARED_WRITES(struct mount *mp) { return (mp != NULL && (mp->mnt_kern_flag & MNTK_SHARED_WRITES) != 0); } static inline int MNT_EXTENDED_SHARED(struct mount *mp) { return (mp != NULL && (mp->mnt_kern_flag & MNTK_EXTENDED_SHARED) != 0); } #endif /* * Sysctl CTL_VFS definitions. * * Second level identifier specifies which filesystem. Second level * identifier VFS_VFSCONF returns information about all filesystems. * Second level identifier VFS_GENERIC is non-terminal. */ #define VFS_VFSCONF 0 /* get configured filesystems */ #define VFS_GENERIC 0 /* generic filesystem information */ /* * Third level identifiers for VFS_GENERIC are given below; third * level identifiers for specific filesystems are given in their * mount specific header files. */ #define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */ #define VFS_CONF 2 /* struct: vfsconf for filesystem given as next argument */ /* * Flags for various system call interfaces. * * waitfor flags to vfs_sync() and getfsstat() */ #define MNT_WAIT 1 /* synchronously wait for I/O to complete */ #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ #define MNT_LAZY 3 /* push data not written by filesystem syncer */ #define MNT_SUSPEND 4 /* Suspend file system after sync */ /* * Generic file handle */ struct fhandle { fsid_t fh_fsid; /* Filesystem id of mount point */ struct fid fh_fid; /* Filesys specific id */ }; typedef struct fhandle fhandle_t; /* * Old export arguments without security flavor list */ struct oexport_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ }; /* * Not quite so old export arguments with 32bit ex_flags and xucred ex_anon. */ #define MAXSECFLAVORS 5 struct o2export_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ int ex_numsecflavors; /* security flavor count */ int ex_secflavors[MAXSECFLAVORS]; /* list of security flavors */ }; /* * Export arguments for local filesystem mount calls. */ struct export_args { uint64_t ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ uid_t ex_uid; /* mapping for anonymous user */ int ex_ngroups; gid_t *ex_groups; struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ int ex_numsecflavors; /* security flavor count */ int ex_secflavors[MAXSECFLAVORS]; /* list of security flavors */ }; /* * Structure holding information for a publicly exported filesystem * (WebNFS). Currently the specs allow just for one such filesystem. */ struct nfs_public { int np_valid; /* Do we hold valid information */ fhandle_t np_handle; /* Filehandle for pub fs (internal) */ struct mount *np_mount; /* Mountpoint of exported fs */ char *np_index; /* Index file */ }; /* * Filesystem configuration information. One of these exists for each * type of filesystem supported by the kernel. These are searched at * mount time to identify the requested filesystem. * * XXX: Never change the first two arguments! */ struct vfsconf { u_int vfc_version; /* ABI version number */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ struct vfsops *vfc_vfsops; /* filesystem operations vector */ struct vfsops *vfc_vfsops_sd; /* ... signal-deferred */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ int vfc_prison_flag; /* prison allow.mount.* flag */ struct vfsoptdecl *vfc_opts; /* mount options */ TAILQ_ENTRY(vfsconf) vfc_list; /* list of vfscons */ }; /* Userland version of the struct vfsconf. */ struct xvfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsconf *vfc_next; /* next in list */ }; #ifndef BURN_BRIDGES struct ovfsconf { void *vfc_vfsops; char vfc_name[32]; int vfc_index; int vfc_refcount; int vfc_flags; }; #endif /* * NB: these flags refer to IMPLEMENTATION properties, not properties of * any actual mounts; i.e., it does not make sense to change the flags. */ #define VFCF_STATIC 0x00010000 /* statically compiled into kernel */ #define VFCF_NETWORK 0x00020000 /* may get data over the network */ #define VFCF_READONLY 0x00040000 /* writes are not implemented */ #define VFCF_SYNTHETIC 0x00080000 /* data does not represent real files */ #define VFCF_LOOPBACK 0x00100000 /* aliases some other mounted FS */ #define VFCF_UNICODE 0x00200000 /* stores file names as Unicode */ #define VFCF_JAIL 0x00400000 /* can be mounted from within a jail */ #define VFCF_DELEGADMIN 0x00800000 /* supports delegated administration */ #define VFCF_SBDRY 0x01000000 /* Stop at Boundary: defer stop requests to kernel->user (AST) transition */ typedef uint32_t fsctlop_t; struct vfsidctl { int vc_vers; /* should be VFSIDCTL_VERS1 (below) */ fsid_t vc_fsid; /* fsid to operate on */ char vc_fstypename[MFSNAMELEN]; /* type of fs 'nfs' or '*' */ fsctlop_t vc_op; /* operation VFS_CTL_* (below) */ void *vc_ptr; /* pointer to data structure */ size_t vc_len; /* sizeof said structure */ u_int32_t vc_spare[12]; /* spare (must be zero) */ }; /* vfsidctl API version. */ #define VFS_CTL_VERS1 0x01 /* * New style VFS sysctls, do not reuse/conflict with the namespace for * private sysctls. * All "global" sysctl ops have the 33rd bit set: * 0x...1.... * Private sysctl ops should have the 33rd bit unset. */ #define VFS_CTL_QUERY 0x00010001 /* anything wrong? (vfsquery) */ #define VFS_CTL_TIMEO 0x00010002 /* set timeout for vfs notification */ #define VFS_CTL_NOLOCKS 0x00010003 /* disable file locking */ struct vfsquery { u_int32_t vq_flags; u_int32_t vq_spare[31]; }; /* vfsquery flags */ #define VQ_NOTRESP 0x0001 /* server down */ #define VQ_NEEDAUTH 0x0002 /* server bad auth */ #define VQ_LOWDISK 0x0004 /* we're low on space */ #define VQ_MOUNT 0x0008 /* new filesystem arrived */ #define VQ_UNMOUNT 0x0010 /* filesystem has left */ #define VQ_DEAD 0x0020 /* filesystem is dead, needs force unmount */ #define VQ_ASSIST 0x0040 /* filesystem needs assistance from external program */ #define VQ_NOTRESPLOCK 0x0080 /* server lockd down */ #define VQ_FLAG0100 0x0100 /* placeholder */ #define VQ_FLAG0200 0x0200 /* placeholder */ #define VQ_FLAG0400 0x0400 /* placeholder */ #define VQ_FLAG0800 0x0800 /* placeholder */ #define VQ_FLAG1000 0x1000 /* placeholder */ #define VQ_FLAG2000 0x2000 /* placeholder */ #define VQ_FLAG4000 0x4000 /* placeholder */ #define VQ_FLAG8000 0x8000 /* placeholder */ #ifdef _KERNEL /* Point a sysctl request at a vfsidctl's data. */ #define VCTLTOREQ(vc, req) \ do { \ (req)->newptr = (vc)->vc_ptr; \ (req)->newlen = (vc)->vc_len; \ (req)->newidx = 0; \ } while (0) #endif struct iovec; struct uio; #ifdef _KERNEL /* * vfs_busy specific flags and mask. */ #define MBF_NOWAIT 0x01 #define MBF_MNTLSTLOCK 0x02 #define MBF_MASK (MBF_NOWAIT | MBF_MNTLSTLOCK) #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_MOUNT); MALLOC_DECLARE(M_STATFS); #endif extern int maxvfsconf; /* highest defined filesystem type */ TAILQ_HEAD(vfsconfhead, vfsconf); extern struct vfsconfhead vfsconf; /* * Operations supported on mounted filesystem. */ struct mount_args; struct nameidata; struct sysctl_req; struct mntarg; /* * N.B., vfs_cmount is the ancient vfsop invoked by the old mount(2) syscall. * The new way is vfs_mount. * * vfs_cmount implementations typically translate arguments from their * respective old per-FS structures into the key-value list supported by * nmount(2), then use kernel_mount(9) to mimic nmount(2) from kernelspace. * * Filesystems with mounters that use nmount(2) do not need to and should not * implement vfs_cmount. Hopefully a future cleanup can remove vfs_cmount and * mount(2) entirely. */ typedef int vfs_cmount_t(struct mntarg *ma, void *data, uint64_t flags); typedef int vfs_unmount_t(struct mount *mp, int mntflags); typedef int vfs_root_t(struct mount *mp, int flags, struct vnode **vpp); typedef int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid, void *arg); typedef int vfs_statfs_t(struct mount *mp, struct statfs *sbp); typedef int vfs_sync_t(struct mount *mp, int waitfor); typedef int vfs_vget_t(struct mount *mp, ino_t ino, int flags, struct vnode **vpp); typedef int vfs_fhtovp_t(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp); typedef int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam, uint64_t *extflagsp, struct ucred **credanonp, int *numsecflavors, int *secflavors); typedef int vfs_init_t(struct vfsconf *); typedef int vfs_uninit_t(struct vfsconf *); typedef int vfs_extattrctl_t(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname); typedef int vfs_mount_t(struct mount *mp); typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op, struct sysctl_req *req); typedef void vfs_susp_clean_t(struct mount *mp); typedef void vfs_notify_lowervp_t(struct mount *mp, struct vnode *lowervp); typedef void vfs_purge_t(struct mount *mp); struct vfsops { vfs_mount_t *vfs_mount; vfs_cmount_t *vfs_cmount; vfs_unmount_t *vfs_unmount; vfs_root_t *vfs_root; vfs_root_t *vfs_cachedroot; vfs_quotactl_t *vfs_quotactl; vfs_statfs_t *vfs_statfs; vfs_sync_t *vfs_sync; vfs_vget_t *vfs_vget; vfs_fhtovp_t *vfs_fhtovp; vfs_checkexp_t *vfs_checkexp; vfs_init_t *vfs_init; vfs_uninit_t *vfs_uninit; vfs_extattrctl_t *vfs_extattrctl; vfs_sysctl_t *vfs_sysctl; vfs_susp_clean_t *vfs_susp_clean; vfs_notify_lowervp_t *vfs_reclaim_lowervp; vfs_notify_lowervp_t *vfs_unlink_lowervp; vfs_purge_t *vfs_purge; vfs_mount_t *vfs_spare[6]; /* spares for ABI compat */ }; vfs_statfs_t __vfs_statfs; #define VFS_MOUNT(MP) ({ \ int _rc; \ \ TSRAW(curthread, TS_ENTER, "VFS_MOUNT", (MP)->mnt_vfc->vfc_name);\ _rc = (*(MP)->mnt_op->vfs_mount)(MP); \ TSRAW(curthread, TS_EXIT, "VFS_MOUNT", (MP)->mnt_vfc->vfc_name);\ _rc; }) #define VFS_UNMOUNT(MP, FORCE) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_unmount)(MP, FORCE); \ _rc; }) #define VFS_ROOT(MP, FLAGS, VPP) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_root)(MP, FLAGS, VPP); \ _rc; }) #define VFS_CACHEDROOT(MP, FLAGS, VPP) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_cachedroot)(MP, FLAGS, VPP); \ _rc; }) #define VFS_QUOTACTL(MP, C, U, A) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A); \ _rc; }) #define VFS_STATFS(MP, SBP) ({ \ int _rc; \ \ _rc = __vfs_statfs((MP), (SBP)); \ _rc; }) #define VFS_SYNC(MP, WAIT) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_sync)(MP, WAIT); \ _rc; }) #define VFS_VGET(MP, INO, FLAGS, VPP) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP); \ _rc; }) #define VFS_FHTOVP(MP, FIDP, FLAGS, VPP) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, FLAGS, VPP); \ _rc; }) #define VFS_CHECKEXP(MP, NAM, EXFLG, CRED, NUMSEC, SEC) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED, NUMSEC,\ SEC); \ _rc; }) #define VFS_EXTATTRCTL(MP, C, FN, NS, N) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N); \ _rc; }) #define VFS_SYSCTL(MP, OP, REQ) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_sysctl)(MP, OP, REQ); \ _rc; }) #define VFS_SUSP_CLEAN(MP) do { \ if (*(MP)->mnt_op->vfs_susp_clean != NULL) { \ (*(MP)->mnt_op->vfs_susp_clean)(MP); \ } \ } while (0) #define VFS_RECLAIM_LOWERVP(MP, VP) do { \ if (*(MP)->mnt_op->vfs_reclaim_lowervp != NULL) { \ (*(MP)->mnt_op->vfs_reclaim_lowervp)((MP), (VP)); \ } \ } while (0) #define VFS_UNLINK_LOWERVP(MP, VP) do { \ if (*(MP)->mnt_op->vfs_unlink_lowervp != NULL) { \ (*(MP)->mnt_op->vfs_unlink_lowervp)((MP), (VP)); \ } \ } while (0) #define VFS_PURGE(MP) do { \ if (*(MP)->mnt_op->vfs_purge != NULL) { \ (*(MP)->mnt_op->vfs_purge)(MP); \ } \ } while (0) #define VFS_KNOTE_LOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), KNF_LISTLOCKED); \ } while (0) #define VFS_KNOTE_UNLOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 0); \ } while (0) #define VFS_NOTIFY_UPPER_RECLAIM 1 #define VFS_NOTIFY_UPPER_UNLINK 2 #include /* * Version numbers. */ #define VFS_VERSION_00 0x19660120 #define VFS_VERSION_01 0x20121030 #define VFS_VERSION_02 0x20180504 #define VFS_VERSION VFS_VERSION_02 #define VFS_SET(vfsops, fsname, flags) \ static struct vfsconf fsname ## _vfsconf = { \ .vfc_version = VFS_VERSION, \ .vfc_name = #fsname, \ .vfc_vfsops = &vfsops, \ .vfc_typenum = -1, \ .vfc_flags = flags, \ }; \ static moduledata_t fsname ## _mod = { \ #fsname, \ vfs_modevent, \ & fsname ## _vfsconf \ }; \ DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE) /* * exported vnode operations */ int dounmount(struct mount *, int, struct thread *); int kernel_mount(struct mntarg *ma, uint64_t flags); int kernel_vmount(int flags, ...); struct mntarg *mount_arg(struct mntarg *ma, const char *name, const void *val, int len); struct mntarg *mount_argb(struct mntarg *ma, int flag, const char *name); struct mntarg *mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...); struct mntarg *mount_argsu(struct mntarg *ma, const char *name, const void *val, int len); void statfs_scale_blocks(struct statfs *sf, long max_size); struct vfsconf *vfs_byname(const char *); struct vfsconf *vfs_byname_kld(const char *, struct thread *td, int *); void vfs_mount_destroy(struct mount *); void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); void vfs_freeopts(struct vfsoptlist *opts); void vfs_deleteopt(struct vfsoptlist *opts, const char *name); int vfs_buildopts(struct uio *auio, struct vfsoptlist **options); int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val); int vfs_getopt(struct vfsoptlist *, const char *, void **, int *); int vfs_getopt_pos(struct vfsoptlist *opts, const char *name); int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value); char *vfs_getopts(struct vfsoptlist *, const char *, int *error); int vfs_copyopt(struct vfsoptlist *, const char *, void *, int); int vfs_filteropt(struct vfsoptlist *, const char **legal); void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...); int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...); int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len); int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len); int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value); int vfs_setpublicfs /* set publicly exported fs */ (struct mount *, struct netexport *, struct export_args *); void vfs_periodic(struct mount *, int); int vfs_busy(struct mount *, int); int vfs_export /* process mount export info */ (struct mount *, struct export_args *); void vfs_allocate_syncvnode(struct mount *); void vfs_deallocate_syncvnode(struct mount *); int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions); void vfs_getnewfsid(struct mount *); struct cdev *vfs_getrootfsid(struct mount *); struct mount *vfs_getvfs(fsid_t *); /* return vfs given fsid */ struct mount *vfs_busyfs(fsid_t *); int vfs_modevent(module_t, int, void *); void vfs_mount_error(struct mount *, const char *, ...); void vfs_mountroot(void); /* mount our root filesystem */ void vfs_mountedfrom(struct mount *, const char *from); void vfs_notify_upper(struct vnode *, int); struct mount *vfs_ref_from_vp(struct vnode *); void vfs_ref(struct mount *); void vfs_rel(struct mount *); struct mount *vfs_mount_alloc(struct vnode *, struct vfsconf *, const char *, struct ucred *); int vfs_suser(struct mount *, struct thread *); void vfs_unbusy(struct mount *); void vfs_unmountall(void); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx_padalign mountlist_mtx; extern struct nfs_public nfs_pub; extern struct sx vfsconf_sx; #define vfsconf_lock() sx_xlock(&vfsconf_sx) #define vfsconf_unlock() sx_xunlock(&vfsconf_sx) #define vfsconf_slock() sx_slock(&vfsconf_sx) #define vfsconf_sunlock() sx_sunlock(&vfsconf_sx) struct vnode *mntfs_allocvp(struct mount *, struct vnode *); void mntfs_freevp(struct vnode *); /* * Declarations for these vfs default operations are located in * kern/vfs_default.c. They will be automatically used to replace * null entries in VFS ops tables when registering a new filesystem * type in the global table. */ vfs_root_t vfs_stdroot; vfs_quotactl_t vfs_stdquotactl; vfs_statfs_t vfs_stdstatfs; vfs_sync_t vfs_stdsync; vfs_sync_t vfs_stdnosync; vfs_vget_t vfs_stdvget; vfs_fhtovp_t vfs_stdfhtovp; vfs_checkexp_t vfs_stdcheckexp; vfs_init_t vfs_stdinit; vfs_uninit_t vfs_stduninit; vfs_extattrctl_t vfs_stdextattrctl; vfs_sysctl_t vfs_stdsysctl; void syncer_suspend(void); void syncer_resume(void); struct vnode *vfs_cache_root_clear(struct mount *); void vfs_cache_root_set(struct mount *, struct vnode *); void vfs_op_barrier_wait(struct mount *); void vfs_op_enter(struct mount *); void vfs_op_exit_locked(struct mount *); void vfs_op_exit(struct mount *); #ifdef DIAGNOSTIC void vfs_assert_mount_counters(struct mount *); void vfs_dump_mount_counters(struct mount *); #else #define vfs_assert_mount_counters(mp) do { } while (0) #define vfs_dump_mount_counters(mp) do { } while (0) #endif enum mount_counter { MNT_COUNT_REF, MNT_COUNT_LOCKREF, MNT_COUNT_WRITEOPCOUNT }; int vfs_mount_fetch_counter(struct mount *, enum mount_counter); void suspend_all_fs(void); void resume_all_fs(void); /* * Code transitioning mnt_vfs_ops to > 0 issues IPIs until it observes * all CPUs not executing code enclosed by thread_in_ops_pcpu variable. * * This provides an invariant that by the time the last CPU is observed not * executing, everyone else entering will see the counter > 0 and exit. * * Note there is no barrier between vfs_ops and the rest of the code in the * section. It is not necessary as the writer has to wait for everyone to drain * before making any changes or only make changes safe while the section is * executed. */ #define vfs_mount_pcpu(mp) zpcpu_get(mp->mnt_pcpu) #define vfs_mount_pcpu_remote(mp, cpu) zpcpu_get_cpu(mp->mnt_pcpu, cpu) #define vfs_op_thread_entered(mp) ({ \ MPASS(curthread->td_critnest > 0); \ struct mount_pcpu *_mpcpu = vfs_mount_pcpu(mp); \ _mpcpu->mntp_thread_in_ops == 1; \ }) #define vfs_op_thread_enter_crit(mp, _mpcpu) ({ \ bool _retval_crit = true; \ MPASS(curthread->td_critnest > 0); \ _mpcpu = vfs_mount_pcpu(mp); \ MPASS(mpcpu->mntp_thread_in_ops == 0); \ _mpcpu->mntp_thread_in_ops = 1; \ atomic_interrupt_fence(); \ if (__predict_false(mp->mnt_vfs_ops > 0)) { \ vfs_op_thread_exit_crit(mp, _mpcpu); \ _retval_crit = false; \ } \ _retval_crit; \ }) #define vfs_op_thread_enter(mp, _mpcpu) ({ \ bool _retval; \ critical_enter(); \ _retval = vfs_op_thread_enter_crit(mp, _mpcpu); \ if (__predict_false(!_retval)) \ critical_exit(); \ _retval; \ }) #define vfs_op_thread_exit_crit(mp, _mpcpu) do { \ MPASS(_mpcpu == vfs_mount_pcpu(mp)); \ MPASS(_mpcpu->mntp_thread_in_ops == 1); \ atomic_interrupt_fence(); \ _mpcpu->mntp_thread_in_ops = 0; \ } while (0) #define vfs_op_thread_exit(mp, _mpcpu) do { \ vfs_op_thread_exit_crit(mp, _mpcpu); \ critical_exit(); \ } while (0) #define vfs_mp_count_add_pcpu(_mpcpu, count, val) do { \ MPASS(_mpcpu->mntp_thread_in_ops == 1); \ _mpcpu->mntp_##count += val; \ } while (0) #define vfs_mp_count_sub_pcpu(_mpcpu, count, val) do { \ MPASS(_mpcpu->mntp_thread_in_ops == 1); \ _mpcpu->mntp_##count -= val; \ } while (0) #else /* !_KERNEL */ #include struct stat; __BEGIN_DECLS int fhlink(struct fhandle *, const char *); int fhlinkat(struct fhandle *, int, const char *); int fhopen(const struct fhandle *, int); int fhreadlink(struct fhandle *, char *, size_t); int fhstat(const struct fhandle *, struct stat *); int fhstatfs(const struct fhandle *, struct statfs *); int fstatfs(int, struct statfs *); int getfh(const char *, fhandle_t *); int getfhat(int, char *, struct fhandle *, int); int getfsstat(struct statfs *, long, int); int getmntinfo(struct statfs **, int); int lgetfh(const char *, fhandle_t *); int mount(const char *, const char *, int, void *); int nmount(struct iovec *, unsigned int, int); int statfs(const char *, struct statfs *); int unmount(const char *, int); /* C library stuff */ int getvfsbyname(const char *, struct xvfsconf *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_MOUNT_H_ */ diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index 7bb532e7d26c..12ab3ca0a5e1 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -1,815 +1,815 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ufs.h" #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #endif #include #include #include static int ffs_indirtrunc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, ufs2_daddr_t, int, ufs2_daddr_t *); static void ffs_inode_bwrite(struct vnode *vp, struct buf *bp, int flags) { if ((flags & IO_SYNC) != 0) bwrite(bp); else if (DOINGASYNC(vp)) bdwrite(bp); else bawrite(bp); } /* * Update the access, modified, and inode change times as specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode * to disk if the IN_MODIFIED flag is set (it may be set initially, or by * the timestamp update). The IN_LAZYMOD flag is set to force a write * later if not now. The IN_LAZYACCESS is set instead of IN_MODIFIED if the fs * is currently being suspended (or is suspended) and vnode has been accessed. * If we write now, then clear IN_MODIFIED, IN_LAZYACCESS and IN_LAZYMOD to * reflect the presumably successful write, and if waitfor is set, then wait * for the write to complete. */ int ffs_update(vp, waitfor) struct vnode *vp; int waitfor; { struct fs *fs; struct buf *bp; struct inode *ip; daddr_t bn; int flags, error; ASSERT_VOP_ELOCKED(vp, "ffs_update"); ufs_itimes(vp); ip = VTOI(vp); if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0) return (0); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); /* * The IN_SIZEMOD and IN_IBLKDATA flags indicate changes to the * file size and block pointer fields in the inode. When these * fields have been changed, the fsync() and fsyncdata() system * calls must write the inode to ensure their semantics that the * file is on stable store. * * The IN_SIZEMOD and IN_IBLKDATA flags cannot be cleared until * a synchronous write of the inode is done. If they are cleared * on an asynchronous write, then the inode may not yet have been * written to the disk when an fsync() or fsyncdata() call is done. * Absent these flags, these calls would not know that they needed * to write the inode. Thus, these flags only can be cleared on * synchronous writes of the inode. Since the inode will be locked * for the duration of the I/O that writes it to disk, no fsync() * or fsyncdata() will be able to run before the on-disk inode * is complete. */ if (waitfor) ip->i_flag &= ~(IN_SIZEMOD | IN_IBLKDATA); fs = ITOFS(ip); if (fs->fs_ronly && ITOUMP(ip)->um_fsckpid == 0) return (0); /* * If we are updating a snapshot and another process is currently * writing the buffer containing the inode for this snapshot then * a deadlock can occur when it tries to check the snapshot to see * if that block needs to be copied. Thus when updating a snapshot * we check to see if the buffer is already locked, and if it is * we drop the snapshot lock until the buffer has been written * and is available to us. We have to grab a reference to the * snapshot vnode to prevent it from being removed while we are * waiting for the buffer. */ flags = 0; if (IS_SNAPSHOT(ip)) flags = GB_LOCK_NOWAIT; loop: bn = fsbtodb(fs, ino_to_fsba(fs, ip->i_number)); error = ffs_breadz(VFSTOUFS(vp->v_mount), ITODEVVP(ip), bn, bn, (int) fs->fs_bsize, NULL, NULL, 0, NOCRED, flags, NULL, &bp); if (error != 0) { if (error != EBUSY) return (error); KASSERT((IS_SNAPSHOT(ip)), ("EBUSY from non-snapshot")); /* * Wait for our inode block to become available. * * Hold a reference to the vnode to protect against * ffs_snapgone(). Since we hold a reference, it can only * get reclaimed (VIRF_DOOMED flag) in a forcible downgrade * or unmount. For an unmount, the entire filesystem will be * gone, so we cannot attempt to touch anything associated * with it while the vnode is unlocked; all we can do is * pause briefly and try again. If when we relock the vnode * we discover that it has been reclaimed, updating it is no * longer necessary and we can just return an error. */ vref(vp); VOP_UNLOCK(vp); pause("ffsupd", 1); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrele(vp); if (VN_IS_DOOMED(vp)) return (ENOENT); goto loop; } if (DOINGSOFTDEP(vp)) softdep_update_inodeblock(ip, bp, waitfor); else if (ip->i_effnlink != ip->i_nlink) panic("ffs_update: bad link cnt"); if (I_IS_UFS1(ip)) { *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; /* * XXX: FIX? The entropy here is desirable, * but the harvesting may be expensive */ random_harvest_queue(&(ip->i_din1), sizeof(ip->i_din1), RANDOM_FS_ATIME); } else { ffs_update_dinode_ckhash(fs, ip->i_din2); *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; /* * XXX: FIX? The entropy here is desirable, * but the harvesting may be expensive */ random_harvest_queue(&(ip->i_din2), sizeof(ip->i_din2), RANDOM_FS_ATIME); } if (waitfor) { error = bwrite(bp); if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), error)) error = 0; } else if (vm_page_count_severe() || buf_dirty_count_severe()) { bawrite(bp); error = 0; } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); error = 0; } return (error); } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Truncate the inode ip to at most length size, freeing the * disk blocks. */ int ffs_truncate(vp, length, flags, cred) struct vnode *vp; off_t length; int flags; struct ucred *cred; { struct inode *ip; ufs2_daddr_t bn, lbn, lastblock, lastiblock[UFS_NIADDR]; ufs2_daddr_t indir_lbn[UFS_NIADDR], oldblks[UFS_NDADDR + UFS_NIADDR]; ufs2_daddr_t newblks[UFS_NDADDR + UFS_NIADDR]; ufs2_daddr_t count, blocksreleased = 0, datablocks, blkno; struct bufobj *bo; struct fs *fs; struct buf *bp; struct ufsmount *ump; int softdeptrunc, journaltrunc; int needextclean, extblocks; int offset, size, level, nblocks; int i, error, allerror, indiroff, waitforupdate; u_long key; off_t osize; ip = VTOI(vp); ump = VFSTOUFS(vp->v_mount); fs = ump->um_fs; bo = &vp->v_bufobj; ASSERT_VOP_LOCKED(vp, "ffs_truncate"); if (length < 0) return (EINVAL); if (length > fs->fs_maxfilesize) return (EFBIG); #ifdef QUOTA error = getinoquota(ip); if (error) return (error); #endif /* * Historically clients did not have to specify which data * they were truncating. So, if not specified, we assume * traditional behavior, e.g., just the normal data. */ if ((flags & (IO_EXT | IO_NORMAL)) == 0) flags |= IO_NORMAL; if (!DOINGSOFTDEP(vp) && !DOINGASYNC(vp)) flags |= IO_SYNC; waitforupdate = (flags & IO_SYNC) != 0 || !DOINGASYNC(vp); /* * If we are truncating the extended-attributes, and cannot * do it with soft updates, then do it slowly here. If we are * truncating both the extended attributes and the file contents * (e.g., the file is being unlinked), then pick it off with * soft updates below. */ allerror = 0; needextclean = 0; softdeptrunc = 0; journaltrunc = DOINGSUJ(vp); journaltrunc = 0; /* XXX temp patch until bug found */ if (journaltrunc == 0 && DOINGSOFTDEP(vp) && length == 0) softdeptrunc = !softdep_slowdown(vp); extblocks = 0; datablocks = DIP(ip, i_blocks); if (fs->fs_magic == FS_UFS2_MAGIC && ip->i_din2->di_extsize > 0) { extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); datablocks -= extblocks; } if ((flags & IO_EXT) && extblocks > 0) { if (length != 0) panic("ffs_truncate: partial trunc of extdata"); if (softdeptrunc || journaltrunc) { if ((flags & IO_NORMAL) == 0) goto extclean; needextclean = 1; } else { if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); #ifdef QUOTA (void) chkdq(ip, -extblocks, NOCRED, FORCE); #endif vinvalbuf(vp, V_ALT, 0, 0); vn_pages_remove(vp, OFF_TO_IDX(lblktosize(fs, -extblocks)), 0); osize = ip->i_din2->di_extsize; ip->i_din2->di_blocks -= extblocks; ip->i_din2->di_extsize = 0; for (i = 0; i < UFS_NXADDR; i++) { oldblks[i] = ip->i_din2->di_extb[i]; ip->i_din2->di_extb[i] = 0; } UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); if ((error = ffs_update(vp, waitforupdate))) return (error); for (i = 0; i < UFS_NXADDR; i++) { if (oldblks[i] == 0) continue; ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i], sblksize(fs, osize, i), ip->i_number, vp->v_type, NULL, SINGLETON_KEY); } } } if ((flags & IO_NORMAL) == 0) return (0); - if (vp->v_type == VLNK && ip->i_size < vp->v_mount->mnt_maxsymlinklen) { + if (vp->v_type == VLNK && ip->i_size < ump->um_maxsymlinklen) { #ifdef INVARIANTS if (length != 0) panic("ffs_truncate: partial truncate of symlink"); #endif bzero(SHORTLINK(ip), (u_int)ip->i_size); ip->i_size = 0; DIP_SET(ip, i_size, 0); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); if (needextclean) goto extclean; return (ffs_update(vp, waitforupdate)); } if (ip->i_size == length) { UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); if (needextclean) goto extclean; return (ffs_update(vp, 0)); } if (fs->fs_ronly) panic("ffs_truncate: read-only filesystem"); if (IS_SNAPSHOT(ip)) ffs_snapremove(vp); vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; osize = ip->i_size; /* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1. */ if (osize < length) { vnode_pager_setsize(vp, length); flags |= BA_CLRBUF; error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); if (error) { vnode_pager_setsize(vp, osize); return (error); } ip->i_size = length; DIP_SET(ip, i_size, length); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; ffs_inode_bwrite(vp, bp, flags); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); return (ffs_update(vp, waitforupdate)); } /* * Lookup block number for a given offset. Zero length files * have no blocks, so return a blkno of -1. */ lbn = lblkno(fs, length - 1); if (length == 0) { blkno = -1; } else if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, cred, BA_METAONLY, &bp); if (error) return (error); indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; else blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; /* * If the block number is non-zero, then the indirect block * must have been previously allocated and need not be written. * If the block number is zero, then we may have allocated * the indirect block and hence need to write it out. */ if (blkno != 0) brelse(bp); else if (flags & IO_SYNC) bwrite(bp); else bdwrite(bp); } /* * If the block number at the new end of the file is zero, * then we must allocate it to ensure that the last block of * the file is allocated. Soft updates does not handle this * case, so here we have to clean up the soft updates data * structures describing the allocation past the truncation * point. Finding and deallocating those structures is a lot of * work. Since partial truncation with a hole at the end occurs * rarely, we solve the problem by syncing the file so that it * will have no soft updates data structures left. */ if (blkno == 0 && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); if (blkno != 0 && DOINGSOFTDEP(vp)) { if (softdeptrunc == 0 && journaltrunc == 0) { /* * If soft updates cannot handle this truncation, * clean up soft dependency data structures and * fall through to the synchronous truncation. */ if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); } else { flags = IO_NORMAL | (needextclean ? IO_EXT: 0); if (journaltrunc) softdep_journal_freeblocks(ip, cred, length, flags); else softdep_setup_freeblocks(ip, length, flags); ASSERT_VOP_LOCKED(vp, "ffs_truncate1"); if (journaltrunc == 0) { UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); error = ffs_update(vp, 0); } return (error); } } /* * Shorten the size of the file. If the last block of the * shortened file is unallocated, we must allocate it. * Additionally, if the file is not being truncated to a * block boundary, the contents of the partial block * following the end of the file must be zero'ed in * case it ever becomes accessible again because of * subsequent file growth. Directories however are not * zero'ed as they should grow back initialized to empty. */ offset = blkoff(fs, length); if (blkno != 0 && offset == 0) { ip->i_size = length; DIP_SET(ip, i_size, length); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); #ifdef UFS_DIRHASH if (vp->v_type == VDIR && ip->i_dirhash != NULL) ufsdirhash_dirtrunc(ip, length); #endif } else { lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); if (error) return (error); ffs_inode_bwrite(vp, bp, flags); /* * When we are doing soft updates and the UFS_BALLOC * above fills in a direct block hole with a full sized * block that will be truncated down to a fragment below, * we must flush out the block dependency with an FSYNC * so that we do not get a soft updates inconsistency * when we create the fragment below. */ if (DOINGSOFTDEP(vp) && lbn < UFS_NDADDR && fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); if (error) return (error); ip->i_size = length; DIP_SET(ip, i_size, length); #ifdef UFS_DIRHASH if (vp->v_type == VDIR && ip->i_dirhash != NULL) ufsdirhash_dirtrunc(ip, length); #endif size = blksize(fs, ip, lbn); if (vp->v_type != VDIR && offset != 0) bzero((char *)bp->b_data + offset, (u_int)(size - offset)); /* Kirk's code has reallocbuf(bp, size, 1) here */ allocbuf(bp, size); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; ffs_inode_bwrite(vp, bp, flags); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; lastiblock[SINGLE] = lastblock - UFS_NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->fs_bsize); /* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ffs_indirtrunc below. */ for (level = TRIPLE; level >= SINGLE; level--) { oldblks[UFS_NDADDR + level] = DIP(ip, i_ib[level]); if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); lastiblock[level] = -1; } } for (i = 0; i < UFS_NDADDR; i++) { oldblks[i] = DIP(ip, i_db[i]); if (i > lastblock) DIP_SET(ip, i_db[i], 0); } UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); allerror = ffs_update(vp, waitforupdate); /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ for (i = 0; i < UFS_NDADDR; i++) { newblks[i] = DIP(ip, i_db[i]); DIP_SET(ip, i_db[i], oldblks[i]); } for (i = 0; i < UFS_NIADDR; i++) { newblks[UFS_NDADDR + i] = DIP(ip, i_ib[i]); DIP_SET(ip, i_ib[i], oldblks[UFS_NDADDR + i]); } ip->i_size = osize; DIP_SET(ip, i_size, osize); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); error = vtruncbuf(vp, length, fs->fs_bsize); if (error && (allerror == 0)) allerror = error; /* * Indirect blocks first. */ indir_lbn[SINGLE] = -UFS_NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = DIP(ip, i_ib[level]); if (bn != 0) { error = ffs_indirtrunc(ip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); ffs_blkfree(ump, fs, ump->um_devvp, bn, fs->fs_bsize, ip->i_number, vp->v_type, NULL, SINGLETON_KEY); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ key = ffs_blkrelease_start(ump, ump->um_devvp, ip->i_number); for (i = UFS_NDADDR - 1; i > lastblock; i--) { long bsize; bn = DIP(ip, i_db[i]); if (bn == 0) continue; DIP_SET(ip, i_db[i], 0); bsize = blksize(fs, ip, i); ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number, vp->v_type, NULL, key); blocksreleased += btodb(bsize); } ffs_blkrelease_finish(ump, key); if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = DIP(ip, i_db[lastblock]); if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, ip, lastblock); ip->i_size = length; DIP_SET(ip, i_size, length); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); newspace = blksize(fs, ip, lastblock); if (newspace == 0) panic("ffs_truncate: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. */ bn += numfrags(fs, newspace); ffs_blkfree(ump, fs, ump->um_devvp, bn, oldspace - newspace, ip->i_number, vp->v_type, NULL, SINGLETON_KEY); blocksreleased += btodb(oldspace - newspace); } } done: #ifdef INVARIANTS for (level = SINGLE; level <= TRIPLE; level++) if (newblks[UFS_NDADDR + level] != DIP(ip, i_ib[level])) panic("ffs_truncate1: level %d newblks %jd != i_ib %jd", level, (intmax_t)newblks[UFS_NDADDR + level], (intmax_t)DIP(ip, i_ib[level])); for (i = 0; i < UFS_NDADDR; i++) if (newblks[i] != DIP(ip, i_db[i])) panic("ffs_truncate2: blkno %d newblks %jd != i_db %jd", i, (intmax_t)newblks[UFS_NDADDR + level], (intmax_t)DIP(ip, i_ib[level])); BO_LOCK(bo); if (length == 0 && (fs->fs_magic != FS_UFS2_MAGIC || ip->i_din2->di_extsize == 0) && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("ffs_truncate3: vp = %p, buffers: dirty = %d, clean = %d", vp, bo->bo_dirty.bv_cnt, bo->bo_clean.bv_cnt); BO_UNLOCK(bo); #endif /* INVARIANTS */ /* * Put back the real size. */ ip->i_size = length; DIP_SET(ip, i_size, length); if (DIP(ip, i_blocks) >= blocksreleased) DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - blocksreleased); else /* sanity */ DIP_SET(ip, i_blocks, 0); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); #ifdef QUOTA (void) chkdq(ip, -blocksreleased, NOCRED, FORCE); #endif return (allerror); extclean: if (journaltrunc) softdep_journal_freeblocks(ip, cred, length, IO_EXT); else softdep_setup_freeblocks(ip, length, IO_EXT); return (ffs_update(vp, waitforupdate)); } /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. */ static int ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) struct inode *ip; ufs2_daddr_t lbn, lastbn; ufs2_daddr_t dbn; int level; ufs2_daddr_t *countp; { struct buf *bp; struct fs *fs; struct ufsmount *ump; struct vnode *vp; caddr_t copy = NULL; u_long key; int i, nblocks, error = 0, allerror = 0; ufs2_daddr_t nb, nlbn, last; ufs2_daddr_t blkcount, factor, blocksreleased = 0; ufs1_daddr_t *bap1 = NULL; ufs2_daddr_t *bap2 = NULL; #define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i]) fs = ITOFS(ip); ump = ITOUMP(ip); /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = lbn_offset(fs, level); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->fs_bsize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to VOP_BMAP() on these blocks will fail. However, we already * have the on-disk address, so we just pass it to bread() instead * of having bread() attempt to calculate it using VOP_BMAP(). */ vp = ITOV(ip); error = ffs_breadz(ump, vp, lbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); if (error) { *countp = 0; return (error); } if (I_IS_UFS1(ip)) bap1 = (ufs1_daddr_t *)bp->b_data; else bap2 = (ufs2_daddr_t *)bp->b_data; if (lastbn != -1) { copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK); bcopy((caddr_t)bp->b_data, copy, (u_int)fs->fs_bsize); for (i = last + 1; i < NINDIR(fs); i++) if (I_IS_UFS1(ip)) bap1[i] = 0; else bap2[i] = 0; if (DOINGASYNC(vp)) { bdwrite(bp); } else { error = bwrite(bp); if (error) allerror = error; } if (I_IS_UFS1(ip)) bap1 = (ufs1_daddr_t *)copy; else bap2 = (ufs2_daddr_t *)copy; } /* * Recursively free totally unused blocks. */ key = ffs_blkrelease_start(ump, ITODEVVP(ip), ip->i_number); for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = BAP(ip, i); if (nb == 0) continue; if (level > SINGLE) { if ((error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), (ufs2_daddr_t)-1, level - 1, &blkcount)) != 0) allerror = error; blocksreleased += blkcount; } ffs_blkfree(ump, fs, ITODEVVP(ip), nb, fs->fs_bsize, ip->i_number, vp->v_type, NULL, key); blocksreleased += nblocks; } ffs_blkrelease_finish(ump, key); /* * Recursively free last partial block. */ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = BAP(ip, i); if (nb != 0) { error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), last, level - 1, &blkcount); if (error) allerror = error; blocksreleased += blkcount; } } if (copy != NULL) { free(copy, M_TEMP); } else { bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } *countp = blocksreleased; return (allerror); } int ffs_rdonly(struct inode *ip) { return (ITOFS(ip)->fs_ronly != 0); } diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 540dd02c9631..c8073874a85f 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1,2724 +1,2724 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; VFS_SMR_DECLARE; static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static int ffs_sync_lazy(struct mount *mp); static int ffs_use_bread(void *devfd, off_t loc, void **bufp, int size); static int ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = vfs_cache_root, .vfs_cachedroot = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_susp_clean = process_deferred_inactive, }; VFS_SET(ufs_vfsops, ufs, 0); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; /* * Note that userquota and groupquota options are not currently used * by UFS/FFS code and generally mount(8) does not pass those options * from userland, but they can be passed by loader(8) via * vfs.root.mountfrom.options. */ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "groupquota", "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir", "nosymfollow", "sync", "union", "userquota", "untrusted", NULL }; static int ffs_enxio_enable = 1; SYSCTL_DECL(_vfs_ffs); SYSCTL_INT(_vfs_ffs, OID_AUTO, enxio_enable, CTLFLAG_RWTUN, &ffs_enxio_enable, 0, "enable mapping of other disk I/O errors to ENXIO"); /* * Return buffer with the contents of block "offset" from the beginning of * directory "ip". If "res" is non-zero, fill it in with a pointer to the * remaining space in the directory. */ static int ffs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp) { struct inode *ip; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; int bsize, error; ip = VTOI(vp); fs = ITOFS(ip); lbn = lblkno(fs, offset); bsize = blksize(fs, ip, lbn); *bpp = NULL; error = bread(vp, lbn, bsize, NOCRED, &bp); if (error) { return (error); } if (res) *res = (char *)bp->b_data + blkoff(fs, offset); *bpp = bp; return (0); } /* * Load up the contents of an inode and copy the appropriate pieces * to the incore copy. */ static int ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino) { struct ufs1_dinode *dip1; struct ufs2_dinode *dip2; int error; if (I_IS_UFS1(ip)) { dip1 = ip->i_din1; *dip1 = *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); ip->i_mode = dip1->di_mode; ip->i_nlink = dip1->di_nlink; ip->i_effnlink = dip1->di_nlink; ip->i_size = dip1->di_size; ip->i_flags = dip1->di_flags; ip->i_gen = dip1->di_gen; ip->i_uid = dip1->di_uid; ip->i_gid = dip1->di_gid; return (0); } dip2 = ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); if ((error = ffs_verify_dinode_ckhash(fs, dip2)) != 0 && !ffs_fsfail_cleanup(ITOUMP(ip), error)) { printf("%s: inode %jd: check-hash failed\n", fs->fs_fsmnt, (intmax_t)ino); return (error); } *ip->i_din2 = *dip2; dip2 = ip->i_din2; ip->i_mode = dip2->di_mode; ip->i_nlink = dip2->di_nlink; ip->i_effnlink = dip2->di_nlink; ip->i_size = dip2->di_size; ip->i_flags = dip2->di_flags; ip->i_gen = dip2->di_gen; ip->i_uid = dip2->di_uid; ip->i_gid = dip2->di_gid; return (0); } /* * Verify that a filesystem block number is a valid data block. * This routine is only called on untrusted filesystems. */ static int ffs_check_blkno(struct mount *mp, ino_t inum, ufs2_daddr_t daddr, int blksize) { struct fs *fs; struct ufsmount *ump; ufs2_daddr_t end_daddr; int cg, havemtx; KASSERT((mp->mnt_flag & MNT_UNTRUSTED) != 0, ("ffs_check_blkno called on a trusted file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; cg = dtog(fs, daddr); end_daddr = daddr + numfrags(fs, blksize); /* * Verify that the block number is a valid data block. Also check * that it does not point to an inode block or a superblock. Accept * blocks that are unalloacted (0) or part of snapshot metadata * (BLK_NOCOPY or BLK_SNAP). * * Thus, the block must be in a valid range for the filesystem and * either in the space before a backup superblock (except the first * cylinder group where that space is used by the bootstrap code) or * after the inode blocks and before the end of the cylinder group. */ if ((uint64_t)daddr <= BLK_SNAP || ((uint64_t)end_daddr <= fs->fs_size && ((cg > 0 && end_daddr <= cgsblock(fs, cg)) || (daddr >= cgdmin(fs, cg) && end_daddr <= cgbase(fs, cg) + fs->fs_fpg)))) return (0); if ((havemtx = mtx_owned(UFS_MTX(ump))) == 0) UFS_LOCK(ump); if (ppsratecheck(&ump->um_last_integritymsg, &ump->um_secs_integritymsg, 1)) { UFS_UNLOCK(ump); uprintf("\n%s: inode %jd, out-of-range indirect block " "number %jd\n", mp->mnt_stat.f_mntonname, inum, daddr); if (havemtx) UFS_LOCK(ump); } else if (!havemtx) UFS_UNLOCK(ump); return (EINTEGRITY); } /* * Initiate a forcible unmount. * Used to unmount filesystems whose underlying media has gone away. */ static void ffs_fsfail_unmount(void *v, int pending) { struct fsfail_task *etp; struct mount *mp; etp = v; /* * Find our mount and get a ref on it, then try to unmount. */ mp = vfs_getvfs(&etp->fsid); if (mp != NULL) dounmount(mp, MNT_FORCE, curthread); free(etp, M_UFSMNT); } /* * On first ENXIO error, start a task that forcibly unmounts the filesystem. * * Return true if a cleanup is in progress. */ int ffs_fsfail_cleanup(struct ufsmount *ump, int error) { int retval; UFS_LOCK(ump); retval = ffs_fsfail_cleanup_locked(ump, error); UFS_UNLOCK(ump); return (retval); } int ffs_fsfail_cleanup_locked(struct ufsmount *ump, int error) { struct fsfail_task *etp; struct task *tp; mtx_assert(UFS_MTX(ump), MA_OWNED); if (error == ENXIO && (ump->um_flags & UM_FSFAIL_CLEANUP) == 0) { ump->um_flags |= UM_FSFAIL_CLEANUP; /* * Queue an async forced unmount. */ etp = ump->um_fsfail_task; ump->um_fsfail_task = NULL; if (etp != NULL) { tp = &etp->task; TASK_INIT(tp, 0, ffs_fsfail_unmount, etp); taskqueue_enqueue(taskqueue_thread, tp); printf("UFS: forcibly unmounting %s from %s\n", ump->um_mountp->mnt_stat.f_mntfromname, ump->um_mountp->mnt_stat.f_mntonname); } } return ((ump->um_flags & UM_FSFAIL_CLEANUP) != 0); } /* * Wrapper used during ENXIO cleanup to allocate empty buffers when * the kernel is unable to read the real one. They are needed so that * the soft updates code can use them to unwind its dependencies. */ int ffs_breadz(struct ufsmount *ump, struct vnode *vp, daddr_t lblkno, daddr_t dblkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { int error; flags |= GB_CVTENXIO; error = breadn_flags(vp, lblkno, dblkno, size, rablkno, rabsize, cnt, cred, flags, ckhashfunc, bpp); if (error != 0 && ffs_fsfail_cleanup(ump, error)) { error = getblkx(vp, lblkno, dblkno, size, 0, 0, flags, bpp); KASSERT(error == 0, ("getblkx failed")); vfs_bio_bzero_buf(*bpp, 0, size); } return (error); } static int ffs_mount(struct mount *mp) { struct vnode *devvp, *odevvp; struct thread *td; struct ufsmount *ump = NULL; struct fs *fs; pid_t fsckpid = 0; int error, error1, flags; uint64_t mntorflags, saved_mnt_flag; accmode_t accmode; struct nameidata ndp; char *fspec; td = curthread; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); VFS_SMR_ZONE_SET(uma_inode); } vfs_deleteopt(mp->mnt_optnew, "groupquota"); vfs_deleteopt(mp->mnt_optnew, "userquota"); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; if (vfs_getopt(mp->mnt_optnew, "untrusted", NULL, NULL) == 0) mntorflags |= MNT_UNTRUSTED; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) { mntorflags |= MNT_SNAPSHOT; /* * Once we have set the MNT_SNAPSHOT flag, do not * persist "snapshot" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "snapshot"); vfs_deleteopt(mp->mnt_opt, "snapshot"); } if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 && vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) { /* * Once we have set the restricted PID, do not * persist "fsckpid" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "fsckpid"); vfs_deleteopt(mp->mnt_opt, "fsckpid"); if (mp->mnt_flag & MNT_UPDATE) { if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } } else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } /* Set to -1 if we are done */ if (fsckpid == 0) fsckpid = -1; } if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { if (mntorflags & MNT_ACLS) { vfs_mount_error(mp, "\"acls\" and \"nfsv4acls\" options " "are mutually exclusive"); return (EINVAL); } mntorflags |= MNT_NFS4ACLS; } MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_FPLOOKUP; mp->mnt_flag |= mntorflags; MNT_IUNLOCK(mp); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; odevvp = ump->um_odevvp; devvp = ump->um_devvp; if (fsckpid == -1 && ump->um_fsckpid > 0) { if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) return (error); g_topology_lock(); /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); ump->um_fsckpid = 0; } if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * Flush any dirty data and suspend filesystem. */ if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (MOUNTEDSOFTDEP(mp)) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vfs_write_resume(mp, 0); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s Update error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vfs_write_resume(mp, 0); return (error); } if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); g_topology_lock(); /* * Drop our write and exclusive access. */ g_access(ump->um_cp, 0, -1, -1); g_topology_unlock(); fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); /* * Allow the writers to note that filesystem * is ro now. */ vfs_write_resume(mp, 0); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td, 0)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If we are running a checker, do not allow upgrade. */ if (ump->um_fsckpid > 0) { vfs_mount_error(mp, "Active checker, cannot upgrade to write"); return (EINVAL); } /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ vn_lock(odevvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(odevvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); VOP_UNLOCK(odevvp); if (error) { return (error); } fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly " "dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s.%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate" " journal contents"); return (EPERM); } } g_topology_lock(); /* * Request exclusive write access. */ error = g_access(ump->um_cp, 0, 1, 1); g_topology_unlock(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); saved_mnt_flag = MNT_RDONLY; if (MOUNTEDSOFTDEP(mp) && (mp->mnt_flag & MNT_ASYNC) != 0) saved_mnt_flag |= MNT_ASYNC; mp->mnt_flag &= ~saved_mnt_flag; MNT_IUNLOCK(mp); fs->fs_mtime = time_second; /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= saved_mnt_flag; MNT_IUNLOCK(mp); vfs_write_resume(mp, 0); return (error); } fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= saved_mnt_flag; MNT_IUNLOCK(mp); vfs_write_resume(mp, 0); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vfs_write_resume(mp, 0); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (MOUNTEDSOFTDEP(mp)) { /* XXX: Reset too late ? */ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_ASYNC; MNT_IUNLOCK(mp); } /* * Keep MNT_ACLS flag if it is stored in superblock. */ if ((fs->fs_flags & FS_ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } /* * If this is a request from fsck to clean up the filesystem, * then allow the specified pid to proceed. */ if (fsckpid > 0) { if (ump->um_fsckpid != 0) { vfs_mount_error(mp, "Active checker already running on %s", fs->fs_fsmnt); return (EINVAL); } KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) { vfs_mount_error(mp, "Checker activation failed on %s", fs->fs_fsmnt); return (error); } ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, fspec)); /* * Must not call namei() while owning busy ref. */ vfs_unbusy(mp); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); error = namei(&ndp); if ((mp->mnt_flag & MNT_UPDATE) != 0) { /* * Unmount does not start if MNT_UPDATE is set. Mount * update busies mp before setting MNT_UPDATE. We * must be able to retain our busy ref succesfully, * without sleep. */ error1 = vfs_busy(mp, MBF_NOWAIT); MPASS(error1 == 0); } if (error != 0) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk_error(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount_alloc() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } if (fsckpid > 0) { KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) { printf("WARNING: %s: Checker activation " "failed\n", fs->fs_fsmnt); } else { ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } } } MNT_ILOCK(mp); /* * This is racy versus lookup, see ufs_fplookup_vexec for details. */ if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) != 0) panic("MNTK_FPLOOKUP set on mount %p when it should not be", mp); if ((mp->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS | MNT_UNION)) == 0) mp->mnt_kern_flag |= MNTK_FPLOOKUP; MNT_IUNLOCK(mp); vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct ufs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof(args.export)); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). If the 'force' flag * is 0, the filesystem must be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags, allowing secondary * writers, if requested. * 6) invalidate all cached file data. * 7) re-read inode data for all active vnodes. */ int ffs_reload(struct mount *mp, struct thread *td, int flags) { struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, error; u_long size; int32_t *lp; ump = VFSTOUFS(mp); MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) { MNT_IUNLOCK(mp); return (EINVAL); } MNT_IUNLOCK(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp); /* * Step 2: re-read superblock from disk. */ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Preserve the summary information, read-only status, and * superblock location by copying these fields into our new * superblock before using it to update the existing superblock. */ newfs->fs_si = fs->fs_si; newfs->fs_ronly = fs->fs_ronly; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); - mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; + ump->um_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: reload pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); free(fs->fs_csp, M_UFSMNT); space = malloc(size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); if ((flags & FFSR_UNSUSPEND) != 0) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Skip syncer vnode. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { vput(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } if ((error = ffs_load_inode(bp, ip, fs, ip->i_number)) != 0) { brelse(bp); vput(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } ip->i_effnlink = ip->i_nlink; brelse(bp); vput(vp); } return (0); } /* * Common code for mount and mountroot */ static int ffs_mountfs(odevvp, mp, td) struct vnode *odevvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct fs *fs; struct cdev *dev; int error, i, len, ronly; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; struct vnode *devvp; struct fsfail_task *etp; int candelete, canspeedup; off_t loc; fs = NULL; ump = NULL; cred = td ? td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; devvp = mntfs_allocvp(mp, odevvp); VOP_UNLOCK(odevvp); KASSERT(devvp->v_type == VCHR, ("reclaimed devvp")); dev = devvp->v_rdev; KASSERT(dev->si_snapdata == NULL, ("non-NULL snapshot data")); if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { mntfs_freevp(devvp); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); mntfs_freevp(devvp); return (error); } dev_ref(dev); devvp->v_bufobj.bo_ops = &ffs_ops; BO_LOCK(&odevvp->v_bufobj); odevvp->v_bufobj.bo_flag |= BO_NOBUFS; BO_UNLOCK(&odevvp->v_bufobj); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > maxphys) mp->mnt_iosize_max = maxphys; if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } /* fetch the superblock and summary information */ loc = STDSB; if ((mp->mnt_flag & MNT_ROOTFS) != 0) loc = STDSB_NOHASHFAIL; if ((error = ffs_sbget(devvp, &fs, loc, M_UFSMNT, ffs_use_bread)) != 0) goto out; fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck.", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate journal contents"); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("WARNING: %s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: mount pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. */ len = 1024; mp->mnt_gjprovider = malloc((u_long)len, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &len, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { printf("WARNING: %s: GJOURNAL flag on fs " "but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf("WARNING: %s: GJOURNAL flag on fs but no " "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = fs; if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; ump->um_rdonly = ffs_rdonly; ump->um_snapgone = ffs_snapgone; if ((mp->mnt_flag & MNT_UNTRUSTED) != 0) ump->um_check_blkno = ffs_check_blkno; else ump->um_check_blkno = NULL; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); ffs_oldfscompat_read(fs, ump, fs->fs_sblockloc); fs->fs_ronly = ronly; fs->fs_active = NULL; mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } - mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; + ump->um_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf("WARNING: %s: multilabel flag on fs but " "no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_NFS4ACLS) printf("WARNING: %s: ACLs flag on fs conflicts with " "\"nfsv4acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_NFS4ACLS; mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_ACLS) printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts " "with \"acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_ACLS; mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: NFSv4 ACLs flag on fs but no " "ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_TRIM) != 0) { len = sizeof(int); if (g_io_getattr("GEOM::candelete", cp, &len, &candelete) == 0) { if (candelete) ump->um_flags |= UM_CANDELETE; else printf("WARNING: %s: TRIM flag on fs but disk " "does not support TRIM\n", mp->mnt_stat.f_mntonname); } else { printf("WARNING: %s: TRIM flag on fs but disk does " "not confirm that it supports TRIM\n", mp->mnt_stat.f_mntonname); } if (((ump->um_flags) & UM_CANDELETE) != 0) { ump->um_trim_tq = taskqueue_create("trim", M_WAITOK, taskqueue_thread_enqueue, &ump->um_trim_tq); taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS, "%s trim", mp->mnt_stat.f_mntonname); ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM, &ump->um_trimlisthashsize); } } len = sizeof(int); if (g_io_getattr("GEOM::canspeedup", cp, &len, &canspeedup) == 0) { if (canspeedup) ump->um_flags |= UM_CANSPEEDUP; } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_odevvp = odevvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); mp->mnt_stat.f_iosize = fs->fs_bsize; if (mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { ffs_flushfiles(mp, FORCECLOSE, td); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem state information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ etp = malloc(sizeof *ump->um_fsfail_task, M_UFSMNT, M_WAITOK | M_ZERO); etp->fsid = mp->mnt_stat.f_fsid; ump->um_fsfail_task = etp; return (0); out: if (fs != NULL) { free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); } if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (ump) { mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(ump, M_UFSMNT); mp->mnt_data = NULL; } BO_LOCK(&odevvp->v_bufobj); odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; BO_UNLOCK(&odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); mntfs_freevp(devvp); dev_rel(dev); return (error); } /* * A read function for use by filesystem-layer routines. */ static int ffs_use_bread(void *devfd, off_t loc, void **bufp, int size) { struct buf *bp; int error; KASSERT(*bufp == NULL, ("ffs_use_bread: non-NULL *bufp %p\n", *bufp)); *bufp = malloc(size, M_UFSMNT, M_WAITOK); if ((error = bread((struct vnode *)devfd, btodb(loc), size, NOCRED, &bp)) != 0) return (error); bcopy(bp->b_data, *bufp, size); bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (0); } static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags) struct mount *mp; int mntflags; { struct thread *td; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags, susp; #ifdef UFS_EXTATTR int e_restart; #endif flags = 0; td = curthread; fs = ump->um_fs; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; susp = fs->fs_ronly == 0; #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("WARNING: unmount %s: ufs_extattr_stop " "returned errno %d\n", mp->mnt_stat.f_mntonname, error); e_restart = 0; } else { ufs_extattr_uepm_destroy(&ump->um_extattr); e_restart = 1; } #endif if (susp) { error = vfs_write_suspend_umnt(mp); if (error != 0) goto fail1; } if (MOUNTEDSOFTDEP(mp)) error = softdep_flushfiles(mp, flags, td); else error = ffs_flushfiles(mp, flags, td); if (error != 0 && !ffs_fsfail_cleanup(ump, error)) goto fail; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: unmount %s: pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (ffs_fsfail_cleanup(ump, error)) error = 0; if (error != 0 && !ffs_fsfail_cleanup(ump, error)) { fs->fs_clean = 0; goto fail; } } if (susp) vfs_write_resume(mp, VR_START_WRITE); if (ump->um_trim_tq != NULL) { while (ump->um_trim_inflight != 0) pause("ufsutr", hz); taskqueue_drain_all(ump->um_trim_tq); taskqueue_free(ump->um_trim_tq); free (ump->um_trimhash, M_TRIM); } g_topology_lock(); if (ump->um_fsckpid > 0) { /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); ump->um_fsckpid = 0; } g_vfs_close(ump->um_cp); g_topology_unlock(); BO_LOCK(&ump->um_odevvp->v_bufobj); ump->um_odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; BO_UNLOCK(&ump->um_odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0); mntfs_freevp(ump->um_devvp); vrele(ump->um_odevvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); if (ump->um_fsfail_task != NULL) free(ump->um_fsfail_task, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); if (td->td_su == mp) { td->td_su = NULL; vfs_rel(mp); } return (error); fail: if (susp) vfs_write_resume(mp, VR_START_WRITE); fail1: #ifdef UFS_EXTATTR if (e_restart) { ufs_extattr_uepm_init(&ump->um_extattr); #ifdef UFS_EXTATTR_AUTOSTART (void) ufs_extattr_autostart(mp, td); #endif } #endif return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int qerror, error; ump = VFSTOUFS(mp); qerror = 0; #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { error = quotaoff(td, mp, i); if (error != 0) { if ((flags & EARLYFLUSH) == 0) return (error); else qerror = error; } } /* * Here we fall through to vflush again to ensure that * we have gotten rid of all the system vnodes, unless * quotas must not be closed. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Do not close system files if quotas were not closed, to be * able to sync the remaining dquots. The freeblks softupdate * workitems might hold a reference on a dquot, preventing * quotaoff() from completing. Next round of * softdep_flushworklist() iteration should process the * blockers, allowing the next run of quotaoff() to finally * flush held dquots. * * Otherwise, flush all the files. */ if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp); return (error); } /* * Get filesystem statistics. */ static int ffs_statfs(mp, sbp) struct mount *mp; struct statfs *sbp; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - UFS_ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = UFS_MAXNAMLEN; return (0); } static bool sync_doupdate(struct inode *ip) { return ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) != 0); } static int ffs_sync_lazy_filter(struct vnode *vp, void *arg __unused) { struct inode *ip; /* * Flags are safe to access because ->v_data invalidation * is held off by listmtx. */ if (vp->v_type == VNON) return (false); ip = VTOI(vp); if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) return (false); return (true); } /* * For a lazy sync, we only care about access times, quotas and the * superblock. Other filesystem changes are already converted to * cylinder group blocks or inode blocks updates and are written to * disk by syncer. */ static int ffs_sync_lazy(mp) struct mount *mp; { struct vnode *mvp, *vp; struct inode *ip; struct thread *td; int allerror, error; allerror = 0; td = curthread; if ((mp->mnt_flag & MNT_NOATIME) != 0) { #ifdef QUOTA qsync(mp); #endif goto sbupdate; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, ffs_sync_lazy_filter, NULL) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); /* * The IN_ACCESS flag is converted to IN_MODIFIED by * ufs_close() and ufs_getattr() by the calls to * ufs_itimes_locked(), without subsequent UFS_UPDATE(). * Test also all the other timestamp flags too, to pick up * any other cases that could be missed. */ if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK)) != 0) continue; #ifdef QUOTA qsyncvp(vp); #endif if (sync_doupdate(ip)) error = ffs_update(vp, 0); if (error != 0) allerror = error; vput(vp); } sbupdate: if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 && (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0) allerror = error; return (allerror); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked busy using * vfs_busy(). */ static int ffs_sync(mp, waitfor) struct mount *mp; int waitfor; { struct vnode *mvp, *vp, *devvp; struct thread *td; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; suspend = 0; suspended = 0; td = curthread; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0) panic("%s: ffs_sync: modification on read-only filesystem", fs->fs_fsmnt); if (waitfor == MNT_LAZY) { if (!rebooting) return (ffs_sync_lazy(mp)); waitfor = MNT_NOWAIT; } /* * Write back each (modified) inode. */ lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) lockreq = LK_EXCLUSIVE; lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; loop: /* Grab snapshot of secondary write counts */ MNT_ILOCK(mp); secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; MNT_IUNLOCK(mp); /* Grab snapshot of softdep dependency counts */ softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Depend on the vnode interlock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, lockreq)) != 0) { if (error == ENOENT || error == ENOLCK) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } #ifdef QUOTA qsyncvp(vp); #endif for (;;) { error = ffs_syncvnode(vp, waitfor, 0); if (error == ERELOOKUP) continue; if (error != 0) allerror = error; break; } vput(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT || rebooting) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) goto loop; } devvp = ump->um_devvp; bo = &devvp->v_bufobj; BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, waitfor, td); VOP_UNLOCK(devvp); if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN)) error = ffs_sbupdate(ump, waitfor, 0); if (error != 0) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; if (allerror == 0 && waitfor == MNT_WAIT) goto loop; } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) { MNT_IUNLOCK(mp); goto loop; /* More work needed */ } mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else BO_UNLOCK(bo); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (ffs_vgetf(mp, ino, flags, vpp, 0)); } int ffs_vgetf(mp, ino, flags, vpp, ffs_flags) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; int ffs_flags; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; daddr_t dbn; int error; MPASS((ffs_flags & (FFSV_REPLACE | FFSV_REPLACE_DOOMED)) == 0 || (flags & LK_EXCLUSIVE) != 0); error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error != 0) return (error); if (*vpp != NULL) { if ((ffs_flags & FFSV_REPLACE) == 0 || ((ffs_flags & FFSV_REPLACE_DOOMED) == 0 || !VN_IS_DOOMED(*vpp))) return (0); vgone(*vpp); vput(*vpp); } /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); fs = ump->um_fs; ip = uma_zalloc_smr(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ? &ffs_vnodeops1 : &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree_smr(uma_inode, ip); return (error); } /* * FFS supports recursive locking. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); VN_LOCK_AREC(vp); vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_number = ino; ip->i_ea_refs = 0; ip->i_nextclustercg = -1; ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; ip->i_mode = 0; /* ensure error cases below throw away vnode */ #ifdef DIAGNOSTIC ufs_init_trackers(ip); #endif #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif if (ffs_flags & FFSV_FORCEINSMQ) vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) { uma_zfree_smr(uma_inode, ip); *vpp = NULL; return (error); } vp->v_vflag &= ~VV_FORCEINSMQ; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error != 0) return (error); if (*vpp != NULL) { /* * Calls from ffs_valloc() (i.e. FFSV_REPLACE set) * operate on empty inode, which must not be found by * other threads until fully filled. Vnode for empty * inode must be not re-inserted on the hash by other * thread, after removal by us at the beginning. */ MPASS((ffs_flags & FFSV_REPLACE) == 0); return (0); } /* Read in the disk contents for the inode, copy into the inode. */ dbn = fsbtodb(fs, ino_to_fsba(fs, ino)); error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); if (error != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ vgone(vp); vput(vp); *vpp = NULL; return (error); } if (I_IS_UFS1(ip)) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); if ((error = ffs_load_inode(bp, ip, fs, ino)) != 0) { bqrelse(bp); vgone(vp); vput(vp); *vpp = NULL; return (error); } if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2, &vp); if (error) { vgone(vp); vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ if (vp->v_type != VFIFO) { /* FFS supports shared locking for all files except fifos. */ VN_LOCK_ASHARE(vp); } /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { while (ip->i_gen == 0) ip->i_gen = arc4random(); if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { UFS_INODE_SET_FLAG(ip, IN_MODIFIED); DIP_SET(ip, i_gen, ip->i_gen); } } #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_vnode_associate_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vgone(vp); vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - for UFS2 check that the inode number is initialized * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { struct ufid *ufhp; ufhp = (struct ufid *)fhp; return (ffs_inotovp(mp, ufhp->ufid_ino, ufhp->ufid_gen, flags, vpp, 0)); } int ffs_inotovp(mp, ino, gen, lflags, vpp, ffs_flags) struct mount *mp; ino_t ino; u_int64_t gen; int lflags; struct vnode **vpp; int ffs_flags; { struct ufsmount *ump; struct vnode *nvp; struct inode *ip; struct fs *fs; struct cg *cgp; struct buf *bp; u_int cg; int error; ump = VFSTOUFS(mp); fs = ump->um_fs; *vpp = NULL; if (ino < UFS_ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); /* * Need to check if inode is initialized because UFS2 does lazy * initialization and nfs_fhtovp can offer arbitrary inode numbers. */ if (fs->fs_magic == FS_UFS2_MAGIC) { cg = ino_to_cg(fs, ino); error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp); if (error != 0) return (error); if (ino >= cg * fs->fs_ipg + cgp->cg_initediblk) { brelse(bp); return (ESTALE); } brelse(bp); } error = ffs_vgetf(mp, ino, lflags, &nvp, ffs_flags); if (error != 0) return (error); ip = VTOI(nvp); if (ip->i_mode == 0 || ip->i_gen != gen || ip->i_effnlink <= 0) { if (ip->i_mode == 0) vgone(nvp); vput(nvp); return (ESTALE); } vnode_create_vobject(nvp, DIP(ip, i_size), curthread); *vpp = nvp; return (0); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { ffs_susp_initialize(); softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); ffs_susp_uninitialize(); taskqueue_drain_all(taskqueue_thread); return (ret); } /* * Structure used to pass information from ffs_sbupdate to its * helper routine ffs_use_bwrite. */ struct devfd { struct ufsmount *ump; struct buf *sbbp; int waitfor; int suspended; int error; }; /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(ump, waitfor, suspended) struct ufsmount *ump; int waitfor; int suspended; { struct fs *fs; struct buf *sbbp; struct devfd devfd; fs = ump->um_fs; if (fs->fs_ronly == 1 && (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * Initialize info needed for write function. */ devfd.ump = ump; devfd.sbbp = sbbp; devfd.waitfor = waitfor; devfd.suspended = suspended; devfd.error = 0; return (ffs_sbput(&devfd, fs, fs->fs_sblockloc, ffs_use_bwrite)); } /* * Write function for use by filesystem-layer routines. */ static int ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size) { struct devfd *devfdp; struct ufsmount *ump; struct buf *bp; struct fs *fs; int error; devfdp = devfd; ump = devfdp->ump; fs = ump->um_fs; /* * Writing the superblock summary information. */ if (loc != fs->fs_sblockloc) { bp = getblk(ump->um_devvp, btodb(loc), size, 0, 0, 0); bcopy(buf, bp->b_data, (u_int)size); if (devfdp->suspended) bp->b_flags |= B_VALIDSUSPWRT; if (devfdp->waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) devfdp->error = error; return (0); } /* * Writing the superblock itself. We need to do special checks for it. */ bp = devfdp->sbbp; if (ffs_fsfail_cleanup(ump, devfdp->error)) devfdp->error = 0; if (devfdp->error != 0) { brelse(bp); return (devfdp->error); } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } if (MOUNTEDSOFTDEP(ump->um_mountp)) softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); fs = (struct fs *)bp->b_data; ffs_oldfscompat_write(fs, ump); fs->fs_si = NULL; /* Recalculate the superblock hash */ fs->fs_ckhash = ffs_calc_sbhash(fs); if (devfdp->suspended) bp->b_flags |= B_VALIDSUSPWRT; if (devfdp->waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) devfdp->error = error; return (devfdp->error); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree_smr(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) != 0) softdep_handle_error(bp); #endif /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* * We should mark the cylinder group buffer origbp as * dirty, to not lose the failed write. */ if ((bp->b_ioflags & BIO_ERROR) != 0) origbp->b_vflags |= BV_BKGRDERR; BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0) buf_complete(bp); #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. Clear B_IOSTARTED in case of error. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~(B_CACHE | B_IOSTARTED); pbrelvp(bp); /* * Prevent brelse() from trying to keep and re-dirtying bp on * errors. It causes b_bufobj dereference in * bdirty()/reassignbuf(), and b_bufobj was cleared in * pbrelvp() above. */ if ((bp->b_ioflags & BIO_ERROR) != 0) bp->b_flags |= B_INVAL; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { struct buf *newbp; struct cg *cgp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!BUF_ISLOCKED(bp)) panic("bufwrite: buffer is not busy???"); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); if (newbp == NULL) goto normal_write; KASSERT(buf_mapped(bp), ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; BO_UNLOCK(bp->b_bufobj); newbp->b_xflags |= (bp->b_xflags & BX_FSPRIV) | BX_BKGRDMARKER; newbp->b_lblkno = bp->b_lblkno; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; pbgetvp(bp->b_vp, newbp); #ifdef SOFTUPDATES /* * Move over the dependencies. If there are rollbacks, * leave the parent buffer dirtied as it will need to * be written again. */ if (LIST_EMPTY(&bp->b_dep) || softdep_move_dependencies(bp, newbp) == 0) bundirty(bp); #else bundirty(bp); #endif /* * Initiate write on the copy, release the original. The * BKGRDINPROG flag prevents it from going away until * the background write completes. We have to recalculate * its check hash in case the buffer gets freed and then * reconstituted from the buffer cache during a later read. */ if ((bp->b_xflags & BX_CYLGRP) != 0) { cgp = (struct cg *)bp->b_data; cgp->cg_ckhash = 0; cgp->cg_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); } bqrelse(bp); bp = newbp; } else /* Mark the buffer clean */ bundirty(bp); /* Let the normal bufwrite do the rest for us */ normal_write: /* * If we are writing a cylinder group, update its time. */ if ((bp->b_xflags & BX_CYLGRP) != 0) { cgp = (struct cg *)bp->b_data; cgp->cg_old_time = cgp->cg_time = time_second; } return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; struct buf *tbp; int error, nocopy; /* * This is the bufobj strategy for the private VCHR vnodes * used by FFS to access the underlying storage device. * We override the default bufobj strategy and thus bypass * VOP_STRATEGY() for these vnodes. */ vp = bo2vnode(bo); KASSERT(bp->b_vp == NULL || bp->b_vp->v_type != VCHR || bp->b_vp->v_rdev == NULL || bp->b_vp->v_rdev->si_mountpt == NULL || VFSTOUFS(bp->b_vp->v_rdev->si_mountpt) == NULL || vp == VFSTOUFS(bp->b_vp->v_rdev->si_mountpt)->um_devvp, ("ffs_geom_strategy() with wrong vp")); if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); nocopy = bp->b_flags & B_NOCOPY; bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY); if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bp->b_flags &= ~B_BARRIER; bufdone(bp); return; } } bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bp->b_flags &= ~B_BARRIER; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif /* * Check for metadata that needs check-hashes and update them. */ switch (bp->b_xflags & BX_FSPRIV) { case BX_CYLGRP: ((struct cg *)bp->b_data)->cg_ckhash = 0; ((struct cg *)bp->b_data)->cg_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); break; case BX_SUPERBLOCK: case BX_INODE: case BX_INDIR: case BX_DIR: printf("Check-hash write is unimplemented!!!\n"); break; case 0: break; default: printf("multiple buffer types 0x%b\n", (u_int)(bp->b_xflags & BX_FSPRIV), PRINT_UFS_BUF_XFLAGS); break; } } if (bp->b_iocmd != BIO_READ && ffs_enxio_enable) bp->b_xflags |= BX_CVTENXIO; g_vfs_strategy(bo, bp); } int ffs_own_mount(const struct mount *mp) { if (mp->mnt_op == &ufs_vfsops) return (1); return (0); } #ifdef DDB #ifdef SOFTUPDATES /* defined in ffs_softdep.c */ extern void db_print_ffs(struct ufsmount *ump); DB_SHOW_COMMAND(ffs, db_show_ffs) { struct mount *mp; struct ufsmount *ump; if (have_addr) { ump = VFSTOUFS((struct mount *)addr); db_print_ffs(ump); return; } TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name)) db_print_ffs(VFSTOUFS(mp)); } } #endif /* SOFTUPDATES */ #endif /* DDB */ diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 4e4e0b78fe4d..e834a6b4f50a 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -1,2097 +1,2097 @@ /*- * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause) * * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ... * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_directio.h" #include "opt_ffs.h" #include "opt_ufs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #endif #include #include #define ALIGNED_TO(ptr, s) \ (((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0) #ifdef DIRECTIO extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); #endif static vop_fdatasync_t ffs_fdatasync; static vop_fsync_t ffs_fsync; static vop_getpages_t ffs_getpages; static vop_getpages_async_t ffs_getpages_async; static vop_lock1_t ffs_lock; #ifdef INVARIANTS static vop_unlock_t ffs_unlock_debug; #endif static vop_read_t ffs_read; static vop_write_t ffs_write; static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred); static vop_strategy_t ffsext_strategy; static vop_closeextattr_t ffs_closeextattr; static vop_deleteextattr_t ffs_deleteextattr; static vop_getextattr_t ffs_getextattr; static vop_listextattr_t ffs_listextattr; static vop_openextattr_t ffs_openextattr; static vop_setextattr_t ffs_setextattr; static vop_vptofh_t ffs_vptofh; static vop_vput_pair_t ffs_vput_pair; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops1 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_getpages = ffs_getpages, .vop_getpages_async = ffs_getpages_async, .vop_lock1 = ffs_lock, #ifdef INVARIANTS .vop_unlock = ffs_unlock_debug, #endif .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_vptofh = ffs_vptofh, .vop_vput_pair = ffs_vput_pair, }; VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1); struct vop_vector ffs_fifoops1 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_lock1 = ffs_lock, #ifdef INVARIANTS .vop_unlock = ffs_unlock_debug, #endif .vop_vptofh = ffs_vptofh, }; VFS_VOP_VECTOR_REGISTER(ffs_fifoops1); /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops2 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_getpages = ffs_getpages, .vop_getpages_async = ffs_getpages_async, .vop_lock1 = ffs_lock, #ifdef INVARIANTS .vop_unlock = ffs_unlock_debug, #endif .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, .vop_vput_pair = ffs_vput_pair, }; VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2); struct vop_vector ffs_fifoops2 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_lock1 = ffs_lock, #ifdef INVARIANTS .vop_unlock = ffs_unlock_debug, #endif .vop_reallocblks = ffs_reallocblks, .vop_strategy = ffsext_strategy, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, }; VFS_VOP_VECTOR_REGISTER(ffs_fifoops2); /* * Synch an open file. */ /* ARGSUSED */ static int ffs_fsync(struct vop_fsync_args *ap) { struct vnode *vp; struct bufobj *bo; int error; vp = ap->a_vp; bo = &vp->v_bufobj; retry: error = ffs_syncvnode(vp, ap->a_waitfor, 0); if (error) return (error); if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) { error = softdep_fsync(vp); if (error) return (error); /* * The softdep_fsync() function may drop vp lock, * allowing for dirty buffers to reappear on the * bo_dirty list. Recheck and resync as needed. */ BO_LOCK(bo); if ((vp->v_type == VREG || vp->v_type == VDIR) && (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { BO_UNLOCK(bo); goto retry; } BO_UNLOCK(bo); } if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), 0)) return (ENXIO); return (0); } int ffs_syncvnode(struct vnode *vp, int waitfor, int flags) { struct inode *ip; struct bufobj *bo; struct ufsmount *ump; struct buf *bp, *nbp; ufs_lbn_t lbn; int error, passes; bool still_dirty, unlocked, wait; ip = VTOI(vp); bo = &vp->v_bufobj; ump = VFSTOUFS(vp->v_mount); /* * When doing MNT_WAIT we must first flush all dependencies * on the inode. */ if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT && (error = softdep_sync_metadata(vp)) != 0) { if (ffs_fsfail_cleanup(ump, error)) error = 0; return (error); } /* * Flush all dirty buffers associated with a vnode. */ error = 0; passes = 0; wait = false; /* Always do an async pass first. */ unlocked = false; lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); BO_LOCK(bo); loop: TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { /* * Reasons to skip this buffer: it has already been considered * on this pass, the buffer has dependencies that will cause * it to be redirtied and it has not already been deferred, * or it is already being written. */ if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; /* * Flush indirects in order, if requested. * * Note that if only datasync is requested, we can * skip indirect blocks when softupdates are not * active. Otherwise we must flush them with data, * since dependencies prevent data block writes. */ if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR && (lbn_level(bp->b_lblkno) >= passes || ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp)))) continue; if (bp->b_lblkno > lbn) panic("ffs_syncvnode: syncing truncated data."); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) { BO_UNLOCK(bo); } else if (wait) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) != 0) { BO_LOCK(bo); bp->b_vflags &= ~BV_SCANNED; goto next_locked; } } else continue; if ((bp->b_flags & B_DELWRI) == 0) panic("ffs_fsync: not dirty"); /* * Check for dependencies and potentially complete them. */ if (!LIST_EMPTY(&bp->b_dep) && (error = softdep_sync_buf(vp, bp, wait ? MNT_WAIT : MNT_NOWAIT)) != 0) { /* * Lock order conflict, buffer was already unlocked, * and vnode possibly unlocked. */ if (error == ERELOOKUP) { if (vp->v_data == NULL) return (EBADF); unlocked = true; if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT && (error = softdep_sync_metadata(vp)) != 0) { if (ffs_fsfail_cleanup(ump, error)) error = 0; return (unlocked && error == 0 ? ERELOOKUP : error); } /* Re-evaluate inode size */ lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); goto next; } /* I/O error. */ if (error != EBUSY) { BUF_UNLOCK(bp); return (error); } /* If we deferred once, don't defer again. */ if ((bp->b_flags & B_DEFERRED) == 0) { bp->b_flags |= B_DEFERRED; BUF_UNLOCK(bp); goto next; } } if (wait) { bremfree(bp); error = bwrite(bp); if (ffs_fsfail_cleanup(ump, error)) error = 0; if (error != 0) return (error); } else if ((bp->b_flags & B_CLUSTEROK)) { (void) vfs_bio_awrite(bp); } else { bremfree(bp); (void) bawrite(bp); } next: /* * Since we may have slept during the I/O, we need * to start from a known point. */ BO_LOCK(bo); next_locked: nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd); } if (waitfor != MNT_WAIT) { BO_UNLOCK(bo); if ((flags & NO_INO_UPDT) != 0) return (unlocked ? ERELOOKUP : 0); error = ffs_update(vp, 0); if (error == 0 && unlocked) error = ERELOOKUP; return (error); } /* Drain IO to see if we're done. */ bufobj_wwait(bo, 0, 0); /* * Block devices associated with filesystems may have new I/O * requests posted for them even if the vnode is locked, so no * amount of trying will get them clean. We make several passes * as a best effort. * * Regular files may need multiple passes to flush all dependency * work as it is possible that we must write once per indirect * level, once for the leaf, and once for the inode and each of * these will be done with one sync and one async pass. */ if (bo->bo_dirty.bv_cnt > 0) { if ((flags & DATA_ONLY) == 0) { still_dirty = true; } else { /* * For data-only sync, dirty indirect buffers * are ignored. */ still_dirty = false; TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { if (bp->b_lblkno > -UFS_NDADDR) { still_dirty = true; break; } } } if (still_dirty) { /* Write the inode after sync passes to flush deps. */ if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) { BO_UNLOCK(bo); ffs_update(vp, 1); BO_LOCK(bo); } /* switch between sync/async. */ wait = !wait; if (wait || ++passes < UFS_NIADDR + 2) goto loop; } } BO_UNLOCK(bo); error = 0; if ((flags & DATA_ONLY) == 0) { if ((flags & NO_INO_UPDT) == 0) error = ffs_update(vp, 1); if (DOINGSUJ(vp)) softdep_journal_fsync(VTOI(vp)); } else if ((ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)) != 0) { error = ffs_update(vp, 1); } if (error == 0 && unlocked) error = ERELOOKUP; if (error == 0) ip->i_flag &= ~IN_NEEDSYNC; return (error); } static int ffs_fdatasync(struct vop_fdatasync_args *ap) { return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY)); } static int ffs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { #if !defined(NO_FFS_SNAPSHOT) || defined(DIAGNOSTIC) struct vnode *vp = ap->a_vp; #endif /* !NO_FFS_SNAPSHOT || DIAGNOSTIC */ #ifdef DIAGNOSTIC struct inode *ip; #endif /* DIAGNOSTIC */ int result; #ifndef NO_FFS_SNAPSHOT int flags; struct lock *lkp; /* * Adaptive spinning mixed with SU leads to trouble. use a giant hammer * and only use it when LK_NODDLKTREAT is set. Currently this means it * is only used during path lookup. */ if ((ap->a_flags & LK_NODDLKTREAT) != 0) ap->a_flags |= LK_ADAPTIVE; switch (ap->a_flags & LK_TYPE_MASK) { case LK_SHARED: case LK_UPGRADE: case LK_EXCLUSIVE: flags = ap->a_flags; for (;;) { #ifdef DEBUG_VFS_LOCKS VNPASS(vp->v_holdcnt != 0, vp); #endif /* DEBUG_VFS_LOCKS */ lkp = vp->v_vnlock; result = lockmgr_lock_flags(lkp, flags, &VI_MTX(vp)->lock_object, ap->a_file, ap->a_line); if (lkp == vp->v_vnlock || result != 0) break; /* * Apparent success, except that the vnode * mutated between snapshot file vnode and * regular file vnode while this process * slept. The lock currently held is not the * right lock. Release it, and try to get the * new lock. */ lockmgr_unlock(lkp); if ((flags & (LK_INTERLOCK | LK_NOWAIT)) == (LK_INTERLOCK | LK_NOWAIT)) return (EBUSY); if ((flags & LK_TYPE_MASK) == LK_UPGRADE) flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; flags &= ~LK_INTERLOCK; } #ifdef DIAGNOSTIC switch (ap->a_flags & LK_TYPE_MASK) { case LK_UPGRADE: case LK_EXCLUSIVE: if (result == 0 && vp->v_vnlock->lk_recurse == 0) { ip = VTOI(vp); if (ip != NULL) ip->i_lock_gen++; } } #endif /* DIAGNOSTIC */ break; default: #ifdef DIAGNOSTIC if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) { ip = VTOI(vp); if (ip != NULL) ufs_unlock_tracker(ip); } #endif /* DIAGNOSTIC */ result = VOP_LOCK1_APV(&ufs_vnodeops, ap); break; } #else /* NO_FFS_SNAPSHOT */ /* * See above for an explanation. */ if ((ap->a_flags & LK_NODDLKTREAT) != 0) ap->a_flags |= LK_ADAPTIVE; #ifdef DIAGNOSTIC if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) { ip = VTOI(vp); if (ip != NULL) ufs_unlock_tracker(ip); } #endif /* DIAGNOSTIC */ result = VOP_LOCK1_APV(&ufs_vnodeops, ap); #endif /* NO_FFS_SNAPSHOT */ #ifdef DIAGNOSTIC switch (ap->a_flags & LK_TYPE_MASK) { case LK_UPGRADE: case LK_EXCLUSIVE: if (result == 0 && vp->v_vnlock->lk_recurse == 0) { ip = VTOI(vp); if (ip != NULL) ip->i_lock_gen++; } } #endif /* DIAGNOSTIC */ return (result); } #ifdef INVARIANTS static int ffs_unlock_debug(struct vop_unlock_args *ap) { struct vnode *vp; struct inode *ip; vp = ap->a_vp; ip = VTOI(vp); if (ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) { if ((vp->v_mflag & VMP_LAZYLIST) == 0) { VI_LOCK(vp); VNASSERT((vp->v_mflag & VMP_LAZYLIST), vp, ("%s: modified vnode (%x) not on lazy list", __func__, ip->i_flag)); VI_UNLOCK(vp); } } KASSERT(vp->v_type != VDIR || vp->v_vnlock->lk_recurse != 0 || (ip->i_flag & IN_ENDOFF) == 0, ("ufs dir vp %p ip %p flags %#x", vp, ip, ip->i_flag)); #ifdef DIAGNOSTIC if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && ip != NULL && vp->v_vnlock->lk_recurse == 0) ufs_unlock_tracker(ip); #endif return (VOP_UNLOCK_APV(&ufs_vnodeops, ap)); } #endif static int ffs_read_hole(struct uio *uio, long xfersize, long *size) { ssize_t saved_resid, tlen; int error; while (xfersize > 0) { tlen = min(xfersize, ZERO_REGION_SIZE); saved_resid = uio->uio_resid; error = vn_io_fault_uiomove(__DECONST(void *, zero_region), tlen, uio); if (error != 0) return (error); tlen = saved_resid - uio->uio_resid; xfersize -= tlen; *size -= tlen; } return (0); } /* * Vnode op for reading. */ static int ffs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct inode *ip; struct uio *uio; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; ssize_t orig_resid; int bflag, error, ioflag, seqcount; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extread(vp, uio, ioflag)); #else panic("ffs_read+IO_EXT"); #endif #ifdef DIRECTIO if ((ioflag & IO_DIRECT) != 0) { int workdone; error = ffs_rawread(vp, uio, &workdone); if (error != 0 || workdone != 0) return error; } #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_READ) panic("ffs_read: mode"); if (vp->v_type == VLNK) { - if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) + if ((int)ip->i_size < VFSTOUFS(vp->v_mount)->um_maxsymlinklen) panic("ffs_read: short symlink"); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("ffs_read: type %d", vp->v_type); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); fs = ITOFS(ip); if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->fs_maxfilesize) return (EOVERFLOW); bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = blksize(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) */ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= ip->i_size) { /* * Don't do readahead if this is the end of the file. */ error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { /* * Otherwise if we are allowed to cluster, * grab as much as we can. * * XXX This may not be a win if we are not * doing sequential access. */ error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, blkoffset + uio->uio_resid, seqcount, bflag, &bp); } else if (seqcount > 1) { /* * If we are NOT allowed to cluster, then * if we appear to be acting sequentially, * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ u_int nextsize = blksize(fs, ip, nextlbn); error = breadn_flags(vp, lbn, lbn, size, &nextlbn, &nextsize, 1, NOCRED, bflag, NULL, &bp); } else { /* * Failing all of the above, just read what the * user asked for. Interestingly, the same as * the first option above. */ error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp); } if (error == EJUSTRETURN) { error = ffs_read_hole(uio, xfersize, &size); if (error == 0) continue; } if (error != 0) { brelse(bp); bp = NULL; break; } /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } if (buf_mapped(bp)) { error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); } else { error = vn_io_fault_pgmove(bp->b_pages, blkoffset, (int)xfersize, uio); } if (error) break; vfs_bio_brelse(bp, ioflag); } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) vfs_bio_brelse(bp, ioflag); if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS); return (error); } /* * Vnode op for writing. */ static int ffs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct uio *uio; struct inode *ip; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; off_t osize; ssize_t resid; int seqcount; int blkoffset, error, flags, ioflag, size, xfersize; vp = ap->a_vp; if (DOINGSUJ(vp)) softdep_prealloc(vp, MNT_WAIT); if (vp->v_data == NULL) return (EBADF); uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); #else panic("ffs_write+IO_EXT"); #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE) panic("ffs_write: mode"); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: panic("ffs_write: dir write"); break; default: panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type, (int)uio->uio_offset, (int)uio->uio_resid ); } KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); fs = ITOFS(ip); if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); resid = uio->uio_resid; osize = ip->i_size; if (seqcount > BA_SEQMAX) flags = BA_SEQMAX << BA_SEQSHIFT; else flags = seqcount << BA_SEQSHIFT; if (ioflag & IO_SYNC) flags |= IO_SYNC; flags |= BA_UNMAPPED; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; /* XXX is uio->uio_offset the right thing here? */ error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ap->a_cred, flags, &bp); if (error != 0) { vnode_pager_setsize(vp, ip->i_size); break; } if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) bp->b_flags |= B_NOCACHE; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; DIP_SET(ip, i_size, ip->i_size); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); } size = blksize(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; if (buf_mapped(bp)) { error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); } else { error = vn_io_fault_pgmove(bp->b_pages, blkoffset, (int)xfersize, uio); } /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland mmap. * * Note that we need only clear buffers with a transfer size * equal to the block size because buffers with a shorter * transfer size were cleared above by the call to UFS_BALLOC() * with the BA_CLRBUF flag set. * * If the source region for uiomove identically mmaps the * buffer, uiomove() performed the NOP copy, and the buffer * content remains valid because the page fault handler * validated the pages. */ if (error != 0 && (bp->b_flags & B_CACHE) == 0 && fs->fs_bsize == xfersize) vfs_bio_clrbuf(bp); vfs_bio_set_flags(bp, ioflag); /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else if (xfersize + blkoffset == fs->fs_bsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; cluster_write(vp, bp, ip->i_size, seqcount, GB_UNMAPPED); } else { bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } if (error || xfersize == 0) break; UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ap->a_cred) { if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) { vn_seqc_write_begin(vp); UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID)); DIP_SET(ip, i_mode, ip->i_mode); vn_seqc_write_end(vp); } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) { if (!(ioflag & IO_DATASYNC) || (ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA))) error = ffs_update(vp, 1); if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), error)) error = ENXIO; } return (error); } /* * Extended attribute area reading. */ static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; ssize_t orig_resid; int error; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; #ifdef INVARIANTS if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extread: mode"); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = sblksize(fs, dp->di_extsize, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) */ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= dp->di_extsize) { /* * Don't do readahead if this is the end of the info. */ error = bread(vp, -1 - lbn, size, NOCRED, &bp); } else { /* * If we have a second block, then * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn); nextlbn = -1 - nextlbn; error = breadn(vp, -1 - lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; vfs_bio_brelse(bp, ioflag); } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) vfs_bio_brelse(bp, ioflag); return (error); } /* * Extended attribute area writing. */ static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; off_t osize; ssize_t resid; int blkoffset, error, flags, size, xfersize; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extwrite: mode"); #endif if (ioflag & IO_APPEND) uio->uio_offset = dp->di_extsize; KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0")); KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0")); if ((uoff_t)uio->uio_offset + uio->uio_resid > UFS_NXADDR * fs->fs_bsize) return (EFBIG); resid = uio->uio_resid; osize = dp->di_extsize; flags = IO_EXT; if (ioflag & IO_SYNC) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ucred, flags, &bp); if (error != 0) break; /* * If the buffer is not valid we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland * mmap(). XXX deal with uiomove() errors a better way. */ if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) vfs_bio_clrbuf(bp); if (uio->uio_offset + xfersize > dp->di_extsize) { dp->di_extsize = uio->uio_offset + xfersize; UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); } size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); vfs_bio_set_flags(bp, ioflag); /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || xfersize + blkoffset == fs->fs_bsize || (ioflag & (IO_ASYNC | IO_DIRECT))) bawrite(bp); else bdwrite(bp); if (error || xfersize == 0) break; UFS_INODE_SET_FLAG(ip, IN_CHANGE); } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) { if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) { vn_seqc_write_begin(vp); UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID)); dp->di_mode = ip->i_mode; vn_seqc_write_end(vp); } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_EXT | (ioflag&IO_SYNC), ucred); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); return (error); } /* * Vnode operating to retrieve a named extended attribute. * * Locate a particular EA (nspace:name) in the area (ptr:length), and return * the length of the EA, and possibly the pointer to the entry and to the data. */ static int ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, struct extattr **eapp, u_char **eac) { struct extattr *eap, *eaend; size_t nlen; nlen = strlen(name); KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned")); eap = (struct extattr *)ptr; eaend = (struct extattr *)(ptr + length); for (; eap < eaend; eap = EXTATTR_NEXT(eap)) { KASSERT(EXTATTR_NEXT(eap) <= eaend, ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend)); if (eap->ea_namespace != nspace || eap->ea_namelength != nlen || memcmp(eap->ea_name, name, nlen) != 0) continue; if (eapp != NULL) *eapp = eap; if (eac != NULL) *eac = EXTATTR_CONTENT(eap); return (EXTATTR_CONTENT_SIZE(eap)); } return (-1); } static int ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td) { const struct extattr *eap, *eaend, *eapnext; struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct uio luio; struct iovec liovec; u_int easize; int error; u_char *eae; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; easize = dp->di_extsize; if ((uoff_t)easize > UFS_NXADDR * fs->fs_bsize) return (EFBIG); eae = malloc(easize, M_TEMP, M_WAITOK); liovec.iov_base = eae; liovec.iov_len = easize; luio.uio_iov = &liovec; luio.uio_iovcnt = 1; luio.uio_offset = 0; luio.uio_resid = easize; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_READ; luio.uio_td = td; error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); if (error) { free(eae, M_TEMP); return (error); } /* Validate disk xattrfile contents. */ for (eap = (void *)eae, eaend = (void *)(eae + easize); eap < eaend; eap = eapnext) { /* Detect zeroed out tail */ if (eap->ea_length < sizeof(*eap) || eap->ea_length == 0) { easize = (const u_char *)eap - eae; break; } eapnext = EXTATTR_NEXT(eap); /* Bogusly long entry. */ if (eapnext > eaend) { free(eae, M_TEMP); return (EINTEGRITY); } } ip->i_ea_len = easize; *p = eae; return (0); } static void ffs_lock_ea(struct vnode *vp) { struct inode *ip; ip = VTOI(vp); VI_LOCK(vp); while (ip->i_flag & IN_EA_LOCKED) { UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT); msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea", 0); } UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED); VI_UNLOCK(vp); } static void ffs_unlock_ea(struct vnode *vp) { struct inode *ip; ip = VTOI(vp); VI_LOCK(vp); if (ip->i_flag & IN_EA_LOCKWAIT) wakeup(&ip->i_ea_refs); ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT); VI_UNLOCK(vp); } static int ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) { struct inode *ip; struct ufs2_dinode *dp; int error; ip = VTOI(vp); ffs_lock_ea(vp); if (ip->i_ea_area != NULL) { ip->i_ea_refs++; ffs_unlock_ea(vp); return (0); } dp = ip->i_din2; error = ffs_rdextattr(&ip->i_ea_area, vp, td); if (error) { ffs_unlock_ea(vp); return (error); } ip->i_ea_error = 0; ip->i_ea_refs++; ffs_unlock_ea(vp); return (0); } /* * Vnode extattr transaction commit/abort */ static int ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) { struct inode *ip; struct uio luio; struct iovec *liovec; struct ufs2_dinode *dp; size_t ea_len, tlen; int error, i, lcnt; bool truncate; ip = VTOI(vp); ffs_lock_ea(vp); if (ip->i_ea_area == NULL) { ffs_unlock_ea(vp); return (EINVAL); } dp = ip->i_din2; error = ip->i_ea_error; truncate = false; if (commit && error == 0) { ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit"); if (cred == NOCRED) cred = vp->v_mount->mnt_cred; ea_len = MAX(ip->i_ea_len, dp->di_extsize); for (lcnt = 1, tlen = ea_len - ip->i_ea_len; tlen > 0;) { tlen -= MIN(ZERO_REGION_SIZE, tlen); lcnt++; } liovec = __builtin_alloca(lcnt * sizeof(struct iovec)); luio.uio_iovcnt = lcnt; liovec[0].iov_base = ip->i_ea_area; liovec[0].iov_len = ip->i_ea_len; for (i = 1, tlen = ea_len - ip->i_ea_len; i < lcnt; i++) { liovec[i].iov_base = __DECONST(void *, zero_region); liovec[i].iov_len = MIN(ZERO_REGION_SIZE, tlen); tlen -= liovec[i].iov_len; } MPASS(tlen == 0); luio.uio_iov = liovec; luio.uio_offset = 0; luio.uio_resid = ea_len; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_WRITE; luio.uio_td = td; error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); if (error == 0 && ip->i_ea_len == 0) truncate = true; } if (--ip->i_ea_refs == 0) { free(ip->i_ea_area, M_TEMP); ip->i_ea_area = NULL; ip->i_ea_len = 0; ip->i_ea_error = 0; } ffs_unlock_ea(vp); if (truncate) ffs_truncate(vp, 0, IO_EXT, cred); return (error); } /* * Vnode extattr strategy routine for fifos. * * We need to check for a read or write of the external attributes. * Otherwise we just fall through and do the usual thing. */ static int ffsext_strategy(struct vop_strategy_args *ap) /* struct vop_strategy_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct buf *a_bp; }; */ { struct vnode *vp; daddr_t lbn; vp = ap->a_vp; lbn = ap->a_bp->b_lblkno; if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR) return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); if (vp->v_type == VFIFO) return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); panic("spec nodes went here"); } /* * Vnode extattr transaction commit/abort */ static int ffs_openextattr(struct vop_openextattr_args *ap) /* struct vop_openextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td)); } /* * Vnode extattr transaction commit/abort */ static int ffs_closeextattr(struct vop_closeextattr_args *ap) /* struct vop_closeextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_commit; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct vnode *vp; vp = ap->a_vp; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); if (ap->a_commit && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0) return (EROFS); if (ap->a_commit && DOINGSUJ(vp)) { ASSERT_VOP_ELOCKED(vp, "ffs_closeextattr commit"); softdep_prealloc(vp, MNT_WAIT); if (vp->v_data == NULL) return (EBADF); } return (ffs_close_ea(vp, ap->a_commit, ap->a_cred, ap->a_td)); } /* * Vnode operation to remove a named attribute. */ static int ffs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct vnode *vp; struct inode *ip; struct extattr *eap; uint32_t ul; int olen, error, i, easize; u_char *eae; void *tmp; vp = ap->a_vp; ip = VTOI(vp); if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = extattr_check_cred(vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) { /* * ffs_lock_ea is not needed there, because the vnode * must be exclusively locked. */ if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } if (DOINGSUJ(vp)) { ASSERT_VOP_ELOCKED(vp, "ffs_deleteextattr"); softdep_prealloc(vp, MNT_WAIT); if (vp->v_data == NULL) return (EBADF); } error = ffs_open_ea(vp, ap->a_cred, ap->a_td); if (error) return (error); /* CEM: delete could be done in-place instead */ eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &eap, NULL); if (olen == -1) { /* delete but nonexistent */ free(eae, M_TEMP); ffs_close_ea(vp, 0, ap->a_cred, ap->a_td); return (ENOATTR); } ul = eap->ea_length; i = (u_char *)EXTATTR_NEXT(eap) - eae; bcopy(EXTATTR_NEXT(eap), eap, easize - i); easize -= ul; tmp = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(tmp, M_TEMP); error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td); return (error); } /* * Vnode operation to retrieve a named extended attribute. */ static int ffs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; u_char *eae, *p; unsigned easize; int error, ealen; ip = VTOI(ap->a_vp); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); eae = ip->i_ea_area; easize = ip->i_ea_len; ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, NULL, &p); if (ealen >= 0) { error = 0; if (ap->a_size != NULL) *ap->a_size = ealen; else if (ap->a_uio != NULL) error = uiomove(p, ealen, ap->a_uio); } else error = ENOATTR; ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int ffs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct extattr *eap, *eaend; int error, ealen; ip = VTOI(ap->a_vp); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); error = 0; if (ap->a_size != NULL) *ap->a_size = 0; KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned")); eap = (struct extattr *)ip->i_ea_area; eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len); for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) { KASSERT(EXTATTR_NEXT(eap) <= eaend, ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend)); if (eap->ea_namespace != ap->a_attrnamespace) continue; ealen = eap->ea_namelength; if (ap->a_size != NULL) *ap->a_size += ealen + 1; else if (ap->a_uio != NULL) error = uiomove(&eap->ea_namelength, ealen + 1, ap->a_uio); } ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return (error); } /* * Vnode operation to set a named attribute. */ static int ffs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct vnode *vp; struct inode *ip; struct fs *fs; struct extattr *eap; uint32_t ealength, ul; ssize_t ealen; int olen, eapad1, eapad2, error, i, easize; u_char *eae; void *tmp; vp = ap->a_vp; ip = VTOI(vp); fs = ITOFS(ip); if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); /* XXX Now unsupported API to delete EAs using NULL uio. */ if (ap->a_uio == NULL) return (EOPNOTSUPP); if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); ealen = ap->a_uio->uio_resid; if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR)) return (EINVAL); error = extattr_check_cred(vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) { /* * ffs_lock_ea is not needed there, because the vnode * must be exclusively locked. */ if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } if (DOINGSUJ(vp)) { ASSERT_VOP_ELOCKED(vp, "ffs_deleteextattr"); softdep_prealloc(vp, MNT_WAIT); if (vp->v_data == NULL) return (EBADF); } error = ffs_open_ea(vp, ap->a_cred, ap->a_td); if (error) return (error); ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); eapad1 = roundup2(ealength, 8) - ealength; eapad2 = roundup2(ealen, 8) - ealen; ealength += eapad1 + ealen + eapad2; /* * CEM: rewrites of the same size or smaller could be done in-place * instead. (We don't acquire any fine-grained locks in here either, * so we could also do bigger writes in-place.) */ eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &eap, NULL); if (olen == -1) { /* new, append at end */ KASSERT(ALIGNED_TO(eae + easize, struct extattr), ("unaligned")); eap = (struct extattr *)(eae + easize); easize += ealength; } else { ul = eap->ea_length; i = (u_char *)EXTATTR_NEXT(eap) - eae; if (ul != ealength) { bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength, easize - i); easize += (ealength - ul); } } if (easize > lblktosize(fs, UFS_NXADDR)) { free(eae, M_TEMP); ffs_close_ea(vp, 0, ap->a_cred, ap->a_td); if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = ENOSPC; return (ENOSPC); } eap->ea_length = ealength; eap->ea_namespace = ap->a_attrnamespace; eap->ea_contentpadlen = eapad2; eap->ea_namelength = strlen(ap->a_name); memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name)); bzero(&eap->ea_name[strlen(ap->a_name)], eapad1); error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio); if (error) { free(eae, M_TEMP); ffs_close_ea(vp, 0, ap->a_cred, ap->a_td); if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2); tmp = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(tmp, M_TEMP); error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td); return (error); } /* * Vnode pointer to File handle */ static int ffs_vptofh(struct vop_vptofh_args *ap) /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ { struct inode *ip; struct ufid *ufhp; ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } SYSCTL_DECL(_vfs_ffs); static int use_buf_pager = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, "Always use buffer pager instead of bmap"); static daddr_t ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off)); } static int ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn) { return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn)); } static int ffs_getpages(struct vop_getpages_args *ap) { struct vnode *vp; struct ufsmount *um; vp = ap->a_vp; um = VFSTOUFS(vp->v_mount); if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz)); } static int ffs_getpages_async(struct vop_getpages_async_args *ap) { struct vnode *vp; struct ufsmount *um; bool do_iodone; int error; vp = ap->a_vp; um = VFSTOUFS(vp->v_mount); do_iodone = true; if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) { error = vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg); if (error == 0) do_iodone = false; } else { error = vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz); } if (do_iodone && ap->a_iodone != NULL) ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); return (error); } static int ffs_vput_pair(struct vop_vput_pair_args *ap) { struct mount *mp; struct vnode *dvp, *vp, *vp1, **vpp; struct inode *dp, *ip; ino_t ip_ino; u_int64_t ip_gen; off_t old_size; int error, vp_locked; dvp = ap->a_dvp; dp = VTOI(dvp); vpp = ap->a_vpp; vp = vpp != NULL ? *vpp : NULL; if ((dp->i_flag & (IN_NEEDSYNC | IN_ENDOFF)) == 0) { vput(dvp); if (vp != NULL && ap->a_unlock_vp) vput(vp); return (0); } mp = dvp->v_mount; if (vp != NULL) { if (ap->a_unlock_vp) { vput(vp); } else { MPASS(vp->v_type != VNON); vp_locked = VOP_ISLOCKED(vp); ip = VTOI(vp); ip_ino = ip->i_number; ip_gen = ip->i_gen; VOP_UNLOCK(vp); } } /* * If compaction or fsync was requested do it in ffs_vput_pair() * now that other locks are no longer held. */ if ((dp->i_flag & IN_ENDOFF) != 0) { VNASSERT(I_ENDOFF(dp) != 0 && I_ENDOFF(dp) < dp->i_size, dvp, ("IN_ENDOFF set but I_ENDOFF() is not")); dp->i_flag &= ~IN_ENDOFF; old_size = dp->i_size; error = UFS_TRUNCATE(dvp, (off_t)I_ENDOFF(dp), IO_NORMAL | (DOINGASYNC(dvp) ? 0 : IO_SYNC), curthread->td_ucred); if (error != 0 && error != ERELOOKUP) { if (!ffs_fsfail_cleanup(VFSTOUFS(mp), error)) { vn_printf(dvp, "IN_ENDOFF: failed to truncate, " "error %d\n", error); } #ifdef UFS_DIRHASH ufsdirhash_free(dp); #endif } SET_I_ENDOFF(dp, 0); } if ((dp->i_flag & IN_NEEDSYNC) != 0) { do { error = ffs_syncvnode(dvp, MNT_WAIT, 0); } while (error == ERELOOKUP); } vput(dvp); if (vp == NULL || ap->a_unlock_vp) return (0); MPASS(mp != NULL); /* * It is possible that vp is reclaimed at this point. Only * routines that call us with a_unlock_vp == false can find * that their vp has been reclaimed. There are three areas * that are affected: * 1) vn_open_cred() - later VOPs could fail, but * dead_open() returns 0 to simulate successful open. * 2) ffs_snapshot() - creation of snapshot fails with EBADF. * 3) NFS server (several places) - code is prepared to detect * and respond to dead vnodes by returning ESTALE. */ VOP_LOCK(vp, vp_locked | LK_RETRY); if (!VN_IS_DOOMED(vp)) return (0); /* * Try harder to recover from reclaimed vp if reclaim was not * because underlying inode was cleared. We saved inode * number and inode generation, so we can try to reinstantiate * exactly same version of inode. If this fails, return * original doomed vnode and let caller to handle * consequences. * * Note that callers must keep write started around * VOP_VPUT_PAIR() calls, so it is safe to use mp without * busying it. */ VOP_UNLOCK(vp); error = ffs_inotovp(mp, ip_ino, ip_gen, LK_EXCLUSIVE, &vp1, FFSV_REPLACE_DOOMED); if (error != 0) { VOP_LOCK(vp, vp_locked | LK_RETRY); } else { vrele(vp); *vpp = vp1; } return (error); } diff --git a/sys/ufs/ufs/ufs_dirhash.c b/sys/ufs/ufs/ufs_dirhash.c index d1e1bed0bde4..8981ffdfc269 100644 --- a/sys/ufs/ufs/ufs_dirhash.c +++ b/sys/ufs/ufs/ufs_dirhash.c @@ -1,1327 +1,1326 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001, 2002 Ian Dowse. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This implements a hash-based lookup scheme for UFS directories. */ #include __FBSDID("$FreeBSD$"); #include "opt_ufs.h" #ifdef UFS_DIRHASH #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1)) #define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1)) -#define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) #define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n)) static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables"); static int ufs_mindirhashsize = DIRBLKSIZ * 5; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW, &ufs_mindirhashsize, 0, "minimum directory size in bytes for which to use hashed lookup"); static int ufs_dirhashmaxmem = 2 * 1024 * 1024; /* NOTE: initial value. It is tuned in ufsdirhash_init() */ SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem, 0, "maximum allowed dirhash memory usage"); static int ufs_dirhashmem; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem, 0, "current dirhash memory usage"); static int ufs_dirhashcheck = 0; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck, 0, "enable extra sanity tests"); static int ufs_dirhashlowmemcount = 0; SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_lowmemcount, CTLFLAG_RD, &ufs_dirhashlowmemcount, 0, "number of times low memory hook called"); static int ufs_dirhashreclaimpercent = 10; static int ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_ufs, OID_AUTO, dirhash_reclaimpercent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, ufsdirhash_set_reclaimpercent, "I", "set percentage of dirhash cache to be removed in low VM events"); static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen); static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff); static void ufsdirhash_delslot(struct dirhash *dh, int slot); static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset); static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset); static int ufsdirhash_recycle(int wanted); static void ufsdirhash_lowmem(void); static void ufsdirhash_free_locked(struct inode *ip); static uma_zone_t ufsdirhash_zone; #define DIRHASHLIST_LOCK() mtx_lock(&ufsdirhash_mtx) #define DIRHASHLIST_UNLOCK() mtx_unlock(&ufsdirhash_mtx) #define DIRHASH_BLKALLOC() uma_zalloc(ufsdirhash_zone, M_NOWAIT) #define DIRHASH_BLKFREE(ptr) uma_zfree(ufsdirhash_zone, (ptr)) #define DIRHASH_ASSERT_LOCKED(dh) \ sx_assert(&(dh)->dh_lock, SA_LOCKED) /* Dirhash list; recently-used entries are near the tail. */ static TAILQ_HEAD(, dirhash) ufsdirhash_list; /* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */ static struct mtx ufsdirhash_mtx; /* * Locking: * * The relationship between inode and dirhash is protected either by an * exclusive vnode lock or the vnode interlock where a shared vnode lock * may be used. The dirhash_mtx is acquired after the dirhash lock. To * handle teardown races, code wishing to lock the dirhash for an inode * when using a shared vnode lock must obtain a private reference on the * dirhash while holding the vnode interlock. They can drop it once they * have obtained the dirhash lock and verified that the dirhash wasn't * recycled while they waited for the dirhash lock. * * ufsdirhash_build() acquires a shared lock on the dirhash when it is * successful. This lock is released after a call to ufsdirhash_lookup(). * * Functions requiring exclusive access use ufsdirhash_acquire() which may * free a dirhash structure that was recycled by ufsdirhash_recycle(). * * The dirhash lock may be held across io operations. * * WITNESS reports a lock order reversal between the "bufwait" lock * and the "dirhash" lock. However, this specific reversal will not * cause a deadlock. To get a deadlock, one would have to lock a * buffer followed by the dirhash while a second thread locked a * buffer while holding the dirhash lock. The second order can happen * under a shared or exclusive vnode lock for the associated directory * in lookup(). The first order, however, can only happen under an * exclusive vnode lock (e.g. unlink(), rename(), etc.). Thus, for * a thread to be doing a "bufwait" -> "dirhash" order, it has to hold * an exclusive vnode lock. That exclusive vnode lock will prevent * any other threads from doing a "dirhash" -> "bufwait" order. */ static void ufsdirhash_hold(struct dirhash *dh) { refcount_acquire(&dh->dh_refcount); } static void ufsdirhash_drop(struct dirhash *dh) { if (refcount_release(&dh->dh_refcount)) { sx_destroy(&dh->dh_lock); free(dh, M_DIRHASH); } } /* * Release the lock on a dirhash. */ static void ufsdirhash_release(struct dirhash *dh) { sx_unlock(&dh->dh_lock); } /* * Either acquire an existing hash locked shared or create a new hash and * return it exclusively locked. May return NULL if the allocation fails. * * The vnode interlock is used to protect the i_dirhash pointer from * simultaneous access while only a shared vnode lock is held. */ static struct dirhash * ufsdirhash_create(struct inode *ip) { struct dirhash *ndh; struct dirhash *dh; struct vnode *vp; bool excl; ndh = dh = NULL; vp = ip->i_vnode; excl = false; for (;;) { /* Racy check for i_dirhash to prefetch a dirhash structure. */ if (ip->i_dirhash == NULL && ndh == NULL) { ndh = malloc(sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO); if (ndh == NULL) return (NULL); refcount_init(&ndh->dh_refcount, 1); /* * The DUPOK is to prevent warnings from the * sx_slock() a few lines down which is safe * since the duplicate lock in that case is * the one for this dirhash we are creating * now which has no external references until * after this function returns. */ sx_init_flags(&ndh->dh_lock, "dirhash", SX_DUPOK); sx_xlock(&ndh->dh_lock); } /* * Check i_dirhash. If it's NULL just try to use a * preallocated structure. If none exists loop and try again. */ VI_LOCK(vp); dh = ip->i_dirhash; if (dh == NULL) { ip->i_dirhash = ndh; VI_UNLOCK(vp); if (ndh == NULL) continue; return (ndh); } ufsdirhash_hold(dh); VI_UNLOCK(vp); /* Acquire a lock on existing hashes. */ if (excl) sx_xlock(&dh->dh_lock); else sx_slock(&dh->dh_lock); /* The hash could've been recycled while we were waiting. */ VI_LOCK(vp); if (ip->i_dirhash != dh) { VI_UNLOCK(vp); ufsdirhash_release(dh); ufsdirhash_drop(dh); continue; } VI_UNLOCK(vp); ufsdirhash_drop(dh); /* If the hash is still valid we've succeeded. */ if (dh->dh_hash != NULL) break; /* * If the hash is NULL it has been recycled. Try to upgrade * so we can recreate it. If we fail the upgrade, drop our * lock and try again. */ if (excl || sx_try_upgrade(&dh->dh_lock)) break; sx_sunlock(&dh->dh_lock); excl = true; } /* Free the preallocated structure if it was not necessary. */ if (ndh) { ufsdirhash_release(ndh); ufsdirhash_drop(ndh); } return (dh); } /* * Acquire an exclusive lock on an existing hash. Requires an exclusive * vnode lock to protect the i_dirhash pointer. hashes that have been * recycled are reclaimed here and NULL is returned. */ static struct dirhash * ufsdirhash_acquire(struct inode *ip) { struct dirhash *dh; ASSERT_VOP_ELOCKED(ip->i_vnode, __FUNCTION__); dh = ip->i_dirhash; if (dh == NULL) return (NULL); sx_xlock(&dh->dh_lock); if (dh->dh_hash != NULL) return (dh); ufsdirhash_free_locked(ip); return (NULL); } /* * Acquire exclusively and free the hash pointed to by ip. Works with a * shared or exclusive vnode lock. */ void ufsdirhash_free(struct inode *ip) { struct dirhash *dh; struct vnode *vp; vp = ip->i_vnode; for (;;) { /* Grab a reference on this inode's dirhash if it has one. */ VI_LOCK(vp); dh = ip->i_dirhash; if (dh == NULL) { VI_UNLOCK(vp); return; } ufsdirhash_hold(dh); VI_UNLOCK(vp); /* Exclusively lock the dirhash. */ sx_xlock(&dh->dh_lock); /* If this dirhash still belongs to this inode, then free it. */ VI_LOCK(vp); if (ip->i_dirhash == dh) { VI_UNLOCK(vp); ufsdirhash_drop(dh); break; } VI_UNLOCK(vp); /* * This inode's dirhash has changed while we were * waiting for the dirhash lock, so try again. */ ufsdirhash_release(dh); ufsdirhash_drop(dh); } ufsdirhash_free_locked(ip); } /* * Attempt to build up a hash table for the directory contents in * inode 'ip'. Returns 0 on success, or -1 of the operation failed. */ int ufsdirhash_build(struct inode *ip) { struct dirhash *dh; struct buf *bp = NULL; struct direct *ep; struct vnode *vp; doff_t bmask, pos; u_int dirblocks, i, narrays, nblocks, nslots; int j, memreqd, slot; /* Take care of a decreased sysctl value. */ while (ufs_dirhashmem > ufs_dirhashmaxmem) { if (ufsdirhash_recycle(0) != 0) return (-1); /* Recycled enough memory, so unlock the list. */ DIRHASHLIST_UNLOCK(); } /* Check if we can/should use dirhash. */ if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode) || ip->i_effnlink == 0) { if (ip->i_dirhash) ufsdirhash_free(ip); return (-1); } dh = ufsdirhash_create(ip); if (dh == NULL) return (-1); if (dh->dh_hash != NULL) return (0); vp = ip->i_vnode; /* Allocate 50% more entries than this dir size could ever need. */ KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size")); nslots = ip->i_size / DIRECTSIZ(1); nslots = (nslots * 3 + 1) / 2; narrays = howmany(nslots, DH_NBLKOFF); nslots = narrays * DH_NBLKOFF; dirblocks = howmany(ip->i_size, DIRBLKSIZ); nblocks = (dirblocks * 3 + 1) / 2; memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) + narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + nblocks * sizeof(*dh->dh_blkfree); DIRHASHLIST_LOCK(); if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) { DIRHASHLIST_UNLOCK(); if (memreqd > ufs_dirhashmaxmem / 2) goto fail; /* Try to free some space. */ if (ufsdirhash_recycle(memreqd) != 0) goto fail; /* Enough was freed, and list has been locked. */ } ufs_dirhashmem += memreqd; DIRHASHLIST_UNLOCK(); /* Initialise the hash table and block statistics. */ dh->dh_memreq = memreqd; dh->dh_narrays = narrays; dh->dh_hlen = nslots; dh->dh_nblk = nblocks; dh->dh_dirblks = dirblocks; for (i = 0; i < DH_NFSTATS; i++) dh->dh_firstfree[i] = -1; dh->dh_firstfree[DH_NFSTATS] = 0; dh->dh_hused = 0; dh->dh_seqoff = -1; dh->dh_score = DH_SCOREINIT; dh->dh_lastused = time_second; /* * Use non-blocking mallocs so that we will revert to a linear * lookup on failure rather than potentially blocking forever. */ dh->dh_hash = malloc(narrays * sizeof(dh->dh_hash[0]), M_DIRHASH, M_NOWAIT | M_ZERO); if (dh->dh_hash == NULL) goto fail; dh->dh_blkfree = malloc(nblocks * sizeof(dh->dh_blkfree[0]), M_DIRHASH, M_NOWAIT); if (dh->dh_blkfree == NULL) goto fail; for (i = 0; i < narrays; i++) { if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL) goto fail; for (j = 0; j < DH_NBLKOFF; j++) dh->dh_hash[i][j] = DIRHASH_EMPTY; } for (i = 0; i < dirblocks; i++) dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN; bmask = vp->v_mount->mnt_stat.f_iosize - 1; pos = 0; while (pos < ip->i_size) { /* If necessary, get the next directory block. */ if ((pos & bmask) == 0) { if (bp != NULL) brelse(bp); if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0) goto fail; } /* Add this entry to the hash. */ ep = (struct direct *)((char *)bp->b_data + (pos & bmask)); if (ep->d_reclen == 0 || ep->d_reclen > DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) { /* Corrupted directory. */ brelse(bp); goto fail; } if (ep->d_ino != 0) { /* Add the entry (simplified ufsdirhash_add). */ slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen); while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY) slot = WRAPINCR(slot, dh->dh_hlen); dh->dh_hused++; DH_ENTRY(dh, slot) = pos; ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep)); } pos += ep->d_reclen; } if (bp != NULL) brelse(bp); DIRHASHLIST_LOCK(); TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list); dh->dh_onlist = 1; DIRHASHLIST_UNLOCK(); sx_downgrade(&dh->dh_lock); return (0); fail: ufsdirhash_free_locked(ip); return (-1); } /* * Free any hash table associated with inode 'ip'. */ static void ufsdirhash_free_locked(struct inode *ip) { struct dirhash *dh; struct vnode *vp; int i; DIRHASH_ASSERT_LOCKED(ip->i_dirhash); /* * Clear the pointer in the inode to prevent new threads from * finding the dead structure. */ vp = ip->i_vnode; VI_LOCK(vp); dh = ip->i_dirhash; ip->i_dirhash = NULL; VI_UNLOCK(vp); /* * Remove the hash from the list since we are going to free its * memory. */ DIRHASHLIST_LOCK(); if (dh->dh_onlist) TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); ufs_dirhashmem -= dh->dh_memreq; DIRHASHLIST_UNLOCK(); /* * At this point, any waiters for the lock should hold their * own reference on the dirhash structure. They will drop * that reference once they grab the vnode interlock and see * that ip->i_dirhash is NULL. */ sx_xunlock(&dh->dh_lock); /* * Handle partially recycled as well as fully constructed hashes. */ if (dh->dh_hash != NULL) { for (i = 0; i < dh->dh_narrays; i++) if (dh->dh_hash[i] != NULL) DIRHASH_BLKFREE(dh->dh_hash[i]); free(dh->dh_hash, M_DIRHASH); if (dh->dh_blkfree != NULL) free(dh->dh_blkfree, M_DIRHASH); } /* * Drop the inode's reference to the data structure. */ ufsdirhash_drop(dh); } /* * Find the offset of the specified name within the given inode. * Returns 0 on success, ENOENT if the entry does not exist, or * EJUSTRETURN if the caller should revert to a linear search. * * If successful, the directory offset is stored in *offp, and a * pointer to a struct buf containing the entry is stored in *bpp. If * prevoffp is non-NULL, the offset of the previous entry within * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry * is the first in a block, the start of the block is used). * * Must be called with the hash locked. Returns with the hash unlocked. */ int ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp, struct buf **bpp, doff_t *prevoffp) { struct dirhash *dh, *dh_next; struct direct *dp; struct vnode *vp; struct buf *bp; doff_t blkoff, bmask, offset, prevoff, seqoff; int i, slot; int error; dh = ip->i_dirhash; KASSERT(dh != NULL && dh->dh_hash != NULL, ("ufsdirhash_lookup: Invalid dirhash %p\n", dh)); DIRHASH_ASSERT_LOCKED(dh); /* * Move this dirhash towards the end of the list if it has a * score higher than the next entry, and acquire the dh_lock. */ DIRHASHLIST_LOCK(); if (TAILQ_NEXT(dh, dh_list) != NULL) { /* * If the new score will be greater than that of the next * entry, then move this entry past it. With both mutexes * held, dh_next won't go away, but its dh_score could * change; that's not important since it is just a hint. */ if ((dh_next = TAILQ_NEXT(dh, dh_list)) != NULL && dh->dh_score >= dh_next->dh_score) { KASSERT(dh->dh_onlist, ("dirhash: not on list")); TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh, dh_list); } } /* Update the score. */ if (dh->dh_score < DH_SCOREMAX) dh->dh_score++; /* Update last used time. */ dh->dh_lastused = time_second; DIRHASHLIST_UNLOCK(); vp = ip->i_vnode; bmask = vp->v_mount->mnt_stat.f_iosize - 1; blkoff = -1; bp = NULL; seqoff = dh->dh_seqoff; restart: slot = ufsdirhash_hash(dh, name, namelen); if (seqoff != -1) { /* * Sequential access optimisation. seqoff contains the * offset of the directory entry immediately following * the last entry that was looked up. Check if this offset * appears in the hash chain for the name we are looking for. */ for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY; i = WRAPINCR(i, dh->dh_hlen)) if (offset == seqoff) break; if (offset == seqoff) { /* * We found an entry with the expected offset. This * is probably the entry we want, but if not, the * code below will retry. */ slot = i; } else seqoff = -1; } for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY; slot = WRAPINCR(slot, dh->dh_hlen)) { if (offset == DIRHASH_DEL) continue; if (offset < 0 || offset >= ip->i_size) panic("ufsdirhash_lookup: bad offset in hash array"); if ((offset & ~bmask) != blkoff) { if (bp != NULL) brelse(bp); blkoff = offset & ~bmask; if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0) { error = EJUSTRETURN; goto fail; } } KASSERT(bp != NULL, ("no buffer allocated")); dp = (struct direct *)(bp->b_data + (offset & bmask)); if (dp->d_reclen == 0 || dp->d_reclen > DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) { /* Corrupted directory. */ error = EJUSTRETURN; goto fail; } if (dp->d_namlen == namelen && bcmp(dp->d_name, name, namelen) == 0) { /* Found. Get the prev offset if needed. */ if (prevoffp != NULL) { if (offset & (DIRBLKSIZ - 1)) { prevoff = ufsdirhash_getprev(dp, offset); if (prevoff == -1) { error = EJUSTRETURN; goto fail; } } else prevoff = offset; *prevoffp = prevoff; } /* Update offset. */ dh->dh_seqoff = offset + DIRSIZ(0, dp); *bpp = bp; *offp = offset; ufsdirhash_release(dh); return (0); } /* * When the name doesn't match in the sequential * optimization case, go back and search normally. */ if (seqoff != -1) { seqoff = -1; goto restart; } } error = ENOENT; fail: ufsdirhash_release(dh); if (bp != NULL) brelse(bp); return (error); } /* * Find a directory block with room for 'slotneeded' bytes. Returns * the offset of the directory entry that begins the free space. * This will either be the offset of an existing entry that has free * space at the end, or the offset of an entry with d_ino == 0 at * the start of a DIRBLKSIZ block. * * To use the space, the caller may need to compact existing entries in * the directory. The total number of bytes in all of the entries involved * in the compaction is stored in *slotsize. In other words, all of * the entries that must be compacted are exactly contained in the * region beginning at the returned offset and spanning *slotsize bytes. * * Returns -1 if no space was found, indicating that the directory * must be extended. */ doff_t ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize) { struct direct *dp; struct dirhash *dh; struct buf *bp; doff_t pos, slotstart; int dirblock, error, freebytes, i; dh = ip->i_dirhash; KASSERT(dh != NULL && dh->dh_hash != NULL, ("ufsdirhash_findfree: Invalid dirhash %p\n", dh)); DIRHASH_ASSERT_LOCKED(dh); /* Find a directory block with the desired free space. */ dirblock = -1; for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++) if ((dirblock = dh->dh_firstfree[i]) != -1) break; if (dirblock == -1) return (-1); KASSERT(dirblock < dh->dh_nblk && dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN), ("ufsdirhash_findfree: bad stats")); pos = dirblock * DIRBLKSIZ; error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp); if (error) return (-1); /* Find the first entry with free space. */ for (i = 0; i < DIRBLKSIZ; ) { if (dp->d_reclen == 0) { brelse(bp); return (-1); } if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp)) break; i += dp->d_reclen; dp = (struct direct *)((char *)dp + dp->d_reclen); } if (i > DIRBLKSIZ) { brelse(bp); return (-1); } slotstart = pos + i; /* Find the range of entries needed to get enough space */ freebytes = 0; while (i < DIRBLKSIZ && freebytes < slotneeded) { freebytes += dp->d_reclen; if (dp->d_ino != 0) freebytes -= DIRSIZ(0, dp); if (dp->d_reclen == 0) { brelse(bp); return (-1); } i += dp->d_reclen; dp = (struct direct *)((char *)dp + dp->d_reclen); } if (i > DIRBLKSIZ) { brelse(bp); return (-1); } if (freebytes < slotneeded) panic("ufsdirhash_findfree: free mismatch"); brelse(bp); *slotsize = pos + i - slotstart; return (slotstart); } /* * Return the start of the unused space at the end of a directory, or * -1 if there are no trailing unused blocks. */ doff_t ufsdirhash_enduseful(struct inode *ip) { struct dirhash *dh; int i; dh = ip->i_dirhash; DIRHASH_ASSERT_LOCKED(dh); KASSERT(dh != NULL && dh->dh_hash != NULL, ("ufsdirhash_enduseful: Invalid dirhash %p\n", dh)); if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) return (-1); for (i = dh->dh_dirblks - 1; i >= 0; i--) if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN) break; return ((doff_t)(i + 1) * DIRBLKSIZ); } /* * Insert information into the hash about a new directory entry. dirp * points to a struct direct containing the entry, and offset specifies * the offset of this entry. */ void ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset) { struct dirhash *dh; int slot; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_add: bad offset")); /* * Normal hash usage is < 66%. If the usage gets too high then * remove the hash entirely and let it be rebuilt later. */ if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) { ufsdirhash_free_locked(ip); return; } /* Find a free hash slot (empty or deleted), and add the entry. */ slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen); while (DH_ENTRY(dh, slot) >= 0) slot = WRAPINCR(slot, dh->dh_hlen); if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY) dh->dh_hused++; DH_ENTRY(dh, slot) = offset; /* Update last used time. */ dh->dh_lastused = time_second; /* Update the per-block summary info. */ ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp)); ufsdirhash_release(dh); } /* * Remove the specified directory entry from the hash. The entry to remove * is defined by the name in `dirp', which must exist at the specified * `offset' within the directory. */ void ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset) { struct dirhash *dh; int slot; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_remove: bad offset")); /* Find the entry */ slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset); /* Remove the hash entry. */ ufsdirhash_delslot(dh, slot); /* Update the per-block summary info. */ ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp)); ufsdirhash_release(dh); } /* * Change the offset associated with a directory entry in the hash. Used * when compacting directory blocks. */ void ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff, doff_t newoff) { struct dirhash *dh; int slot; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ && newoff < dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_move: bad offset")); /* Find the entry, and update the offset. */ slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff); DH_ENTRY(dh, slot) = newoff; ufsdirhash_release(dh); } /* * Inform dirhash that the directory has grown by one block that * begins at offset (i.e. the new length is offset + DIRBLKSIZ). */ void ufsdirhash_newblk(struct inode *ip, doff_t offset) { struct dirhash *dh; int block; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_newblk: bad offset")); block = offset / DIRBLKSIZ; if (block >= dh->dh_nblk) { /* Out of space; must rebuild. */ ufsdirhash_free_locked(ip); return; } dh->dh_dirblks = block + 1; /* Account for the new free block. */ dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN; if (dh->dh_firstfree[DH_NFSTATS] == -1) dh->dh_firstfree[DH_NFSTATS] = block; ufsdirhash_release(dh); } /* * Inform dirhash that the directory is being truncated. */ void ufsdirhash_dirtrunc(struct inode *ip, doff_t offset) { struct dirhash *dh; int block, i; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ, ("ufsdirhash_dirtrunc: bad offset")); block = howmany(offset, DIRBLKSIZ); /* * If the directory shrinks to less than 1/8 of dh_nblk blocks * (about 20% of its original size due to the 50% extra added in * ufsdirhash_build) then free it, and let the caller rebuild * if necessary. */ if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) { ufsdirhash_free_locked(ip); return; } /* * Remove any `first free' information pertaining to the * truncated blocks. All blocks we're removing should be * completely unused. */ if (dh->dh_firstfree[DH_NFSTATS] >= block) dh->dh_firstfree[DH_NFSTATS] = -1; for (i = block; i < dh->dh_dirblks; i++) if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN) panic("ufsdirhash_dirtrunc: blocks in use"); for (i = 0; i < DH_NFSTATS; i++) if (dh->dh_firstfree[i] >= block) panic("ufsdirhash_dirtrunc: first free corrupt"); dh->dh_dirblks = block; ufsdirhash_release(dh); } /* * Debugging function to check that the dirhash information about * a directory block matches its actual contents. Panics if a mismatch * is detected. * * On entry, `buf' should point to the start of an in-core * DIRBLKSIZ-sized directory block, and `offset' should contain the * offset from the start of the directory of that block. */ void ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset) { struct dirhash *dh; struct direct *dp; int block, ffslot, i, nfree; if (!ufs_dirhashcheck) return; if ((dh = ufsdirhash_acquire(ip)) == NULL) return; block = offset / DIRBLKSIZ; if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks) panic("ufsdirhash_checkblock: bad offset"); nfree = 0; for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) { dp = (struct direct *)(buf + i); if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ) panic("ufsdirhash_checkblock: bad dir"); if (dp->d_ino == 0) { #if 0 /* * XXX entries with d_ino == 0 should only occur * at the start of a DIRBLKSIZ block. However the * ufs code is tolerant of such entries at other * offsets, and fsck does not fix them. */ if (i != 0) panic("ufsdirhash_checkblock: bad dir inode"); #endif nfree += dp->d_reclen; continue; } /* Check that the entry exists (will panic if it doesn't). */ ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i); nfree += dp->d_reclen - DIRSIZ(0, dp); } if (i != DIRBLKSIZ) panic("ufsdirhash_checkblock: bad dir end"); if (dh->dh_blkfree[block] * DIRALIGN != nfree) panic("ufsdirhash_checkblock: bad free count"); ffslot = BLKFREE2IDX(nfree / DIRALIGN); for (i = 0; i <= DH_NFSTATS; i++) if (dh->dh_firstfree[i] == block && i != ffslot) panic("ufsdirhash_checkblock: bad first-free"); if (dh->dh_firstfree[ffslot] == -1) panic("ufsdirhash_checkblock: missing first-free entry"); ufsdirhash_release(dh); } /* * Hash the specified filename into a dirhash slot. */ static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen) { u_int32_t hash; /* * We hash the name and then some other bit of data that is * invariant over the dirhash's lifetime. Otherwise names * differing only in the last byte are placed close to one * another in the table, which is bad for linear probing. */ hash = fnv_32_buf(name, namelen, FNV1_32_INIT); hash = fnv_32_buf(&dh, sizeof(dh), hash); return (hash % dh->dh_hlen); } /* * Adjust the number of free bytes in the block containing `offset' * by the value specified by `diff'. * * The caller must ensure we have exclusive access to `dh'; normally * that means that dh_lock should be held, but this is also called * from ufsdirhash_build() where exclusive access can be assumed. */ static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff) { int block, i, nfidx, ofidx; /* Update the per-block summary info. */ block = offset / DIRBLKSIZ; KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks, ("dirhash bad offset")); ofidx = BLKFREE2IDX(dh->dh_blkfree[block]); dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN); nfidx = BLKFREE2IDX(dh->dh_blkfree[block]); /* Update the `first free' list if necessary. */ if (ofidx != nfidx) { /* If removing, scan forward for the next block. */ if (dh->dh_firstfree[ofidx] == block) { for (i = block + 1; i < dh->dh_dirblks; i++) if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx) break; dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1; } /* Make this the new `first free' if necessary */ if (dh->dh_firstfree[nfidx] > block || dh->dh_firstfree[nfidx] == -1) dh->dh_firstfree[nfidx] = block; } } /* * Find the specified name which should have the specified offset. * Returns a slot number, and panics on failure. * * `dh' must be locked on entry and remains so on return. */ static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset) { int slot; DIRHASH_ASSERT_LOCKED(dh); /* Find the entry. */ KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full")); slot = ufsdirhash_hash(dh, name, namelen); while (DH_ENTRY(dh, slot) != offset && DH_ENTRY(dh, slot) != DIRHASH_EMPTY) slot = WRAPINCR(slot, dh->dh_hlen); if (DH_ENTRY(dh, slot) != offset) panic("ufsdirhash_findslot: '%.*s' not found", namelen, name); return (slot); } /* * Remove the entry corresponding to the specified slot from the hash array. * * `dh' must be locked on entry and remains so on return. */ static void ufsdirhash_delslot(struct dirhash *dh, int slot) { int i; DIRHASH_ASSERT_LOCKED(dh); /* Mark the entry as deleted. */ DH_ENTRY(dh, slot) = DIRHASH_DEL; /* If this is the end of a chain of DIRHASH_DEL slots, remove them. */ for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) i = WRAPINCR(i, dh->dh_hlen); if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) { i = WRAPDECR(i, dh->dh_hlen); while (DH_ENTRY(dh, i) == DIRHASH_DEL) { DH_ENTRY(dh, i) = DIRHASH_EMPTY; dh->dh_hused--; i = WRAPDECR(i, dh->dh_hlen); } KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen")); } } /* * Given a directory entry and its offset, find the offset of the * previous entry in the same DIRBLKSIZ-sized block. Returns an * offset, or -1 if there is no previous entry in the block or some * other problem occurred. */ static doff_t ufsdirhash_getprev(struct direct *dirp, doff_t offset) { struct direct *dp; char *blkbuf; doff_t blkoff, prevoff; int entrypos, i; blkoff = rounddown2(offset, DIRBLKSIZ); /* offset of start of block */ entrypos = offset & (DIRBLKSIZ - 1); /* entry relative to block */ blkbuf = (char *)dirp - entrypos; prevoff = blkoff; /* If `offset' is the start of a block, there is no previous entry. */ if (entrypos == 0) return (-1); /* Scan from the start of the block until we get to the entry. */ for (i = 0; i < entrypos; i += dp->d_reclen) { dp = (struct direct *)(blkbuf + i); if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos) return (-1); /* Corrupted directory. */ prevoff = blkoff + i; } return (prevoff); } /* * Delete the given dirhash and reclaim its memory. Assumes that * ufsdirhash_list is locked, and leaves it locked. Also assumes * that dh is locked. Returns the amount of memory freed. */ static int ufsdirhash_destroy(struct dirhash *dh) { doff_t **hash; u_int8_t *blkfree; int i, mem, narrays; KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list")); /* Remove it from the list and detach its memory. */ TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); dh->dh_onlist = 0; hash = dh->dh_hash; dh->dh_hash = NULL; blkfree = dh->dh_blkfree; dh->dh_blkfree = NULL; narrays = dh->dh_narrays; mem = dh->dh_memreq; dh->dh_memreq = 0; /* Unlock dirhash and free the detached memory. */ ufsdirhash_release(dh); for (i = 0; i < narrays; i++) DIRHASH_BLKFREE(hash[i]); free(hash, M_DIRHASH); free(blkfree, M_DIRHASH); /* Account for the returned memory. */ ufs_dirhashmem -= mem; return (mem); } /* * Try to free up `wanted' bytes by stealing memory from existing * dirhashes. Returns zero with list locked if successful. */ static int ufsdirhash_recycle(int wanted) { struct dirhash *dh; DIRHASHLIST_LOCK(); dh = TAILQ_FIRST(&ufsdirhash_list); while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) { /* Decrement the score; only recycle if it becomes zero. */ if (dh == NULL || --dh->dh_score > 0) { DIRHASHLIST_UNLOCK(); return (-1); } /* * If we can't lock it it's in use and we don't want to * recycle it anyway. */ if (!sx_try_xlock(&dh->dh_lock)) { dh = TAILQ_NEXT(dh, dh_list); continue; } ufsdirhash_destroy(dh); /* Repeat if necessary. */ dh = TAILQ_FIRST(&ufsdirhash_list); } /* Success; return with list locked. */ return (0); } /* * Callback that frees some dirhashes when the system is low on virtual memory. */ static void ufsdirhash_lowmem() { struct dirhash *dh, *dh_temp; int memfreed, memwanted; ufs_dirhashlowmemcount++; memfreed = 0; memwanted = ufs_dirhashmem * ufs_dirhashreclaimpercent / 100; DIRHASHLIST_LOCK(); /* * Reclaim up to memwanted from the oldest dirhashes. This will allow * us to make some progress when the system is running out of memory * without compromising the dinamicity of maximum age. If the situation * does not improve lowmem will be eventually retriggered and free some * other entry in the cache. The entries on the head of the list should * be the oldest. If during list traversal we can't get a lock on the * dirhash, it will be skipped. */ TAILQ_FOREACH_SAFE(dh, &ufsdirhash_list, dh_list, dh_temp) { if (sx_try_xlock(&dh->dh_lock)) memfreed += ufsdirhash_destroy(dh); if (memfreed >= memwanted) break; } DIRHASHLIST_UNLOCK(); } static int ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS) { int error, v; v = ufs_dirhashreclaimpercent; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == ufs_dirhashreclaimpercent) return (0); /* Refuse invalid percentages */ if (v < 0 || v > 100) return (EINVAL); ufs_dirhashreclaimpercent = v; return (0); } void ufsdirhash_init() { ufs_dirhashmaxmem = lmax(roundup(hibufspace / 64, PAGE_SIZE), 2 * 1024 * 1024); ufsdirhash_zone = uma_zcreate("DIRHASH", DH_NBLKOFF * sizeof(doff_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&ufsdirhash_mtx, "dirhash list", NULL, MTX_DEF); TAILQ_INIT(&ufsdirhash_list); /* Register a callback function to handle low memory signals */ EVENTHANDLER_REGISTER(vm_lowmem, ufsdirhash_lowmem, NULL, EVENTHANDLER_PRI_FIRST); } void ufsdirhash_uninit() { KASSERT(TAILQ_EMPTY(&ufsdirhash_list), ("ufsdirhash_uninit")); uma_zdestroy(ufsdirhash_zone); mtx_destroy(&ufsdirhash_mtx); } #endif /* UFS_DIRHASH */ diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c index 0509185c4663..b7bf4eb6c86c 100644 --- a/sys/ufs/ufs/ufs_lookup.c +++ b/sys/ufs/ufs/ufs_lookup.c @@ -1,1609 +1,1605 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_lookup.c 8.15 (Berkeley) 6/16/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ufs.h" #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #endif #include #include #include #ifdef DIAGNOSTIC static int dirchk = 1; #else static int dirchk = 0; #endif SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, ""); -/* true if old FS format...*/ -#define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) - static int ufs_delete_denied(struct vnode *vdp, struct vnode *tdp, struct ucred *cred, struct thread *td) { int error; #ifdef UFS_ACL /* * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt * * 3.16.2.1. ACE4_DELETE vs. ACE4_DELETE_CHILD */ /* * XXX: Is this check required? */ error = VOP_ACCESS(vdp, VEXEC, cred, td); if (error) return (error); error = VOP_ACCESSX(tdp, VDELETE, cred, td); if (error == 0) return (0); error = VOP_ACCESSX(vdp, VDELETE_CHILD, cred, td); if (error == 0) return (0); error = VOP_ACCESSX(vdp, VEXPLICIT_DENY | VDELETE_CHILD, cred, td); if (error) return (error); #endif /* !UFS_ACL */ /* * Standard Unix access control - delete access requires VWRITE. */ error = VOP_ACCESS(vdp, VWRITE, cred, td); if (error) return (error); /* * If directory is "sticky", then user must own * the directory, or the file in it, else she * may not delete it (unless she's root). This * implements append-only directories. */ if ((VTOI(vdp)->i_mode & ISVTX) && VOP_ACCESS(vdp, VADMIN, cred, td) && VOP_ACCESS(tdp, VADMIN, cred, td)) return (EPERM); return (0); } /* * Convert a component of a pathname into a pointer to a locked inode. * This is a very central and rather complicated routine. * If the filesystem is not maintained in a strict tree hierarchy, * this can result in a deadlock situation (see comments in code below). * * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending * on whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it and the target of the pathname * exists, lookup returns both the target and its parent directory locked. * When creating or renaming and LOCKPARENT is specified, the target may * not be ".". When deleting and LOCKPARENT is specified, the target may * be "."., but the caller must check to ensure it does an vrele and vput * instead of two vputs. * * This routine is actually used as VOP_CACHEDLOOKUP method, and the * filesystem employs the generic vfs_cache_lookup() as VOP_LOOKUP * method. * * vfs_cache_lookup() performs the following for us: * check that it is a directory * check accessibility of directory * check for modification attempts on read-only mounts * if name found in cache * if at end of path and deleting or creating * drop it * else * return name. * return VOP_CACHEDLOOKUP() * * Overall outline of ufs_lookup: * * search for name in directory, to found or notfound * notfound: * if creating, return locked directory, leaving info on available slots * else return error * found: * if at end of path and deleting, return information to allow delete * if at end of path and rewriting (RENAME and LOCKPARENT), lock target * inode and return info to allow rewrite * if not at end, add name to cache; if at end and neither creating * nor deleting, add name to cache */ int ufs_lookup(ap) struct vop_cachedlookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); } int ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, ino_t *dd_ino) { struct inode *dp; /* inode for directory being searched */ struct buf *bp; /* a buffer of directory entries */ struct direct *ep; /* the current directory entry */ int entryoffsetinblock; /* offset of ep in bp's buffer */ enum {NONE, COMPACT, FOUND} slotstatus; doff_t slotoffset; /* offset of area with free space */ doff_t i_diroff; /* cached i_diroff value. */ doff_t i_offset; /* cached i_offset value. */ int slotsize; /* size of area at slotoffset */ int slotfreespace; /* amount of space free in slot */ int slotneeded; /* size of the entry we're seeking */ int numdirpasses; /* strategy for directory search */ doff_t endsearch; /* offset to end directory search */ doff_t prevoff; /* prev entry dp->i_offset */ struct vnode *pdp; /* saved dp during symlink work */ struct vnode *tdp; /* returned by VFS_VGET */ doff_t enduseful; /* pointer past last used dir slot */ u_long bmask; /* block offset mask */ int namlen, error; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; ino_t ino, ino1; int ltype; if (vpp != NULL) *vpp = NULL; dp = VTOI(vdp); if (dp->i_effnlink == 0) return (ENOENT); /* * Create a vm object if vmiodirenable is enabled. * Alternatively we could call vnode_create_vobject * in VFS_VGET but we could end up creating objects * that are never used. */ vnode_create_vobject(vdp, DIP(dp, i_size), cnp->cn_thread); bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; #ifdef DEBUG_VFS_LOCKS /* * Assert that the directory vnode is locked, and locked * exclusively for the last component lookup for modifying * operations. * * The directory-modifying operations need to save * intermediate state in the inode between namei() call and * actual directory manipulations. See fields in the struct * inode marked as 'used during directory lookup'. We must * ensure that upgrade in namei() does not happen, since * upgrade might need to unlock vdp. If quotas are enabled, * getinoquota() also requires exclusive lock to modify inode. */ ASSERT_VOP_LOCKED(vdp, "ufs_lookup1"); if ((nameiop == CREATE || nameiop == DELETE || nameiop == RENAME) && (flags & (LOCKPARENT | ISLASTCN)) == (LOCKPARENT | ISLASTCN)) ASSERT_VOP_ELOCKED(vdp, "ufs_lookup2"); #endif restart: bp = NULL; slotoffset = -1; /* * We now have a segment name to search for, and a directory to search. * * Suppress search for slots unless creating * file and at end of pathname, in which case * we watch for a place to put the new file in * case it doesn't already exist. */ ino = 0; i_diroff = dp->i_diroff; slotstatus = FOUND; slotfreespace = slotsize = slotneeded = 0; if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) { slotstatus = NONE; slotneeded = DIRECTSIZ(cnp->cn_namelen); } #ifdef UFS_DIRHASH /* * Use dirhash for fast operations on large directories. The logic * to determine whether to hash the directory is contained within * ufsdirhash_build(); a zero return means that it decided to hash * this directory and it successfully built up the hash table. */ if (ufsdirhash_build(dp) == 0) { /* Look for a free slot if needed. */ enduseful = dp->i_size; if (slotstatus != FOUND) { slotoffset = ufsdirhash_findfree(dp, slotneeded, &slotsize); if (slotoffset >= 0) { slotstatus = COMPACT; enduseful = ufsdirhash_enduseful(dp); if (enduseful < 0) enduseful = dp->i_size; } } /* Look up the component. */ numdirpasses = 1; entryoffsetinblock = 0; /* silence compiler warning */ switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen, &i_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) { case 0: ep = (struct direct *)((char *)bp->b_data + (i_offset & bmask)); goto foundentry; case ENOENT: i_offset = roundup2(dp->i_size, DIRBLKSIZ); goto notfound; default: /* Something failed; just do a linear search. */ break; } } #endif /* UFS_DIRHASH */ /* * If there is cached information on a previous search of * this directory, pick up where we last left off. * We cache only lookups as these are the most common * and have the greatest payoff. Caching CREATE has little * benefit as it usually must search the entire directory * to determine that the entry does not exist. Caching the * location of the last DELETE or RENAME has not reduced * profiling time and hence has been removed in the interest * of simplicity. */ if (nameiop != LOOKUP || i_diroff == 0 || i_diroff >= dp->i_size) { entryoffsetinblock = 0; i_offset = 0; numdirpasses = 1; } else { i_offset = i_diroff; if ((entryoffsetinblock = i_offset & bmask) && (error = UFS_BLKATOFF(vdp, (off_t)i_offset, NULL, &bp))) return (error); numdirpasses = 2; nchstats.ncs_2passes++; } prevoff = i_offset; endsearch = roundup2(dp->i_size, DIRBLKSIZ); enduseful = 0; searchloop: while (i_offset < endsearch) { /* * If necessary, get the next directory block. */ if ((i_offset & bmask) == 0) { if (bp != NULL) brelse(bp); error = UFS_BLKATOFF(vdp, (off_t)i_offset, NULL, &bp); if (error) return (error); entryoffsetinblock = 0; } /* * If still looking for a slot, and at a DIRBLKSIZE * boundary, have to start looking for free space again. */ if (slotstatus == NONE && (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) { slotoffset = -1; slotfreespace = 0; } /* * Get pointer to next entry. * Full validation checks are slow, so we only check * enough to insure forward progress through the * directory. Complete checks can be run by patching * "dirchk" to be true. */ ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock); if (ep->d_reclen == 0 || ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) { int i; ufs_dirbad(dp, i_offset, "mangled entry"); i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); i_offset += i; entryoffsetinblock += i; continue; } /* * If an appropriate sized slot has not yet been found, * check to see if one is available. Also accumulate space * in the current block so that we can determine if * compaction is viable. */ if (slotstatus != FOUND) { int size = ep->d_reclen; if (ep->d_ino != 0) size -= DIRSIZ(OFSFMT(vdp), ep); if (size > 0) { if (size >= slotneeded) { slotstatus = FOUND; slotoffset = i_offset; slotsize = ep->d_reclen; } else if (slotstatus == NONE) { slotfreespace += size; if (slotoffset == -1) slotoffset = i_offset; if (slotfreespace >= slotneeded) { slotstatus = COMPACT; slotsize = i_offset + ep->d_reclen - slotoffset; } } } } /* * Check for a name match. */ if (ep->d_ino) { # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(vdp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if (namlen == cnp->cn_namelen && (cnp->cn_nameptr[0] == ep->d_name[0]) && !bcmp(cnp->cn_nameptr, ep->d_name, (unsigned)namlen)) { #ifdef UFS_DIRHASH foundentry: #endif /* * Save directory entry's inode number and * reclen in ndp->ni_ufs area, and release * directory buffer. */ - if (vdp->v_mount->mnt_maxsymlinklen > 0 && - ep->d_type == DT_WHT) { + if (!OFSFMT(vdp) && ep->d_type == DT_WHT) { slotstatus = FOUND; slotoffset = i_offset; slotsize = ep->d_reclen; enduseful = dp->i_size; cnp->cn_flags |= ISWHITEOUT; numdirpasses--; goto notfound; } ino = ep->d_ino; goto found; } } prevoff = i_offset; i_offset += ep->d_reclen; entryoffsetinblock += ep->d_reclen; if (ep->d_ino) enduseful = i_offset; } notfound: /* * If we started in the middle of the directory and failed * to find our target, we must check the beginning as well. */ if (numdirpasses == 2) { numdirpasses--; i_offset = 0; endsearch = i_diroff; goto searchloop; } if (bp != NULL) brelse(bp); /* * If creating, and at end of pathname and current * directory has not been removed, then can consider * allowing file to be created. */ if ((nameiop == CREATE || nameiop == RENAME || (nameiop == DELETE && (cnp->cn_flags & DOWHITEOUT) && (cnp->cn_flags & ISWHITEOUT))) && (flags & ISLASTCN) && dp->i_effnlink != 0) { /* * Access for write is interpreted as allowing * creation of files in the directory. * * XXX: Fix the comment above. */ if (flags & WILLBEDIR) error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, cnp->cn_thread); else error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); if (error) return (error); /* * Return an indication of where the new directory * entry should be put. If we didn't find a slot, * then set dp->i_count to 0 indicating * that the new slot belongs at the end of the * directory. If we found a slot, then the new entry * can be put in the range from dp->i_offset to * dp->i_offset + dp->i_count. */ if (slotstatus == NONE) { SET_I_OFFSET(dp, roundup2(dp->i_size, DIRBLKSIZ)); SET_I_COUNT(dp, 0); enduseful = I_OFFSET(dp); } else if (nameiop == DELETE) { SET_I_OFFSET(dp, slotoffset); if ((I_OFFSET(dp) & (DIRBLKSIZ - 1)) == 0) SET_I_COUNT(dp, 0); else SET_I_COUNT(dp, I_OFFSET(dp) - prevoff); } else { SET_I_OFFSET(dp, slotoffset); SET_I_COUNT(dp, slotsize); if (enduseful < slotoffset + slotsize) enduseful = slotoffset + slotsize; } SET_I_ENDOFF(dp, roundup2(enduseful, DIRBLKSIZ)); /* * We return with the directory locked, so that * the parameters we set up above will still be * valid if we actually decide to do a direnter(). * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory inode in ndp->ni_dvp. * The pathname buffer is saved so that the name * can be obtained later. * * NB - if the directory is unlocked, then this * information cannot be used. */ cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } /* * Insert name into cache (as non-existent) if appropriate. */ if ((cnp->cn_flags & MAKEENTRY) != 0) cache_enter(vdp, NULL, cnp); return (ENOENT); found: if (dd_ino != NULL) *dd_ino = ino; if (numdirpasses == 2) nchstats.ncs_pass2++; /* * Check that directory length properly reflects presence * of this entry. */ if (i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) { ufs_dirbad(dp, i_offset, "i_size too small"); dp->i_size = i_offset + DIRSIZ(OFSFMT(vdp), ep); DIP_SET(dp, i_size, dp->i_size); UFS_INODE_SET_FLAG(dp, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); } brelse(bp); /* * Found component in pathname. * If the final component of path name, save information * in the cache as to where the entry was found. */ if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = rounddown2(i_offset, DIRBLKSIZ); /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. */ if (nameiop == DELETE && (flags & ISLASTCN)) { if (flags & LOCKPARENT) ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); if (VOP_ISLOCKED(vdp) == LK_EXCLUSIVE) { /* * Return pointer to current entry in * dp->i_offset, and distance past previous * entry (if there is a previous entry in this * block) in dp->i_count. * * We shouldn't be setting these in the * WANTPARENT case (first lookup in rename()), but any * lookups that will result in directory changes will * overwrite these. */ SET_I_OFFSET(dp, i_offset); if ((I_OFFSET(dp) & (DIRBLKSIZ - 1)) == 0) SET_I_COUNT(dp, 0); else SET_I_COUNT(dp, I_OFFSET(dp) - prevoff); } if (dd_ino != NULL) return (0); /* * Save directory inode pointer in ndp->ni_dvp for * dirremove(). */ if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); if (error) { vput(tdp); return (error); } if (dp->i_number == ino) { VREF(vdp); *vpp = vdp; vput(tdp); return (0); } *vpp = tdp; return (0); } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && (flags & ISLASTCN)) { if (flags & WILLBEDIR) error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, cnp->cn_thread); else error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); if (error) return (error); /* * Careful about locking second inode. * This can only occur if the target is ".". */ SET_I_OFFSET(dp, i_offset); if (dp->i_number == ino) return (EISDIR); if (dd_ino != NULL) return (0); if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); if (error) { vput(tdp); return (error); } #ifdef SunOS_doesnt_do_that /* * The only purpose of this check is to return the correct * error. Assume that we want to rename directory "a" * to a file "b", and that we have no ACL_WRITE_DATA on * a containing directory, but we _do_ have ACL_APPEND_DATA. * In that case, the VOP_ACCESS check above will return 0, * and the operation will fail with ENOTDIR instead * of EACCESS. */ if (tdp->v_type == VDIR) error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, cnp->cn_thread); else error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); if (error) { vput(tdp); return (error); } #endif *vpp = tdp; cnp->cn_flags |= SAVENAME; return (0); } if (dd_ino != NULL) return (0); /* * Step through the translation in the name. We do not `vput' the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "pdp". We must get the target inode before unlocking * the directory to insure that the inode will not be removed * before we get it. We prevent deadlock by always fetching * inodes from the root, moving down the directory tree. Thus * when following backward pointers ".." we must unlock the * parent directory before getting the requested directory. * There is a potential race condition here if both the current * and parent directories are removed before the VFS_VGET for the * inode associated with ".." returns. We hope that this occurs * infrequently since we cannot avoid this race condition without * implementing a sophisticated deadlock detection algorithm. * Note also that this simple deadlock detection scheme will not * work if the filesystem has any hard links other than ".." * that point backwards in the directory structure. */ pdp = vdp; if (flags & ISDOTDOT) { error = vn_vget_ino(pdp, ino, cnp->cn_lkflags, &tdp); if (error) return (error); /* * Recheck that ".." entry in the vdp directory points * to the inode we looked up before vdp lock was * dropped. */ error = ufs_lookup_ino(pdp, NULL, cnp, &ino1); if (error) { vput(tdp); return (error); } if (ino1 != ino) { vput(tdp); goto restart; } *vpp = tdp; } else if (dp->i_number == ino) { VREF(vdp); /* we want ourself, ie "." */ /* * When we lookup "." we still can be asked to lock it * differently. */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(vdp)) { if (ltype == LK_EXCLUSIVE) vn_lock(vdp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(vdp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case may left us with * reclaimed vnode. */ if (VN_IS_DOOMED(vdp)) { vrele(vdp); return (ENOENT); } } *vpp = vdp; } else { error = VFS_VGET(pdp->v_mount, ino, cnp->cn_lkflags, &tdp); if (error == 0 && VTOI(tdp)->i_mode == 0) { vgone(tdp); vput(tdp); error = ENOENT; } if (error) return (error); *vpp = tdp; } /* * Insert name into cache if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); return (0); } void ufs_dirbad(ip, offset, how) struct inode *ip; doff_t offset; char *how; { struct mount *mp; mp = ITOV(ip)->v_mount; if ((mp->mnt_flag & MNT_RDONLY) == 0) panic("ufs_dirbad: %s: bad dir ino %ju at offset %ld: %s", mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number, (long)offset, how); else (void)printf("%s: bad dir ino %ju at offset %ld: %s\n", mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number, (long)offset, how); } /* * Do consistency checking on a directory entry: * record length must be multiple of 4 * entry must fit in rest of its DIRBLKSIZ block * record must be large enough to contain entry * name is not longer than UFS_MAXNAMLEN * name must be as long as advertised, and null terminated */ int ufs_dirbadentry(dp, ep, entryoffsetinblock) struct vnode *dp; struct direct *ep; int entryoffsetinblock; { int i, namlen; # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(dp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if ((ep->d_reclen & 0x3) != 0 || ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || ep->d_reclen < DIRSIZ(OFSFMT(dp), ep) || namlen > UFS_MAXNAMLEN) { /*return (1); */ printf("First bad\n"); goto bad; } if (ep->d_ino == 0) return (0); for (i = 0; i < namlen; i++) if (ep->d_name[i] == '\0') { /*return (1); */ printf("Second bad\n"); goto bad; } if (ep->d_name[i]) goto bad; return (0); bad: return (1); } /* * Construct a new directory entry after a call to namei, using the * parameters that it left in the componentname argument cnp. The * argument ip is the inode to which the new directory entry will refer. */ void ufs_makedirentry(ip, cnp, newdirp) struct inode *ip; struct componentname *cnp; struct direct *newdirp; { u_int namelen; namelen = (unsigned)cnp->cn_namelen; KASSERT((cnp->cn_flags & SAVENAME) != 0, ("ufs_makedirentry: missing name")); KASSERT(namelen <= UFS_MAXNAMLEN, ("ufs_makedirentry: name too long")); newdirp->d_ino = ip->i_number; newdirp->d_namlen = namelen; /* Zero out after-name padding */ *(u_int32_t *)(&newdirp->d_name[namelen & ~(DIR_ROUNDUP - 1)]) = 0; bcopy(cnp->cn_nameptr, newdirp->d_name, namelen); - if (ITOV(ip)->v_mount->mnt_maxsymlinklen > 0) + if (!OFSFMT(ITOV(ip))) newdirp->d_type = IFTODT(ip->i_mode); else { newdirp->d_type = 0; # if (BYTE_ORDER == LITTLE_ENDIAN) { u_char tmp = newdirp->d_namlen; newdirp->d_namlen = newdirp->d_type; newdirp->d_type = tmp; } # endif } } /* * Write a directory entry after a call to namei, using the parameters * that it left in nameidata. The argument dirp is the new directory * entry contents. Dvp is a pointer to the directory to be written, * which was left locked by namei. Remaining parameters (dp->i_offset, * dp->i_count) indicate how the space for the new entry is to be obtained. * Non-null bp indicates that a directory is being created (for the * soft dependency code). */ int ufs_direnter(dvp, tvp, dirp, cnp, newdirbp) struct vnode *dvp; struct vnode *tvp; struct direct *dirp; struct componentname *cnp; struct buf *newdirbp; { struct ucred *cr; struct thread *td; int newentrysize; struct inode *dp; struct buf *bp; u_int dsize; struct direct *ep, *nep; u_int64_t old_isize; int error, ret, blkoff, loc, spacefree, flags, namlen; char *dirbuf; td = curthread; /* XXX */ cr = td->td_ucred; dp = VTOI(dvp); newentrysize = DIRSIZ(OFSFMT(dvp), dirp); if (I_COUNT(dp) == 0) { /* * If dp->i_count is 0, then namei could find no * space in the directory. Here, dp->i_offset will * be on a directory block boundary and we will write the * new entry into a fresh block. */ if (I_OFFSET(dp) & (DIRBLKSIZ - 1)) panic("ufs_direnter: newblk"); flags = BA_CLRBUF; if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)) flags |= IO_SYNC; #ifdef QUOTA if ((error = getinoquota(dp)) != 0) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); return (error); } #endif old_isize = dp->i_size; vnode_pager_setsize(dvp, (u_long)I_OFFSET(dp) + DIRBLKSIZ); if ((error = UFS_BALLOC(dvp, (off_t)I_OFFSET(dp), DIRBLKSIZ, cr, flags, &bp)) != 0) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); vnode_pager_setsize(dvp, (u_long)old_isize); return (error); } dp->i_size = I_OFFSET(dp) + DIRBLKSIZ; DIP_SET(dp, i_size, dp->i_size); SET_I_ENDOFF(dp, dp->i_size); UFS_INODE_SET_FLAG(dp, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); dirp->d_reclen = DIRBLKSIZ; blkoff = I_OFFSET(dp) & (VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1); bcopy((caddr_t)dirp, (caddr_t)bp->b_data + blkoff,newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) { ufsdirhash_newblk(dp, I_OFFSET(dp)); ufsdirhash_add(dp, dirp, I_OFFSET(dp)); ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff, I_OFFSET(dp)); } #endif if (DOINGSOFTDEP(dvp)) { /* * Ensure that the entire newly allocated block is a * valid directory so that future growth within the * block does not have to ensure that the block is * written before the inode. */ blkoff += DIRBLKSIZ; while (blkoff < bp->b_bcount) { ((struct direct *) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } if (softdep_setup_directory_add(bp, dp, I_OFFSET(dp), dirp->d_ino, newdirbp, 1)) UFS_INODE_SET_FLAG(dp, IN_NEEDSYNC); if (newdirbp) bdwrite(newdirbp); bdwrite(bp); return (UFS_UPDATE(dvp, 0)); } if (DOINGASYNC(dvp)) { bdwrite(bp); return (UFS_UPDATE(dvp, 0)); } error = bwrite(bp); ret = UFS_UPDATE(dvp, 1); if (error == 0) return (ret); return (error); } /* * If dp->i_count is non-zero, then namei found space for the new * entry in the range dp->i_offset to dp->i_offset + dp->i_count * in the directory. To use this space, we may have to compact * the entries located there, by copying them together towards the * beginning of the block, leaving the free space in one usable * chunk at the end. */ /* * Increase size of directory if entry eats into new space. * This should never push the size past a new multiple of * DIRBLKSIZE. * * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. */ if (I_OFFSET(dp) + I_COUNT(dp) > dp->i_size) { dp->i_size = I_OFFSET(dp) + I_COUNT(dp); DIP_SET(dp, i_size, dp->i_size); UFS_INODE_SET_FLAG(dp, IN_SIZEMOD | IN_MODIFIED); } /* * Get the block containing the space for the new directory entry. */ error = UFS_BLKATOFF(dvp, (off_t)I_OFFSET(dp), &dirbuf, &bp); if (error) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); return (error); } /* * Find space for the new entry. In the simple case, the entry at * offset base will have the space. If it does not, then namei * arranged that compacting the region dp->i_offset to * dp->i_offset + dp->i_count would yield the space. */ ep = (struct direct *)dirbuf; dsize = ep->d_ino ? DIRSIZ(OFSFMT(dvp), ep) : 0; spacefree = ep->d_reclen - dsize; for (loc = ep->d_reclen; loc < I_COUNT(dp); ) { nep = (struct direct *)(dirbuf + loc); /* Trim the existing slot (NB: dsize may be zero). */ ep->d_reclen = dsize; ep = (struct direct *)((char *)ep + dsize); /* Read nep->d_reclen now as the bcopy() may clobber it. */ loc += nep->d_reclen; if (nep->d_ino == 0) { /* * A mid-block unused entry. Such entries are * never created by the kernel, but fsck_ffs * can create them (and it doesn't fix them). * * Add up the free space, and initialise the * relocated entry since we don't bcopy it. */ spacefree += nep->d_reclen; ep->d_ino = 0; dsize = 0; continue; } dsize = DIRSIZ(OFSFMT(dvp), nep); spacefree += nep->d_reclen - dsize; #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_move(dp, nep, I_OFFSET(dp) + ((char *)nep - dirbuf), I_OFFSET(dp) + ((char *)ep - dirbuf)); #endif if (DOINGSOFTDEP(dvp)) softdep_change_directoryentry_offset(bp, dp, dirbuf, (caddr_t)nep, (caddr_t)ep, dsize); else bcopy((caddr_t)nep, (caddr_t)ep, dsize); } /* * Here, `ep' points to a directory entry containing `dsize' in-use * bytes followed by `spacefree' unused bytes. If ep->d_ino == 0, * then the entry is completely unused (dsize == 0). The value * of ep->d_reclen is always indeterminate. * * Update the pointer fields in the previous entry (if any), * copy in the new entry, and write out the block. */ # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(dvp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if (ep->d_ino == 0 || (ep->d_ino == UFS_WINO && namlen == dirp->d_namlen && bcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) { if (spacefree + dsize < newentrysize) panic("ufs_direnter: compact1"); dirp->d_reclen = spacefree + dsize; } else { if (spacefree < newentrysize) panic("ufs_direnter: compact2"); dirp->d_reclen = spacefree; ep->d_reclen = dsize; ep = (struct direct *)((char *)ep + dsize); } #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL && (ep->d_ino == 0 || dirp->d_reclen == spacefree)) ufsdirhash_add(dp, dirp, I_OFFSET(dp) + ((char *)ep - dirbuf)); #endif bcopy((caddr_t)dirp, (caddr_t)ep, (u_int)newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, dirbuf - (I_OFFSET(dp) & (DIRBLKSIZ - 1)), rounddown2(I_OFFSET(dp), DIRBLKSIZ)); #endif if (DOINGSOFTDEP(dvp)) { (void) softdep_setup_directory_add(bp, dp, I_OFFSET(dp) + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp, 0); if (newdirbp != NULL) bdwrite(newdirbp); bdwrite(bp); } else { if (DOINGASYNC(dvp)) { bdwrite(bp); error = 0; } else { error = bwrite(bp); } } /* * If all went well, and the directory can be shortened, * mark directory inode with the truncation request. */ UFS_INODE_SET_FLAG(dp, IN_CHANGE | IN_UPDATE | (error == 0 && I_ENDOFF(dp) != 0 && I_ENDOFF(dp) < dp->i_size ? IN_ENDOFF : 0)); return (error); } /* * Remove a directory entry after a call to namei, using * the parameters which it left in nameidata. The entry * dp->i_offset contains the offset into the directory of the * entry to be eliminated. The dp->i_count field contains the * size of the previous record in the directory. If this * is 0, the first entry is being deleted, so we need only * zero the inode number to mark the entry as free. If the * entry is not the first in the directory, we must reclaim * the space of the now empty record by adding the record size * to the size of the previous entry. */ int ufs_dirremove(dvp, ip, flags, isrmdir) struct vnode *dvp; struct inode *ip; int flags; int isrmdir; { struct inode *dp; struct direct *ep, *rep; struct buf *bp; off_t offset; int error; dp = VTOI(dvp); /* * Adjust the link count early so softdep can block if necessary. */ if (ip) { ip->i_effnlink--; UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(dvp)) { softdep_setup_unlink(dp, ip); } else { ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); } } if (flags & DOWHITEOUT) offset = I_OFFSET(dp); else offset = I_OFFSET(dp) - I_COUNT(dp); if ((error = UFS_BLKATOFF(dvp, offset, (char **)&ep, &bp)) != 0) { if (ip) { ip->i_effnlink++; UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(dvp)) { softdep_change_linkcnt(ip); } else { ip->i_nlink++; DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); } } return (error); } if (flags & DOWHITEOUT) { /* * Whiteout entry: set d_ino to UFS_WINO. */ ep->d_ino = UFS_WINO; ep->d_type = DT_WHT; goto out; } /* Set 'rep' to the entry being removed. */ if (I_COUNT(dp) == 0) rep = ep; else rep = (struct direct *)((char *)ep + ep->d_reclen); #ifdef UFS_DIRHASH /* * Remove the dirhash entry. This is complicated by the fact * that `ep' is the previous entry when dp->i_count != 0. */ if (dp->i_dirhash != NULL) ufsdirhash_remove(dp, rep, I_OFFSET(dp)); #endif if (ip && rep->d_ino != ip->i_number) panic("ufs_dirremove: ip %ju does not match dirent ino %ju\n", (uintmax_t)ip->i_number, (uintmax_t)rep->d_ino); /* * Zero out the file directory entry metadata to reduce disk * scavenging disclosure. */ bzero(&rep->d_name[0], rep->d_namlen); rep->d_namlen = 0; rep->d_type = 0; rep->d_ino = 0; if (I_COUNT(dp) != 0) { /* * Collapse new free space into previous entry. */ ep->d_reclen += rep->d_reclen; rep->d_reclen = 0; } #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, (char *)ep - ((I_OFFSET(dp) - I_COUNT(dp)) & (DIRBLKSIZ - 1)), rounddown2(I_OFFSET(dp), DIRBLKSIZ)); #endif out: error = 0; if (DOINGSOFTDEP(dvp)) { if (ip) softdep_setup_remove(bp, dp, ip, isrmdir); if (softdep_slowdown(dvp)) error = bwrite(bp); else bdwrite(bp); } else { if (flags & DOWHITEOUT) error = bwrite(bp); else if (DOINGASYNC(dvp)) bdwrite(bp); else error = bwrite(bp); } UFS_INODE_SET_FLAG(dp, IN_CHANGE | IN_UPDATE); /* * If the last named reference to a snapshot goes away, * drop its snapshot reference so that it will be reclaimed * when last open reference goes away. */ if (ip != NULL && (ip->i_flags & SF_SNAPSHOT) != 0 && ip->i_effnlink == 0) UFS_SNAPGONE(ip); return (error); } /* * Rewrite an existing directory entry to point at the inode * supplied. The parameters describing the directory entry are * set up by a call to namei. */ int ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) struct inode *dp, *oip; ino_t newinum; int newtype; int isrmdir; { struct buf *bp; struct direct *ep; struct vnode *vdp = ITOV(dp); int error; /* * Drop the link before we lock the buf so softdep can block if * necessary. */ oip->i_effnlink--; UFS_INODE_SET_FLAG(oip, IN_CHANGE); if (DOINGSOFTDEP(vdp)) { softdep_setup_unlink(dp, oip); } else { oip->i_nlink--; DIP_SET(oip, i_nlink, oip->i_nlink); UFS_INODE_SET_FLAG(oip, IN_CHANGE); } error = UFS_BLKATOFF(vdp, (off_t)I_OFFSET(dp), (char **)&ep, &bp); if (error == 0 && ep->d_namlen == 2 && ep->d_name[1] == '.' && ep->d_name[0] == '.' && ep->d_ino != oip->i_number) { brelse(bp); error = EIDRM; } if (error) { oip->i_effnlink++; UFS_INODE_SET_FLAG(oip, IN_CHANGE); if (DOINGSOFTDEP(vdp)) { softdep_change_linkcnt(oip); } else { oip->i_nlink++; DIP_SET(oip, i_nlink, oip->i_nlink); UFS_INODE_SET_FLAG(oip, IN_CHANGE); } return (error); } ep->d_ino = newinum; if (!OFSFMT(vdp)) ep->d_type = newtype; if (DOINGSOFTDEP(vdp)) { softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir); bdwrite(bp); } else { if (DOINGASYNC(vdp)) { bdwrite(bp); error = 0; } else { error = bwrite(bp); } } UFS_INODE_SET_FLAG(dp, IN_CHANGE | IN_UPDATE); /* * If the last named reference to a snapshot goes away, * drop its snapshot reference so that it will be reclaimed * when last open reference goes away. */ if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_effnlink == 0) UFS_SNAPGONE(oip); return (error); } /* * Check if a directory is empty or not. * Inode supplied must be locked. * * Using a struct dirtemplate here is not precisely * what we want, but better than using a struct direct. * * NB: does not handle corrupted directories. */ int ufs_dirempty(ip, parentino, cred) struct inode *ip; ino_t parentino; struct ucred *cred; { doff_t off; struct dirtemplate dbuf; struct direct *dp = (struct direct *)&dbuf; int error, namlen; ssize_t count; #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) for (off = 0; off < ip->i_size; off += dp->d_reclen) { error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, &count, (struct thread *)0); /* * Since we read MINDIRSIZ, residual must * be 0 unless we're at end of file. */ if (error || count != 0) return (0); /* avoid infinite loops */ if (dp->d_reclen == 0) return (0); /* skip empty entries */ if (dp->d_ino == 0 || dp->d_ino == UFS_WINO) continue; /* accept only "." and ".." */ # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(ITOV(ip))) namlen = dp->d_type; else namlen = dp->d_namlen; # else namlen = dp->d_namlen; # endif if (namlen > 2) return (0); if (dp->d_name[0] != '.') return (0); /* * At this point namlen must be 1 or 2. * 1 implies ".", 2 implies ".." if second * char is also "." */ if (namlen == 1 && dp->d_ino == ip->i_number) continue; if (dp->d_name[1] == '.' && dp->d_ino == parentino) continue; return (0); } return (1); } static int ufs_dir_dd_ino(struct vnode *vp, struct ucred *cred, ino_t *dd_ino, struct vnode **dd_vp) { struct dirtemplate dirbuf; struct vnode *ddvp; int error, namlen; ASSERT_VOP_LOCKED(vp, "ufs_dir_dd_ino"); *dd_vp = NULL; if (vp->v_type != VDIR) return (ENOTDIR); /* * First check to see if we have it in the name cache. */ if ((ddvp = vn_dir_dd_ino(vp)) != NULL) { KASSERT(ddvp->v_mount == vp->v_mount, ("ufs_dir_dd_ino: Unexpected mount point crossing")); *dd_ino = VTOI(ddvp)->i_number; *dd_vp = ddvp; return (0); } /* * Have to read the directory. */ error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, NULL, NULL); if (error != 0) return (error); #if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(vp)) namlen = dirbuf.dotdot_type; else namlen = dirbuf.dotdot_namlen; #else namlen = dirbuf.dotdot_namlen; #endif if (namlen != 2 || dirbuf.dotdot_name[0] != '.' || dirbuf.dotdot_name[1] != '.') return (ENOTDIR); *dd_ino = dirbuf.dotdot_ino; return (0); } /* * Check if source directory is in the path of the target directory. */ int ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino) { struct mount *mp; struct vnode *tvp, *vp, *vp1; int error; ino_t dd_ino; vp = tvp = ITOV(target); mp = vp->v_mount; *wait_ino = 0; if (target->i_number == source_ino) return (EEXIST); if (target->i_number == parent_ino) return (0); if (target->i_number == UFS_ROOTINO) return (0); for (;;) { error = ufs_dir_dd_ino(vp, cred, &dd_ino, &vp1); if (error != 0) break; if (dd_ino == source_ino) { error = EINVAL; break; } if (dd_ino == UFS_ROOTINO) break; if (dd_ino == parent_ino) break; if (vp1 == NULL) { error = VFS_VGET(mp, dd_ino, LK_SHARED | LK_NOWAIT, &vp1); if (error != 0) { *wait_ino = dd_ino; break; } } KASSERT(dd_ino == VTOI(vp1)->i_number, ("directory %ju reparented\n", (uintmax_t)VTOI(vp1)->i_number)); if (vp != tvp) vput(vp); vp = vp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (vp1 != NULL) vput(vp1); if (vp != tvp) vput(vp); return (error); } #ifdef DIAGNOSTIC static void ufs_assert_inode_offset_owner(struct inode *ip, struct iown_tracker *tr, const char *name, const char *file, int line) { char msg[128]; snprintf(msg, sizeof(msg), "at %s@%d", file, line); ASSERT_VOP_ELOCKED(ITOV(ip), msg); MPASS((ip->i_mode & IFMT) == IFDIR); if (curthread == tr->tr_owner && ip->i_lock_gen == tr->tr_gen) return; printf("locked at\n"); stack_print(&tr->tr_st); printf("unlocked at\n"); stack_print(&tr->tr_unlock); panic("%s ip %p %jd offset owner %p %d gen %d " "curthread %p %d gen %d at %s@%d\n", name, ip, (uintmax_t)ip->i_number, tr->tr_owner, tr->tr_owner->td_tid, tr->tr_gen, curthread, curthread->td_tid, ip->i_lock_gen, file, line); } static void ufs_set_inode_offset_owner(struct inode *ip, struct iown_tracker *tr, const char *file, int line) { char msg[128]; snprintf(msg, sizeof(msg), "at %s@%d", file, line); ASSERT_VOP_ELOCKED(ITOV(ip), msg); MPASS((ip->i_mode & IFMT) == IFDIR); tr->tr_owner = curthread; tr->tr_gen = ip->i_lock_gen; stack_save(&tr->tr_st); } static void ufs_init_one_tracker(struct iown_tracker *tr) { tr->tr_owner = NULL; stack_zero(&tr->tr_st); } void ufs_init_trackers(struct inode *ip) { ufs_init_one_tracker(&ip->i_offset_tracker); ufs_init_one_tracker(&ip->i_count_tracker); ufs_init_one_tracker(&ip->i_endoff_tracker); } void ufs_unlock_tracker(struct inode *ip) { if (ip->i_count_tracker.tr_gen == ip->i_lock_gen) stack_save(&ip->i_count_tracker.tr_unlock); if (ip->i_offset_tracker.tr_gen == ip->i_lock_gen) stack_save(&ip->i_offset_tracker.tr_unlock); if (ip->i_endoff_tracker.tr_gen == ip->i_lock_gen) stack_save(&ip->i_endoff_tracker.tr_unlock); ip->i_lock_gen++; } doff_t ufs_get_i_offset(struct inode *ip, const char *file, int line) { ufs_assert_inode_offset_owner(ip, &ip->i_offset_tracker, "i_offset", file, line); return (ip->i_offset); } void ufs_set_i_offset(struct inode *ip, doff_t off, const char *file, int line) { ufs_set_inode_offset_owner(ip, &ip->i_offset_tracker, file, line); ip->i_offset = off; } int32_t ufs_get_i_count(struct inode *ip, const char *file, int line) { ufs_assert_inode_offset_owner(ip, &ip->i_count_tracker, "i_count", file, line); return (ip->i_count); } void ufs_set_i_count(struct inode *ip, int32_t cnt, const char *file, int line) { ufs_set_inode_offset_owner(ip, &ip->i_count_tracker, file, line); ip->i_count = cnt; } doff_t ufs_get_i_endoff(struct inode *ip, const char *file, int line) { ufs_assert_inode_offset_owner(ip, &ip->i_endoff_tracker, "i_endoff", file, line); return (ip->i_endoff); } void ufs_set_i_endoff(struct inode *ip, doff_t off, const char *file, int line) { ufs_set_inode_offset_owner(ip, &ip->i_endoff_tracker, file, line); ip->i_endoff = off; } #endif diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 70bf1a1d9036..ef288a32e815 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1,3014 +1,3014 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_suiddir.h" #include "opt_ufs.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #endif #ifdef UFS_GJOURNAL #include FEATURE(ufs_gjournal, "Journaling support through GEOM for UFS"); #endif #ifdef QUOTA FEATURE(ufs_quota, "UFS disk quotas support"); FEATURE(ufs_quota64, "64bit UFS disk quotas support"); #endif #ifdef SUIDDIR FEATURE(suiddir, "Give all new files in directory the same ownership as the directory"); #endif VFS_SMR_DECLARE; #include static vop_accessx_t ufs_accessx; static vop_fplookup_vexec_t ufs_fplookup_vexec; static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ufs_close; static vop_create_t ufs_create; static vop_stat_t ufs_stat; static vop_getattr_t ufs_getattr; static vop_ioctl_t ufs_ioctl; static vop_link_t ufs_link; static int ufs_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *, const char *); static vop_mmapped_t ufs_mmapped; static vop_mkdir_t ufs_mkdir; static vop_mknod_t ufs_mknod; static vop_open_t ufs_open; static vop_pathconf_t ufs_pathconf; static vop_print_t ufs_print; static vop_readlink_t ufs_readlink; static vop_remove_t ufs_remove; static vop_rename_t ufs_rename; static vop_rmdir_t ufs_rmdir; static vop_setattr_t ufs_setattr; static vop_strategy_t ufs_strategy; static vop_symlink_t ufs_symlink; static vop_whiteout_t ufs_whiteout; static vop_close_t ufsfifo_close; static vop_kqfilter_t ufsfifo_kqfilter; SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "UFS filesystem"); /* * A virgin directory (no blushing please). */ static struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, ".", 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." }; static struct odirtemplate omastertemplate = { 0, 12, 1, ".", 0, DIRBLKSIZ - 12, 2, ".." }; static void ufs_itimes_locked(struct vnode *vp) { struct inode *ip; struct timespec ts; ASSERT_VI_LOCKED(vp, __func__); ip = VTOI(vp); if (UFS_RDONLY(ip)) goto out; if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp)) UFS_INODE_SET_FLAG(ip, IN_LAZYMOD); else if (((vp->v_mount->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) || (ip->i_flag & (IN_CHANGE | IN_UPDATE))) UFS_INODE_SET_FLAG(ip, IN_MODIFIED); else if (ip->i_flag & IN_ACCESS) UFS_INODE_SET_FLAG(ip, IN_LAZYACCESS); vfs_timestamp(&ts); if (ip->i_flag & IN_ACCESS) { DIP_SET(ip, i_atime, ts.tv_sec); DIP_SET(ip, i_atimensec, ts.tv_nsec); } if (ip->i_flag & IN_UPDATE) { DIP_SET(ip, i_mtime, ts.tv_sec); DIP_SET(ip, i_mtimensec, ts.tv_nsec); } if (ip->i_flag & IN_CHANGE) { DIP_SET(ip, i_ctime, ts.tv_sec); DIP_SET(ip, i_ctimensec, ts.tv_nsec); DIP_SET(ip, i_modrev, DIP(ip, i_modrev) + 1); } out: ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } void ufs_itimes(struct vnode *vp) { VI_LOCK(vp); ufs_itimes_locked(vp); VI_UNLOCK(vp); } /* * Create a regular file */ static int ufs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; error = ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp, "ufs_create"); if (error != 0) return (error); if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ static int ufs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp, "ufs_mknod"); if (error) return (error); ip = VTOI(*vpp); UFS_INODE_SET_FLAG(ip, IN_ACCESS | IN_CHANGE | IN_UPDATE); if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ DIP_SET(ip, i_rdev, vap->va_rdev); } /* * Remove inode, then reload it through VFS_VGET(). This is * needed to do further inode initialization, for instance * fifo, which was too early for VFS_VGET() done as part of * UFS_VALLOC(). */ (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); vput(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { *vpp = NULL; return (error); } return (0); } /* * Open called. */ /* ARGSUSED */ static int ufs_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); ip = VTOI(vp); vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td); if (vp->v_type == VREG && (vn_irflag_read(vp) & VIRF_PGREAD) == 0) { vn_irflag_set_cond(vp, VIRF_PGREAD); } /* * Files marked append-only must be opened for appending. */ if ((ip->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ static int ufs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; int usecount; VI_LOCK(vp); usecount = vp->v_usecount; if (usecount > 1) ufs_itimes_locked(vp); VI_UNLOCK(vp); return (0); } static int ufs_accessx(ap) struct vop_accessx_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; #ifdef UFS_ACL struct acl *acl; acl_type_t type; #endif /* * Disallow write attempts on read-only filesystems; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (accmode & VMODIFY_PERMS) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA /* * Inode is accounted in the quotas only if struct * dquot is attached to it. VOP_ACCESS() is called * from vn_open_cred() and provides a convenient * point to call getinoquota(). The lock mode is * exclusive when the file is opening for write. */ if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { error = getinoquota(ip); if (error != 0) return (error); } #endif break; default: break; } } /* * If immutable bit set, nobody gets to write it. "& ~VADMIN_PERMS" * permits the owner of the file to remove the IMMUTABLE flag. */ if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT))) return (EPERM); #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) != 0) { if (vp->v_mount->mnt_flag & MNT_NFS4ACLS) type = ACL_TYPE_NFS4; else type = ACL_TYPE_ACCESS; acl = acl_alloc(M_WAITOK); if (type == ACL_TYPE_NFS4) error = ufs_getacl_nfs4_internal(vp, acl, ap->a_td); else error = VOP_GETACL(vp, type, acl, ap->a_cred, ap->a_td); switch (error) { case 0: if (type == ACL_TYPE_NFS4) { error = vaccess_acl_nfs4(vp->v_type, ip->i_uid, ip->i_gid, acl, accmode, ap->a_cred); } else { error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess_acl_posix1e(vp->v_type, ip->i_uid, ip->i_gid, acl, accmode, ap->a_cred); } break; default: if (error != EOPNOTSUPP) printf( "ufs_accessx(): Error retrieving ACL on object (%d).\n", error); /* * XXX: Fall back until debugged. Should * eventually possibly log an error, and return * EPERM for safety. */ error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, accmode, ap->a_cred); } acl_free(acl); return (error); } #endif /* !UFS_ACL */ error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, accmode, ap->a_cred); return (error); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ static int ufs_fplookup_vexec(ap) struct vop_fplookup_vexec_args /* { struct vnode *a_vp; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp; struct inode *ip; struct ucred *cred; mode_t all_x, mode; vp = ap->a_vp; ip = VTOI_SMR(vp); if (__predict_false(ip == NULL)) return (EAGAIN); /* * XXX ACL race * * ACLs are not supported and UFS clears/sets this flag on mount and * remount. However, we may still be racing with seeing them and there * is no provision to make sure they were accounted for. This matches * the behavior of the locked case, since the lookup there is also * racy: mount takes no measures to block anyone from progressing. */ all_x = S_IXUSR | S_IXGRP | S_IXOTH; mode = atomic_load_short(&ip->i_mode); if (__predict_true((mode & all_x) == all_x)) return (0); cred = ap->a_cred; return (vaccess_vexec_smr(mode, ip->i_uid, ip->i_gid, cred)); } /* ARGSUSED */ static int ufs_stat(struct vop_stat_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct stat *sb = ap->a_sb; int error; error = vop_stat_helper_pre(ap); if (__predict_false(error)) return (error); VI_LOCK(vp); ufs_itimes_locked(vp); if (I_IS_UFS1(ip)) { sb->st_atim.tv_sec = ip->i_din1->di_atime; sb->st_atim.tv_nsec = ip->i_din1->di_atimensec; } else { sb->st_atim.tv_sec = ip->i_din2->di_atime; sb->st_atim.tv_nsec = ip->i_din2->di_atimensec; } VI_UNLOCK(vp); sb->st_dev = dev2udev(ITOUMP(ip)->um_dev); sb->st_ino = ip->i_number; sb->st_mode = (ip->i_mode & ~IFMT) | VTTOIF(vp->v_type); sb->st_nlink = ip->i_effnlink; sb->st_uid = ip->i_uid; sb->st_gid = ip->i_gid; if (I_IS_UFS1(ip)) { sb->st_rdev = ip->i_din1->di_rdev; sb->st_size = ip->i_din1->di_size; sb->st_mtim.tv_sec = ip->i_din1->di_mtime; sb->st_mtim.tv_nsec = ip->i_din1->di_mtimensec; sb->st_ctim.tv_sec = ip->i_din1->di_ctime; sb->st_ctim.tv_nsec = ip->i_din1->di_ctimensec; sb->st_birthtim.tv_sec = -1; sb->st_birthtim.tv_nsec = 0; sb->st_blocks = dbtob((u_quad_t)ip->i_din1->di_blocks) / S_BLKSIZE; } else { sb->st_rdev = ip->i_din2->di_rdev; sb->st_size = ip->i_din2->di_size; sb->st_mtim.tv_sec = ip->i_din2->di_mtime; sb->st_mtim.tv_nsec = ip->i_din2->di_mtimensec; sb->st_ctim.tv_sec = ip->i_din2->di_ctime; sb->st_ctim.tv_nsec = ip->i_din2->di_ctimensec; sb->st_birthtim.tv_sec = ip->i_din2->di_birthtime; sb->st_birthtim.tv_nsec = ip->i_din2->di_birthnsec; sb->st_blocks = dbtob((u_quad_t)ip->i_din2->di_blocks) / S_BLKSIZE; } sb->st_blksize = max(PAGE_SIZE, vp->v_mount->mnt_stat.f_iosize); sb->st_flags = ip->i_flags; sb->st_gen = ip->i_gen; return (vop_stat_helper_post(ap, error)); } /* ARGSUSED */ static int ufs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct vattr *vap = ap->a_vap; VI_LOCK(vp); ufs_itimes_locked(vp); if (I_IS_UFS1(ip)) { vap->va_atime.tv_sec = ip->i_din1->di_atime; vap->va_atime.tv_nsec = ip->i_din1->di_atimensec; } else { vap->va_atime.tv_sec = ip->i_din2->di_atime; vap->va_atime.tv_nsec = ip->i_din2->di_atimensec; } VI_UNLOCK(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ITOUMP(ip)->um_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_effnlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; if (I_IS_UFS1(ip)) { vap->va_rdev = ip->i_din1->di_rdev; vap->va_size = ip->i_din1->di_size; vap->va_mtime.tv_sec = ip->i_din1->di_mtime; vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec; vap->va_ctime.tv_sec = ip->i_din1->di_ctime; vap->va_ctime.tv_nsec = ip->i_din1->di_ctimensec; vap->va_bytes = dbtob((u_quad_t)ip->i_din1->di_blocks); vap->va_filerev = ip->i_din1->di_modrev; } else { vap->va_rdev = ip->i_din2->di_rdev; vap->va_size = ip->i_din2->di_size; vap->va_mtime.tv_sec = ip->i_din2->di_mtime; vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec; vap->va_ctime.tv_sec = ip->i_din2->di_ctime; vap->va_ctime.tv_nsec = ip->i_din2->di_ctimensec; vap->va_birthtime.tv_sec = ip->i_din2->di_birthtime; vap->va_birthtime.tv_nsec = ip->i_din2->di_birthnsec; vap->va_bytes = dbtob((u_quad_t)ip->i_din2->di_blocks); vap->va_filerev = ip->i_din2->di_modrev; } vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_type = IFTOVT(ip->i_mode); return (0); } /* * Set attribute vnode op. called from several syscalls */ static int ufs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct thread *td = curthread; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { if ((vap->va_flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | SF_SNAPSHOT | UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | UF_SPARSE | UF_SYSTEM)) != 0) return (EOPNOTSUPP); if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the PR_ALLOW_CHFLAGS permission bit is set; * otherwise, they behave like unprivileged processes. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } /* The snapshot flag cannot be toggled. */ if ((vap->va_flags ^ ip->i_flags) & SF_SNAPSHOT) return (EPERM); } else { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE)) return (EPERM); } ip->i_flags = vap->va_flags; DIP_SET(ip, i_flags, vap->va_flags); UFS_INODE_SET_FLAG(ip, IN_CHANGE); error = UFS_UPDATE(vp, 0); if (ip->i_flags & (IMMUTABLE | APPEND)) return (error); } /* * If immutable or append, no one can change any of its attributes * except the ones already handled (in some cases, file flags * including the immutability flags themselves for the superuser). */ if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, td)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * XXX most of the following special cases should be in * callers instead of in N filesystems. The VDIR check * mostly already is. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: /* * Truncation should have an effect in these cases. * Disallow it if the filesystem is read-only or * the file is being snapshotted. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support shared memory objects in the file * system, and have dubious support for truncating * symlinks. Just ignore the request in other cases. */ return (0); } if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL | ((vap->va_vaflags & VA_SYNC) != 0 ? IO_SYNC : 0), cred)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); error = vn_utimes_perm(vp, vap, cred, td); if (error != 0) return (error); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); if (vap->va_atime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_ACCESS; DIP_SET(ip, i_atime, vap->va_atime.tv_sec); DIP_SET(ip, i_atimensec, vap->va_atime.tv_nsec); } if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_UPDATE; DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec); DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec); } if (vap->va_birthtime.tv_sec != VNOVAL && I_IS_UFS2(ip)) { ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec; ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec; } error = UFS_UPDATE(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH))) return (EPERM); error = ufs_chmod(vp, (int)vap->va_mode, cred, td); } return (error); } #ifdef UFS_ACL static int ufs_update_nfs4_acl_after_mode_change(struct vnode *vp, int mode, int file_owner_id, struct ucred *cred, struct thread *td) { int error; struct acl *aclp; aclp = acl_alloc(M_WAITOK); error = ufs_getacl_nfs4_internal(vp, aclp, td); /* * We don't have to handle EOPNOTSUPP here, as the filesystem claims * it supports ACLs. */ if (error) goto out; acl_nfs4_sync_acl_from_mode(aclp, mode, file_owner_id); error = ufs_setacl_nfs4_internal(vp, aclp, td); out: acl_free(aclp); return (error); } #endif /* UFS_ACL */ static int ufs_mmapped(ap) struct vop_mmapped_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp; struct inode *ip; struct mount *mp; vp = ap->a_vp; ip = VTOI(vp); mp = vp->v_mount; if ((mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS); /* * XXXKIB No UFS_UPDATE(ap->a_vp, 0) there. */ return (0); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(vp, mode, cred, td) struct vnode *vp; int mode; struct ucred *cred; struct thread *td; { struct inode *ip = VTOI(vp); int newmode, error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred, td))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. Both of these are allowed in * jail(8). */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) return (EFTYPE); } if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID); if (error) return (error); } /* * Deny setting setuid if we are not the file owner. */ if ((mode & ISUID) && ip->i_uid != cred->cr_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN); if (error) return (error); } newmode = ip->i_mode & ~ALLPERMS; newmode |= (mode & ALLPERMS); UFS_INODE_SET_MODE(ip, newmode); DIP_SET(ip, i_mode, ip->i_mode); UFS_INODE_SET_FLAG(ip, IN_CHANGE); #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & MNT_NFS4ACLS) != 0) error = ufs_update_nfs4_acl_after_mode_change(vp, mode, ip->i_uid, cred, td); #endif if (error == 0 && (ip->i_flag & IN_CHANGE) != 0) error = UFS_UPDATE(vp, 0); return (error); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ufs_chown(vp, uid, gid, cred, td) struct vnode *vp; uid_t uid; gid_t gid; struct ucred *cred; struct thread *td; { struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; #ifdef QUOTA int i; ufs2_daddr_t change; #endif if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td))) return (error); /* * To change the owner of a file, or change the group of a file to a * group of which we are not a member, the caller must have * privilege. */ if (((uid != ip->i_uid && uid != cred->cr_uid) || (gid != ip->i_gid && !groupmember(gid, cred))) && (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) return (error); ogid = ip->i_gid; ouid = ip->i_uid; #ifdef QUOTA if ((error = getinoquota(ip)) != 0) return (error); if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } change = DIP(ip, i_blocks); (void) chkdq(ip, -change, cred, CHOWN|FORCE); (void) chkiq(ip, -1, cred, CHOWN|FORCE); for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } #endif ip->i_gid = gid; DIP_SET(ip, i_gid, gid); ip->i_uid = uid; DIP_SET(ip, i_uid, uid); #ifdef QUOTA if ((error = getinoquota(ip)) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) goto good; else (void) chkdq(ip, -change, cred, CHOWN|FORCE); } for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } ip->i_gid = ogid; DIP_SET(ip, i_gid, ogid); ip->i_uid = ouid; DIP_SET(ip, i_uid, ouid); if (getinoquota(ip) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } (void) chkdq(ip, change, cred, FORCE|CHOWN); (void) chkiq(ip, 1, cred, FORCE|CHOWN); (void) getinoquota(ip); } return (error); good: if (getinoquota(ip)) panic("ufs_chown: lost quota"); #endif /* QUOTA */ UFS_INODE_SET_FLAG(ip, IN_CHANGE); if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID)); DIP_SET(ip, i_mode, ip->i_mode); } } error = UFS_UPDATE(vp, 0); return (error); } static int ufs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; struct thread *td; td = curthread; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) return (EPERM); if (DOINGSUJ(dvp)) { error = softdep_prelink(dvp, vp); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); if (ip->i_nlink <= 0) vp->v_vflag |= VV_NOSYNC; if ((ip->i_flags & SF_SNAPSHOT) != 0) { /* * Avoid deadlock where another thread is trying to * update the inodeblock for dvp and is waiting on * snaplk. Temporary unlock the vnode lock for the * unlinked file and sync the directory. This should * allow vput() of the directory to not block later on * while holding the snapshot vnode locked, assuming * that the directory hasn't been unlinked too. */ VOP_UNLOCK(vp); (void) VOP_FSYNC(dvp, MNT_WAIT, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } return (error); } static void print_bad_link_count(const char *funcname, struct vnode *dvp) { struct inode *dip; dip = VTOI(dvp); uprintf("%s: Bad link count %d on parent inode %jd in file system %s\n", funcname, dip->i_effnlink, (intmax_t)dip->i_number, dvp->v_mount->mnt_stat.f_mntonname); } /* * link vnode call */ static int ufs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; struct direct newdir; int error; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_link: no name"); #endif if (DOINGSUJ(tdvp)) { error = softdep_prelink(tdvp, vp); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } if (VTOI(tdvp)->i_effnlink < 2) { print_bad_link_count("ufs_link", tdvp); error = EINVAL; goto out; } ip = VTOI(vp); if (ip->i_nlink >= UFS_LINK_MAX) { error = EMLINK; goto out; } /* * The file may have been removed after namei droped the original * lock. */ if (ip->i_effnlink == 0) { error = ENOENT; goto out; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } ip->i_effnlink++; ip->i_nlink++; DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(vp)) softdep_setup_link(VTOI(tdvp), ip); error = UFS_UPDATE(vp, !DOINGSOFTDEP(vp) && !DOINGASYNC(vp)); if (!error) { ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL); } if (error) { ip->i_effnlink--; ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(vp)) softdep_revert_link(VTOI(tdvp), ip); } out: return (error); } /* * whiteout vnode call */ static int ufs_whiteout(ap) struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct direct newdir; int error = 0; if (DOINGSUJ(dvp) && (ap->a_flags == CREATE || ap->a_flags == DELETE)) { error = softdep_prelink(dvp, NULL); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ - if (dvp->v_mount->mnt_maxsymlinklen > 0) + if (!OFSFMT(dvp)) return (0); return (EOPNOTSUPP); case CREATE: /* create a new directory whiteout */ #ifdef INVARIANTS if ((cnp->cn_flags & SAVENAME) == 0) panic("ufs_whiteout: missing name"); - if (dvp->v_mount->mnt_maxsymlinklen <= 0) + if (OFSFMT(dvp)) panic("ufs_whiteout: old format filesystem"); #endif newdir.d_ino = UFS_WINO; newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL); break; case DELETE: /* remove an existing directory whiteout */ #ifdef INVARIANTS - if (dvp->v_mount->mnt_maxsymlinklen <= 0) + if (OFSFMT(dvp)) panic("ufs_whiteout: old format filesystem"); #endif cnp->cn_flags &= ~DOWHITEOUT; error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0); break; default: panic("ufs_whiteout: unknown op"); } return (error); } static volatile int rename_restarts; SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD, __DEVOLATILE(int *, &rename_restarts), 0, "Times rename had to restart due to lock contention"); /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ static int ufs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tvp = ap->a_tvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *nvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct thread *td = fcnp->cn_thread; struct inode *fip, *tip, *tdp, *fdp; struct direct newdir; off_t endoff; int doingdirectory, newparent; int error = 0; struct mount *mp; ino_t ino; bool want_seqc_end; want_seqc_end = false; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif endoff = 0; mp = tdvp->v_mount; VOP_UNLOCK(tdvp); if (tvp && tvp != tdvp) VOP_UNLOCK(tvp); /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; mp = NULL; goto releout; } relock: /* * We need to acquire 2 to 4 locks depending on whether tvp is NULL * and fdvp and tdvp are the same directory. Subsequently we need * to double-check all paths and in the directory rename case we * need to verify that we are not creating a directory loop. To * handle this we acquire all but fdvp using non-blocking * acquisitions. If we fail to acquire any lock in the path we will * drop all held locks, acquire the new lock in a blocking fashion, * and then release it and restart the rename. This acquire/release * step ensures that we do not spin on a lock waiting for release. */ error = vn_lock(fdvp, LK_EXCLUSIVE); if (error) goto releout; if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(fdvp); error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto releout; VOP_UNLOCK(tdvp); atomic_add_int(&rename_restarts, 1); goto relock; } /* * Re-resolve fvp to be certain it still exists and fetch the * correct vnode. */ error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); if (error) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); goto releout; } error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (error) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); if (error != EBUSY) goto releout; error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; VOP_UNLOCK(nvp); vrele(fvp); fvp = nvp; atomic_add_int(&rename_restarts, 1); goto relock; } vrele(fvp); fvp = nvp; /* * Re-resolve tvp and acquire the vnode lock if present. */ error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino); if (error != 0 && error != EJUSTRETURN) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); VOP_UNLOCK(fvp); goto releout; } /* * If tvp disappeared we just carry on. */ if (error == EJUSTRETURN && tvp != NULL) { vrele(tvp); tvp = NULL; } /* * Get the tvp ino if the lookup succeeded. We may have to restart * if the non-blocking acquire fails. */ if (error == 0) { nvp = NULL; error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (tvp) vrele(tvp); tvp = nvp; if (error) { VOP_UNLOCK(fdvp); VOP_UNLOCK(tdvp); VOP_UNLOCK(fvp); if (error != EBUSY) goto releout; error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; vput(nvp); atomic_add_int(&rename_restarts, 1); goto relock; } } if (DOINGSOFTDEP(fdvp)) { error = softdep_prerename(fdvp, fvp, tdvp, tvp); if (error != 0) { if (error == ERELOOKUP) { atomic_add_int(&rename_restarts, 1); goto relock; } goto releout; } } fdp = VTOI(fdvp); fip = VTOI(fvp); tdp = VTOI(tdvp); tip = NULL; if (tvp) tip = VTOI(tvp); if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto unlockout; } /* * Renaming a file to itself has no effect. The upper layers should * not call us in that case. However, things could change after * we drop the locks above. */ if (fvp == tvp) { error = 0; goto unlockout; } doingdirectory = 0; newparent = 0; ino = fip->i_number; if (fip->i_nlink >= UFS_LINK_MAX) { error = EMLINK; goto unlockout; } if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (fdp->i_flags & APPEND)) { error = EPERM; goto unlockout; } if ((fip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || fdp == fip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } if (fdp->i_number != tdp->i_number) newparent = tdp->i_number; doingdirectory = 1; } if ((fvp->v_type == VDIR && fvp->v_mountedhere != NULL) || (tvp != NULL && tvp->v_type == VDIR && tvp->v_mountedhere != NULL)) { error = EXDEV; goto unlockout; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". */ if (doingdirectory && newparent) { error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); if (error) goto unlockout; error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred, &ino); /* * We encountered a lock that we have to wait for. Unlock * everything else and VGET before restarting. */ if (ino) { VOP_UNLOCK(fdvp); VOP_UNLOCK(fvp); VOP_UNLOCK(tdvp); if (tvp) VOP_UNLOCK(tvp); error = VFS_VGET(mp, ino, LK_SHARED, &nvp); if (error == 0) vput(nvp); atomic_add_int(&rename_restarts, 1); goto relock; } if (error) goto unlockout; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); } if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 || tdp->i_effnlink == 0) panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp); if (tvp != NULL) vn_seqc_write_begin(tvp); vn_seqc_write_begin(tdvp); vn_seqc_write_begin(fvp); vn_seqc_write_begin(fdvp); want_seqc_end = true; /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ fip->i_effnlink++; fip->i_nlink++; DIP_SET(fip, i_nlink, fip->i_nlink); UFS_INODE_SET_FLAG(fip, IN_CHANGE); if (DOINGSOFTDEP(fvp)) softdep_setup_link(tdp, fip); error = UFS_UPDATE(fvp, !DOINGSOFTDEP(fvp) && !DOINGASYNC(fvp)); if (error) goto bad; /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (tip == NULL) { if (ITODEV(tdp) != ITODEV(fip)) panic("ufs_rename: EXDEV"); if (doingdirectory && newparent) { /* * Account for ".." in new directory. * When source and destination have the same * parent we don't adjust the link count. The * actual link modification is completed when * .. is rewritten below. */ if (tdp->i_nlink >= UFS_LINK_MAX) { error = EMLINK; goto bad; } } ufs_makedirentry(fip, tcnp, &newdir); error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL); if (error) goto bad; /* Setup tdvp for directory compaction if needed. */ if (I_COUNT(tdp) != 0 && I_ENDOFF(tdp) != 0 && I_ENDOFF(tdp) < tdp->i_size) endoff = I_ENDOFF(tdp); } else { if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip)) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (tip->i_number == fip->i_number) panic("ufs_rename: same file"); /* * If the parent directory is "sticky", then the caller * must possess VADMIN for the parent directory, or the * destination of the rename. This implements append-only * directories. */ if ((tdp->i_mode & S_ISTXT) && VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) && VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((tip->i_mode & IFMT) == IFDIR) { if ((tip->i_effnlink > 2) || !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } if (doingdirectory) { if (!newparent) { tdp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(tdp); } tip->i_effnlink--; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(tip); } error = ufs_dirrewrite(tdp, tip, fip->i_number, IFTODT(fip->i_mode), (doingdirectory && newparent) ? newparent : doingdirectory); if (error) { if (doingdirectory) { if (!newparent) { tdp->i_effnlink++; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(tdp); } tip->i_effnlink++; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(tip); } goto bad; } if (doingdirectory && !DOINGSOFTDEP(tvp)) { /* * The only stuff left in the directory is "." * and "..". The "." reference is inconsequential * since we are quashing it. We have removed the "." * reference and the reference in the parent directory, * but there may be other hard links. The soft * dependency code will arrange to do these operations * after the parent directory entry has been deleted on * disk, so when running with that code we avoid doing * them now. */ if (!newparent) { tdp->i_nlink--; DIP_SET(tdp, i_nlink, tdp->i_nlink); UFS_INODE_SET_FLAG(tdp, IN_CHANGE); } tip->i_nlink--; DIP_SET(tip, i_nlink, tip->i_nlink); UFS_INODE_SET_FLAG(tip, IN_CHANGE); } } /* * 3) Unlink the source. We have to resolve the path again to * fixup the directory offset and count for ufs_dirremove. */ if (fdvp == tdvp) { error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); if (error) panic("ufs_rename: from entry went away!"); if (ino != fip->i_number) panic("ufs_rename: ino mismatch %ju != %ju\n", (uintmax_t)ino, (uintmax_t)fip->i_number); } /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { /* * If tip exists we simply use its link, otherwise we must * add a new one. */ if (tip == NULL) { tdp->i_effnlink++; tdp->i_nlink++; DIP_SET(tdp, i_nlink, tdp->i_nlink); UFS_INODE_SET_FLAG(tdp, IN_CHANGE); if (DOINGSOFTDEP(tdvp)) softdep_setup_dotdot_link(tdp, fip); error = UFS_UPDATE(tdvp, !DOINGSOFTDEP(tdvp) && !DOINGASYNC(tdvp)); /* Don't go to bad here as the new link exists. */ if (error) goto unlockout; } else if (DOINGSUJ(tdvp)) /* Journal must account for each new link. */ softdep_setup_dotdot_link(tdp, fip); SET_I_OFFSET(fip, mastertemplate.dot_reclen); ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0); cache_purge(fdvp); } error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0); /* * The kern_renameat() looks up the fvp using the DELETE flag, which * causes the removal of the name cache entry for fvp. * As the relookup of the fvp is done in two steps: * ufs_lookup_ino() and then VFS_VGET(), another thread might do a * normal lookup of the from name just before the VFS_VGET() call, * causing the cache entry to be re-instantiated. * * The same issue also applies to tvp if it exists as * otherwise we may have a stale name cache entry for the new * name that references the old i-node if it has other links * or open file descriptors. */ cache_vop_rename(fdvp, fvp, tdvp, tvp, fcnp, tcnp); unlockout: if (want_seqc_end) { if (tvp != NULL) vn_seqc_write_end(tvp); vn_seqc_write_end(tdvp); vn_seqc_write_end(fvp); vn_seqc_write_end(fdvp); } vput(fdvp); vput(fvp); /* * If compaction or fsync was requested do it in * ffs_vput_pair() now that other locks are no longer needed. */ if (error == 0 && endoff != 0) { UFS_INODE_SET_FLAG(tdp, IN_ENDOFF); SET_I_ENDOFF(tdp, endoff); } VOP_VPUT_PAIR(tdvp, &tvp, true); return (error); bad: fip->i_effnlink--; fip->i_nlink--; DIP_SET(fip, i_nlink, fip->i_nlink); UFS_INODE_SET_FLAG(fip, IN_CHANGE); if (DOINGSOFTDEP(fvp)) softdep_revert_link(tdp, fip); goto unlockout; releout: if (want_seqc_end) { if (tvp != NULL) vn_seqc_write_end(tvp); vn_seqc_write_end(tdvp); vn_seqc_write_end(fvp); vn_seqc_write_end(fdvp); } vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp) vrele(tvp); return (error); } #ifdef UFS_ACL static int ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, mode_t dmode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *dacl, *acl; acl = acl_alloc(M_WAITOK); dacl = acl_alloc(M_WAITOK); /* * Retrieve default ACL from parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. If the ACL is empty, fall through to * the "not defined or available" case. */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); UFS_INODE_SET_MODE(ip, dmode); DIP_SET(ip, i_mode, dmode); *dacl = *acl; ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ UFS_INODE_SET_MODE(ip, dmode); DIP_SET(ip, i_mode, dmode); error = 0; goto out; default: goto out; } /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); if (error == 0) error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above * was supposed to free acl. */ printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); /* panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); acl_free(dacl); return (error); } static int ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, mode_t mode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *acl; acl = acl_alloc(M_WAITOK); /* * Retrieve default ACL for parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. */ if (acl->acl_cnt != 0) { /* * Two possible ways for default ACL to not * be present. First, the EA can be * undefined, or second, the default ACL can * be blank. If it's blank, fall through to * the it's not defined case. */ mode = acl_posix1e_newfilemode(mode, acl); UFS_INODE_SET_MODE(ip, mode); DIP_SET(ip, i_mode, mode); ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ UFS_INODE_SET_MODE(ip, mode); DIP_SET(ip, i_mode, mode); error = 0; goto out; default: goto out; } /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above was * supposed to free acl. */ printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()\n"); /* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); return (error); } static int ufs_do_nfs4_acl_inheritance(struct vnode *dvp, struct vnode *tvp, mode_t child_mode, struct ucred *cred, struct thread *td) { int error; struct acl *parent_aclp, *child_aclp; parent_aclp = acl_alloc(M_WAITOK); child_aclp = acl_alloc(M_WAITOK | M_ZERO); error = ufs_getacl_nfs4_internal(dvp, parent_aclp, td); if (error) goto out; acl_nfs4_compute_inherited_acl(parent_aclp, child_aclp, child_mode, VTOI(tvp)->i_uid, tvp->v_type == VDIR); error = ufs_setacl_nfs4_internal(tvp, child_aclp, td); if (error) goto out; out: acl_free(parent_aclp); acl_free(child_aclp); return (error); } #endif /* * Mkdir system call */ static int ufs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; struct vnode *tvp; struct buf *bp; struct dirtemplate dirtemplate, *dtp; struct direct newdir; int error, dmode; long blkoff; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_mkdir: no name"); #endif dp = VTOI(dvp); if (dp->i_nlink >= UFS_LINK_MAX) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ if (dp->i_effnlink < 2) { print_bad_link_count("ufs_mkdir", dvp); error = EINVAL; goto out; } if (DOINGSUJ(dvp)) { error = softdep_prelink(dvp, NULL); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; vn_seqc_write_begin(tvp); ip = VTOI(tvp); ip->i_gid = dp->i_gid; DIP_SET(ip, i_gid, dp->i_gid); #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; gid_t ucred_group; ucp = cnp->cn_cred; #endif /* * If we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; DIP_SET(ip, i_uid, dp->i_uid); #ifdef QUOTA if (dp->i_uid != cnp->cn_cred->cr_uid) { /* * Make sure the correct user gets charged * for the space. * Make a dummy credential for the victim. * XXX This seems to never be accessed out of * our context so a stack variable is ok. */ refcount_init(&ucred.cr_ref, 1); ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; ucred.cr_groups[0] = dp->i_gid; ucp = &ucred; } #endif } else { ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); } #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ UFS_INODE_SET_FLAG(ip, IN_ACCESS | IN_CHANGE | IN_UPDATE); UFS_INODE_SET_MODE(ip, dmode); DIP_SET(ip, i_mode, dmode); tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 2; ip->i_nlink = 2; DIP_SET(ip, i_nlink, 2); if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); } /* * Bump link count in parent directory to reflect work done below. * Should be done before reference is created so cleanup is * possible if we crash. */ dp->i_effnlink++; dp->i_nlink++; DIP_SET(dp, i_nlink, dp->i_nlink); UFS_INODE_SET_FLAG(dp, IN_CHANGE); if (DOINGSOFTDEP(dvp)) softdep_setup_mkdir(dp, ip); error = UFS_UPDATE(dvp, !DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)); if (error) goto bad; #ifdef MAC if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, dvp, tvp, cnp); if (error) goto bad; } #endif #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ufs_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, dmode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* !UFS_ACL */ /* * Initialize directory with "." and ".." from static template. */ - if (dvp->v_mount->mnt_maxsymlinklen > 0) + if (!OFSFMT(dvp)) dtp = &mastertemplate; else dtp = (struct dirtemplate *)&omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; vnode_pager_setsize(tvp, DIRBLKSIZ); if ((error = UFS_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred, BA_CLRBUF, &bp)) != 0) goto bad; ip->i_size = DIRBLKSIZ; DIP_SET(ip, i_size, DIRBLKSIZ); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate); if (DOINGSOFTDEP(tvp)) { /* * Ensure that the entire newly allocated block is a * valid directory so that future growth within the * block does not have to ensure that the block is * written before the inode. */ blkoff = DIRBLKSIZ; while (blkoff < bp->b_bcount) { ((struct direct *) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } } if ((error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) && !DOINGASYNC(tvp))) != 0) { (void)bwrite(bp); goto bad; } /* * Directory set up, now install its entry in the parent directory. * * If we are not doing soft dependencies, then we must write out the * buffer containing the new directory body before entering the new * name in the parent. If we are doing soft dependencies, then the * buffer containing the new directory body will be passed to and * released in the soft dependency code after the code has attached * an appropriate ordering dependency to the buffer which ensures that * the buffer is written before the new name is written in the parent. */ if (DOINGASYNC(dvp)) bdwrite(bp); else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp)))) goto bad; ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, bp); bad: if (error == 0) { *ap->a_vpp = tvp; vn_seqc_write_end(tvp); } else { dp->i_effnlink--; dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); UFS_INODE_SET_FLAG(dp, IN_CHANGE); /* * No need to do an explicit VOP_TRUNCATE here, vrele will * do this for us because we set the link count to 0. */ ip->i_effnlink = 0; ip->i_nlink = 0; DIP_SET(ip, i_nlink, 0); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(tvp)) softdep_revert_mkdir(dp, ip); vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); } out: return (error); } /* * Rmdir system call. */ static int ufs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error; ip = VTOI(vp); dp = VTOI(dvp); /* * Do not remove a directory that is in the process of being renamed. * Verify the directory is empty (and valid). Rmdir ".." will not be * valid since ".." will contain a reference to the current directory * and thus be non-empty. Do not allow the removal of mounted on * directories (this can happen when an NFS exported filesystem * tries to remove a locally mounted on directory). */ error = 0; if (dp->i_effnlink <= 2) { if (dp->i_effnlink == 2) print_bad_link_count("ufs_rmdir", dvp); error = EINVAL; goto out; } if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } if (vp->v_mountedhere != 0) { error = EINVAL; goto out; } if (DOINGSUJ(dvp)) { error = softdep_prelink(dvp, vp); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ dp->i_effnlink--; ip->i_effnlink--; if (DOINGSOFTDEP(vp)) softdep_setup_rmdir(dp, ip); error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); if (error) { dp->i_effnlink++; ip->i_effnlink++; if (DOINGSOFTDEP(vp)) softdep_revert_rmdir(dp, ip); goto out; } /* * The only stuff left in the directory is "." and "..". The "." * reference is inconsequential since we are quashing it. The soft * dependency code will arrange to do these operations after * the parent directory entry has been deleted on disk, so * when running with that code we avoid doing them now. */ if (!DOINGSOFTDEP(vp)) { dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); UFS_INODE_SET_FLAG(dp, IN_CHANGE); error = UFS_UPDATE(dvp, 0); ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); } cache_vop_rmdir(dvp, vp); #ifdef UFS_DIRHASH /* Kill any active hash; i_effnlink == 0, so it will not come back. */ if (ip->i_dirhash != NULL) ufsdirhash_free(ip); #endif out: return (error); } /* * symlink -- make a symbolic link */ static int ufs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; const char *a_target; } */ *ap; { struct vnode *vp, **vpp = ap->a_vpp; struct inode *ip; int len, error; error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp, "ufs_symlink"); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); - if (len < vp->v_mount->mnt_maxsymlinklen) { + if (len < VFSTOUFS(vp->v_mount)->um_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, SHORTLINK(ip), len); ip->i_size = len; DIP_SET(ip, i_size, len); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); error = UFS_UPDATE(vp, 0); } else error = vn_rdwr(UIO_WRITE, vp, __DECONST(void *, ap->a_target), len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, ap->a_cnp->cn_cred, NOCRED, NULL, NULL); if (error) vput(vp); return (error); } /* * Vnode op for reading directories. */ int ufs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct buf *bp; struct inode *ip; struct direct *dp, *edp; u_long *cookies; struct dirent dstdp; off_t offset, startoffset; size_t readcnt, skipcnt; ssize_t startresid; u_int ncookies; int error; if (uio->uio_offset < 0) return (EINVAL); ip = VTOI(vp); if (ip->i_effnlink == 0) return (0); if (ap->a_ncookies != NULL) { if (uio->uio_resid < 0) ncookies = 0; else ncookies = uio->uio_resid; if (uio->uio_offset >= ip->i_size) ncookies = 0; else if (ip->i_size - uio->uio_offset < ncookies) ncookies = ip->i_size - uio->uio_offset; ncookies = ncookies / (offsetof(struct direct, d_name) + 4) + 1; cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } else { ncookies = 0; cookies = NULL; } offset = startoffset = uio->uio_offset; startresid = uio->uio_resid; error = 0; while (error == 0 && uio->uio_resid > 0 && uio->uio_offset < ip->i_size) { error = UFS_BLKATOFF(vp, uio->uio_offset, NULL, &bp); if (error) break; if (bp->b_offset + bp->b_bcount > ip->i_size) readcnt = ip->i_size - bp->b_offset; else readcnt = bp->b_bcount; skipcnt = (size_t)(uio->uio_offset - bp->b_offset) & ~(size_t)(DIRBLKSIZ - 1); offset = bp->b_offset + skipcnt; dp = (struct direct *)&bp->b_data[skipcnt]; edp = (struct direct *)&bp->b_data[readcnt]; while (error == 0 && uio->uio_resid > 0 && dp < edp) { if (dp->d_reclen <= offsetof(struct direct, d_name) || (caddr_t)dp + dp->d_reclen > (caddr_t)edp) { error = EIO; break; } #if BYTE_ORDER == LITTLE_ENDIAN /* Old filesystem format. */ - if (vp->v_mount->mnt_maxsymlinklen <= 0) { + if (OFSFMT(vp)) { dstdp.d_namlen = dp->d_type; dstdp.d_type = dp->d_namlen; } else #endif { dstdp.d_namlen = dp->d_namlen; dstdp.d_type = dp->d_type; } if (offsetof(struct direct, d_name) + dstdp.d_namlen > dp->d_reclen) { error = EIO; break; } if (offset < startoffset || dp->d_ino == 0) goto nextentry; dstdp.d_fileno = dp->d_ino; dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp); bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); /* NOTE: d_off is the offset of the *next* entry. */ dstdp.d_off = offset + dp->d_reclen; dirent_terminate(&dstdp); if (dstdp.d_reclen > uio->uio_resid) { if (uio->uio_resid == startresid) error = EINVAL; else error = EJUSTRETURN; break; } /* Advance dp. */ error = uiomove((caddr_t)&dstdp, dstdp.d_reclen, uio); if (error) break; if (cookies != NULL) { KASSERT(ncookies > 0, ("ufs_readdir: cookies buffer too small")); *cookies = offset + dp->d_reclen; cookies++; ncookies--; } nextentry: offset += dp->d_reclen; dp = (struct direct *)((caddr_t)dp + dp->d_reclen); } bqrelse(bp); uio->uio_offset = offset; } /* We need to correct uio_offset. */ uio->uio_offset = offset; if (error == EJUSTRETURN) error = 0; if (ap->a_ncookies != NULL) { if (error == 0) { ap->a_ncookies -= ncookies; } else { free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } } if (error == 0 && ap->a_eofflag) *ap->a_eofflag = ip->i_size <= uio->uio_offset; return (error); } /* * Return target name of a symbolic link */ static int ufs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); doff_t isize; isize = ip->i_size; - if (isize < vp->v_mount->mnt_maxsymlinklen) + if (isize < VFSTOUFS(vp->v_mount)->um_maxsymlinklen) return (uiomove(SHORTLINK(ip), isize, ap->a_uio)); return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the ufs_bmaparray() operation may not * deadlock on memory. See ufs_bmap() for details. */ static int ufs_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; ufs2_daddr_t blkno; int error; if (bp->b_blkno == bp->b_lblkno) { error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); BO_STRATEGY(VFSTOUFS(vp->v_mount)->um_bo, bp); return (0); } /* * Print out the contents of an inode. */ static int ufs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); printf("\tnlink=%d, effnlink=%d, size=%jd", ip->i_nlink, ip->i_effnlink, (intmax_t)ip->i_size); if (I_IS_UFS2(ip)) printf(", extsize %d", ip->i_din2->di_extsize); printf("\n\tgeneration=%jx, uid=%d, gid=%d, flags=0x%b\n", (uintmax_t)ip->i_gen, ip->i_uid, ip->i_gid, (u_int)ip->i_flags, PRINT_INODE_FLAGS); printf("\tino %lu, on dev %s", (u_long)ip->i_number, devtoname(ITODEV(ip))); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ static int ufsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; int usecount; VI_LOCK(vp); usecount = vp->v_usecount; if (usecount > 1) ufs_itimes_locked(vp); VI_UNLOCK(vp); return (fifo_specops.vop_close(ap)); } /* * Kqfilter wrapper for fifos. * * Fall through to ufs kqfilter routines if needed */ static int ufsfifo_kqfilter(ap) struct vop_kqfilter_args *ap; { int error; error = fifo_specops.vop_kqfilter(ap); if (error) error = vfs_kqfilter(ap); return (error); } /* * Return POSIX pathconf information applicable to ufs filesystems. */ static int ufs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { int error; error = 0; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = UFS_LINK_MAX; break; case _PC_NAME_MAX: *ap->a_retval = UFS_MAXNAMLEN; break; case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) *ap->a_retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; break; case _PC_NO_TRUNC: *ap->a_retval = 1; break; #ifdef UFS_ACL case _PC_ACL_EXTENDED: if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; break; case _PC_ACL_NFS4: if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; break; #endif case _PC_ACL_PATH_MAX: #ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; #else *ap->a_retval = 3; #endif break; #ifdef MAC case _PC_MAC_PRESENT: if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL) *ap->a_retval = 1; else *ap->a_retval = 0; break; #endif case _PC_MIN_HOLE_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_PRIO_IO: *ap->a_retval = 0; break; case _PC_SYNC_IO: *ap->a_retval = 0; break; case _PC_ALLOC_SIZE_MIN: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize; break; case _PC_FILESIZEBITS: *ap->a_retval = 64; break; case _PC_REC_INCR_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_MAX_XFER_SIZE: *ap->a_retval = -1; /* means ``unlimited'' */ break; case _PC_REC_MIN_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_XFER_ALIGN: *ap->a_retval = PAGE_SIZE; break; case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; break; default: error = vop_stdpathconf(ap); break; } return (error); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ufs_vinit(mntp, fifoops, vpp) struct mount *mntp; struct vop_vector *fifoops; struct vnode **vpp; { struct inode *ip; struct vnode *vp; vp = *vpp; ASSERT_VOP_LOCKED(vp, "ufs_vinit"); ip = VTOI(vp); vp->v_type = IFTOVT(ip->i_mode); /* * Only unallocated inodes should be of type VNON. */ if (ip->i_mode != 0 && vp->v_type == VNON) return (EINVAL); if (vp->v_type == VFIFO) vp->v_op = fifoops; if (ip->i_number == UFS_ROOTINO) vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } /* * Allocate a new inode. * Vnode dvp must be locked. */ static int ufs_makeinode(mode, dvp, vpp, cnp, callfunc) int mode; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; const char *callfunc; { struct inode *ip, *pdir; struct direct newdir; struct vnode *tvp; int error; pdir = VTOI(dvp); #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", callfunc); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; if (pdir->i_effnlink < 2) { print_bad_link_count(callfunc, dvp); return (EINVAL); } if (DOINGSUJ(dvp)) { error = softdep_prelink(dvp, NULL); if (error != 0) { MPASS(error == ERELOOKUP); return (error); } } error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error); ip = VTOI(tvp); ip->i_gid = pdir->i_gid; DIP_SET(ip, i_gid, pdir->i_gid); #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; gid_t ucred_group; ucp = cnp->cn_cred; #endif /* * If we are not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. * Note that this drops off the execute bits for security. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; DIP_SET(ip, i_uid, ip->i_uid); mode &= ~07111; #ifdef QUOTA /* * Make sure the correct user gets charged * for the space. * Quickly knock up a dummy credential for the victim. * XXX This seems to never be accessed out of our * context so a stack variable is ok. */ refcount_init(&ucred.cr_ref, 1); ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; ucred.cr_groups[0] = pdir->i_gid; ucp = &ucred; #endif } else { ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); } #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(pdir, ip); UFS_VFREE(tvp, ip->i_number, mode); vgone(tvp); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(pdir, ip); UFS_VFREE(tvp, ip->i_number, mode); vgone(tvp); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ vn_seqc_write_begin(tvp); /* Mostly to cover asserts */ UFS_INODE_SET_FLAG(ip, IN_ACCESS | IN_CHANGE | IN_UPDATE); UFS_INODE_SET_MODE(ip, mode); DIP_SET(ip, i_mode, mode); tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 1; ip->i_nlink = 1; DIP_SET(ip, i_nlink, 1); if (DOINGSOFTDEP(tvp)) softdep_setup_create(VTOI(dvp), ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID)) { UFS_INODE_SET_MODE(ip, ip->i_mode & ~ISGID); DIP_SET(ip, i_mode, ip->i_mode); } if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); } /* * Make sure inode goes to disk before directory entry. */ error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) && !DOINGASYNC(tvp)); if (error) goto bad; #ifdef MAC if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, dvp, tvp, cnp); if (error) goto bad; } #endif #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ufs_do_posix1e_acl_inheritance_file(dvp, tvp, mode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, mode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* !UFS_ACL */ ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL); if (error) goto bad; vn_seqc_write_end(tvp); *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_effnlink = 0; ip->i_nlink = 0; DIP_SET(ip, i_nlink, 0); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (DOINGSOFTDEP(tvp)) softdep_revert_create(VTOI(dvp), ip); vn_seqc_write_end(tvp); vgone(tvp); vput(tvp); return (error); } static int ufs_ioctl(struct vop_ioctl_args *ap) { struct vnode *vp; int error; vp = ap->a_vp; switch (ap->a_command) { case FIOSEEKDATA: error = vn_lock(vp, LK_SHARED); if (error == 0) { error = ufs_bmap_seekdata(vp, (off_t *)ap->a_data); VOP_UNLOCK(vp); } else error = EBADF; return (error); case FIOSEEKHOLE: return (vn_bmap_seekhole(vp, ap->a_command, (off_t *)ap->a_data, ap->a_cred)); default: return (ENOTTY); } } static int ufs_read_pgcache(struct vop_read_pgcache_args *ap) { struct uio *uio; struct vnode *vp; uio = ap->a_uio; vp = ap->a_vp; VNPASS((vn_irflag_read(vp) & VIRF_PGREAD) != 0, vp); if (uio->uio_resid > ptoa(io_hold_cnt) || uio->uio_offset < 0 || (ap->a_ioflag & IO_DIRECT) != 0) return (EJUSTRETURN); return (vn_read_from_obj(vp, uio)); } /* Global vfs data structures for ufs. */ struct vop_vector ufs_vnodeops = { .vop_default = &default_vnodeops, .vop_fsync = VOP_PANIC, .vop_read = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_write = VOP_PANIC, .vop_accessx = ufs_accessx, .vop_bmap = ufs_bmap, .vop_fplookup_vexec = ufs_fplookup_vexec, .vop_fplookup_symlink = VOP_EAGAIN, .vop_cachedlookup = ufs_lookup, .vop_close = ufs_close, .vop_create = ufs_create, .vop_stat = ufs_stat, .vop_getattr = ufs_getattr, .vop_inactive = ufs_inactive, .vop_ioctl = ufs_ioctl, .vop_link = ufs_link, .vop_lookup = vfs_cache_lookup, .vop_mmapped = ufs_mmapped, .vop_mkdir = ufs_mkdir, .vop_mknod = ufs_mknod, .vop_need_inactive = ufs_need_inactive, .vop_open = ufs_open, .vop_pathconf = ufs_pathconf, .vop_poll = vop_stdpoll, .vop_print = ufs_print, .vop_read_pgcache = ufs_read_pgcache, .vop_readdir = ufs_readdir, .vop_readlink = ufs_readlink, .vop_reclaim = ufs_reclaim, .vop_remove = ufs_remove, .vop_rename = ufs_rename, .vop_rmdir = ufs_rmdir, .vop_setattr = ufs_setattr, #ifdef MAC .vop_setlabel = vop_stdsetlabel_ea, #endif .vop_strategy = ufs_strategy, .vop_symlink = ufs_symlink, .vop_whiteout = ufs_whiteout, #ifdef UFS_EXTATTR .vop_getextattr = ufs_getextattr, .vop_deleteextattr = ufs_deleteextattr, .vop_setextattr = ufs_setextattr, #endif #ifdef UFS_ACL .vop_getacl = ufs_getacl, .vop_setacl = ufs_setacl, .vop_aclcheck = ufs_aclcheck, #endif }; VFS_VOP_VECTOR_REGISTER(ufs_vnodeops); struct vop_vector ufs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = VOP_PANIC, .vop_accessx = ufs_accessx, .vop_close = ufsfifo_close, .vop_getattr = ufs_getattr, .vop_inactive = ufs_inactive, .vop_kqfilter = ufsfifo_kqfilter, .vop_pathconf = ufs_pathconf, .vop_print = ufs_print, .vop_read = VOP_PANIC, .vop_reclaim = ufs_reclaim, .vop_setattr = ufs_setattr, #ifdef MAC .vop_setlabel = vop_stdsetlabel_ea, #endif .vop_write = VOP_PANIC, #ifdef UFS_EXTATTR .vop_getextattr = ufs_getextattr, .vop_deleteextattr = ufs_deleteextattr, .vop_setextattr = ufs_setextattr, #endif #ifdef UFS_ACL .vop_getacl = ufs_getacl, .vop_setacl = ufs_setacl, .vop_aclcheck = ufs_aclcheck, #endif }; VFS_VOP_VECTOR_REGISTER(ufs_fifoops); diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h index 57e163c11d77..e5f9c93da373 100644 --- a/sys/ufs/ufs/ufsmount.h +++ b/sys/ufs/ufs/ufsmount.h @@ -1,195 +1,200 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufsmount.h 8.6 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _UFS_UFS_UFSMOUNT_H_ #define _UFS_UFS_UFSMOUNT_H_ /* * Arguments to mount UFS-based filesystems */ struct ufs_args { char *fspec; /* block special device to mount */ struct oexport_args export; /* network export information */ }; #ifdef _KERNEL #include #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_UFSMNT); MALLOC_DECLARE(M_TRIM); #endif struct buf; struct inode; struct nameidata; struct taskqueue; struct timeval; struct ucred; struct uio; struct vnode; struct ufs_extattr_per_mount; struct jblocks; struct inodedep; TAILQ_HEAD(inodedeplst, inodedep); LIST_HEAD(bmsafemaphd, bmsafemap); LIST_HEAD(trimlist_hashhead, ffs_blkfree_trim_params); struct fsfail_task { struct task task; fsid_t fsid; }; /* * This structure describes the UFS specific mount structure data. * The function operators are used to support different versions of * UFS (UFS1, UFS2, etc). * * Lock reference: * c - set at allocation then constant until freed * i - ufsmount interlock (UFS_LOCK / UFS_UNLOCK) * q - associated quota file is locked * r - ref to parent mount structure is held (vfs_busy / vfs_unbusy) * u - managed by user process fsck_ufs */ struct ufsmount { struct mount *um_mountp; /* (r) filesystem vfs struct */ struct cdev *um_dev; /* (r) device mounted */ struct g_consumer *um_cp; /* (r) GEOM access point */ struct bufobj *um_bo; /* (r) Buffer cache object */ struct vnode *um_odevvp; /* (r) devfs dev vnode */ struct vnode *um_devvp; /* (r) mntfs private vnode */ u_long um_fstype; /* (c) type of filesystem */ struct fs *um_fs; /* (r) pointer to superblock */ struct ufs_extattr_per_mount um_extattr; /* (c) extended attrs */ u_long um_nindir; /* (c) indirect ptrs per blk */ u_long um_bptrtodb; /* (c) indir disk block ptr */ u_long um_seqinc; /* (c) inc between seq blocks */ + uint64_t um_maxsymlinklen; /* (c) max size of short + symlink */ struct mtx um_lock; /* (c) Protects ufsmount & fs */ pid_t um_fsckpid; /* (u) PID can do fsck sysctl */ struct mount_softdeps *um_softdep; /* (c) softdep mgmt structure */ struct vnode *um_quotas[MAXQUOTAS]; /* (q) pointer to quota files */ struct ucred *um_cred[MAXQUOTAS]; /* (q) quota file access cred */ time_t um_btime[MAXQUOTAS]; /* (q) block quota time limit */ time_t um_itime[MAXQUOTAS]; /* (q) inode quota time limit */ char um_qflags[MAXQUOTAS]; /* (i) quota specific flags */ int64_t um_savedmaxfilesize; /* (c) track maxfilesize */ u_int um_flags; /* (i) filesystem flags */ struct timeval um_last_fullmsg; /* (i) last full msg time */ int um_secs_fullmsg; /* (i) seconds since full msg */ struct timeval um_last_integritymsg; /* (i) last integrity msg */ int um_secs_integritymsg; /* (i) secs since integ msg */ u_int um_trim_inflight; /* (i) outstanding trim count */ u_int um_trim_inflight_blks; /* (i) outstanding trim blks */ u_long um_trim_total; /* (i) total trim count */ u_long um_trim_total_blks; /* (i) total trim block count */ struct taskqueue *um_trim_tq; /* (c) trim request queue */ struct trimlist_hashhead *um_trimhash; /* (i) trimlist hash table */ u_long um_trimlisthashsize; /* (i) trim hash table size-1 */ struct fsfail_task *um_fsfail_task; /* (i) task for fsfail cleanup*/ /* (c) - below function ptrs */ int (*um_balloc)(struct vnode *, off_t, int, struct ucred *, int, struct buf **); int (*um_blkatoff)(struct vnode *, off_t, char **, struct buf **); int (*um_truncate)(struct vnode *, off_t, int, struct ucred *); int (*um_update)(struct vnode *, int); int (*um_valloc)(struct vnode *, int, struct ucred *, struct vnode **); int (*um_vfree)(struct vnode *, ino_t, int); void (*um_ifree)(struct ufsmount *, struct inode *); int (*um_rdonly)(struct inode *); void (*um_snapgone)(struct inode *); int (*um_check_blkno)(struct mount *, ino_t, daddr_t, int); }; /* * filesystem flags */ #define UM_CANDELETE 0x00000001 /* devvp supports TRIM */ #define UM_WRITESUSPENDED 0x00000002 /* suspension in progress */ #define UM_CANSPEEDUP 0x00000004 /* devvp supports SPEEDUP */ #define UM_FSFAIL_CLEANUP 0x00000008 /* need cleanup after unrecoverable error */ /* * function prototypes */ #define UFS_BALLOC(aa, bb, cc, dd, ee, ff) \ VFSTOUFS((aa)->v_mount)->um_balloc(aa, bb, cc, dd, ee, ff) #define UFS_BLKATOFF(aa, bb, cc, dd) \ VFSTOUFS((aa)->v_mount)->um_blkatoff(aa, bb, cc, dd) #define UFS_TRUNCATE(aa, bb, cc, dd) \ VFSTOUFS((aa)->v_mount)->um_truncate(aa, bb, cc, dd) #define UFS_UPDATE(aa, bb) VFSTOUFS((aa)->v_mount)->um_update(aa, bb) #define UFS_VALLOC(aa, bb, cc, dd) \ VFSTOUFS((aa)->v_mount)->um_valloc(aa, bb, cc, dd) #define UFS_VFREE(aa, bb, cc) VFSTOUFS((aa)->v_mount)->um_vfree(aa, bb, cc) #define UFS_IFREE(aa, bb) ((aa)->um_ifree(aa, bb)) #define UFS_RDONLY(aa) (ITOUMP(aa)->um_rdonly(aa)) #define UFS_SNAPGONE(aa) (ITOUMP(aa)->um_snapgone(aa)) #define UFS_CHECK_BLKNO(aa, bb, cc, dd) \ (VFSTOUFS(aa)->um_check_blkno == NULL ? 0 : \ VFSTOUFS(aa)->um_check_blkno(aa, bb, cc, dd)) #define UFS_LOCK(aa) mtx_lock(&(aa)->um_lock) #define UFS_UNLOCK(aa) mtx_unlock(&(aa)->um_lock) #define UFS_MTX(aa) (&(aa)->um_lock) /* * Filesystem types */ #define UFS1 1 #define UFS2 2 /* * Flags describing the state of quotas. */ #define QTF_OPENING 0x01 /* Q_QUOTAON in progress */ #define QTF_CLOSING 0x02 /* Q_QUOTAOFF in progress */ #define QTF_64BIT 0x04 /* 64-bit quota file */ /* Convert mount ptr to ufsmount ptr. */ #define VFSTOUFS(mp) ((struct ufsmount *)((mp)->mnt_data)) #define UFSTOVFS(ump) (ump)->um_mountp /* * Macros to access filesystem parameters in the ufsmount structure. * Used by ufs_bmap. */ #define MNINDIR(ump) ((ump)->um_nindir) #define blkptrtodb(ump, b) ((b) << (ump)->um_bptrtodb) #define is_sequential(ump, a, b) ((b) == (a) + ump->um_seqinc) #endif /* _KERNEL */ +/* true if old FS format...*/ +#define OFSFMT(vp) (VFSTOUFS((vp)->v_mount)->um_maxsymlinklen <= 0) + #endif