Index: stable/10/sys/fs/ext2fs/ext2_vfsops.c =================================================================== --- stable/10/sys/fs/ext2fs/ext2_vfsops.c (revision 282269) +++ stable/10/sys/fs/ext2fs/ext2_vfsops.c (revision 282270) @@ -1,1100 +1,1101 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td); static int ext2_mountfs(struct vnode *, struct mount *); static int ext2_reload(struct mount *mp, struct thread *td); static int ext2_sbupdate(struct ext2mount *, int); static int ext2_cgupdate(struct ext2mount *, int); static vfs_unmount_t ext2_unmount; static vfs_root_t ext2_root; static vfs_statfs_t ext2_statfs; static vfs_sync_t ext2_sync; static vfs_vget_t ext2_vget; static vfs_fhtovp_t ext2_fhtovp; static vfs_mount_t ext2_mount; MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part"); static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure"); static struct vfsops ext2fs_vfsops = { .vfs_fhtovp = ext2_fhtovp, .vfs_mount = ext2_mount, .vfs_root = ext2_root, /* root inode via vget */ .vfs_statfs = ext2_statfs, .vfs_sync = ext2_sync, .vfs_unmount = ext2_unmount, .vfs_vget = ext2_vget, }; VFS_SET(ext2fs_vfsops, ext2fs, 0); static int ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly); static int compute_sb_data(struct vnode * devvp, struct ext2fs * es, struct m_ext2fs * fs); static const char *ext2_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "multilabel", "suiddir", "nosymfollow", "sync", "union", NULL }; /* * VFS Operations. * * mount system call */ static int ext2_mount(struct mount *mp) { struct vfsoptlist *opts; struct vnode *devvp; struct thread *td; struct ext2mount *ump = NULL; struct m_ext2fs *fs; struct nameidata nd, *ndp = &nd; accmode_t accmode; char *path, *fspec; int error, flags, len; td = curthread; opts = mp->mnt_optnew; if (vfs_filteropt(opts, ext2_opts)) return (EINVAL); vfs_getopt(opts, "fspath", (void **)&path, NULL); /* Double-check the length of path.. */ if (strlen(path) >= MAXMNTLEN) return (ENAMETOOLONG); fspec = NULL; error = vfs_getopt(opts, "from", (void **)&fspec, &len); if (!error && fspec[len - 1] != '\0') return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOEXT2(mp); fs = ump->um_e2fs; error = 0; if (fs->e2fs_ronly == 0 && vfs_flagopt(opts, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = ext2_flushfiles(mp, flags, td); if ( error == 0 && fs->e2fs_wasvalid && ext2_cgupdate(ump, MNT_WAIT) == 0) { fs->e2fs->e2fs_state |= E2FS_ISCLEAN; ext2_sbupdate(ump, MNT_WAIT); } fs->e2fs_ronly = 1; vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY); DROP_GIANT(); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); } if (!error && (mp->mnt_flag & MNT_RELOAD)) error = ext2_reload(mp, td); if (error) return (error); devvp = ump->um_devvp; if (fs->e2fs_ronly && !vfs_flagopt(opts, "ro", NULL, 0)) { if (ext2_check_sb_compat(fs->e2fs, devvp->v_rdev, 0)) return (EPERM); /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0); return (error); } VOP_UNLOCK(devvp, 0); DROP_GIANT(); g_topology_lock(); error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); if ((fs->e2fs->e2fs_state & E2FS_ISCLEAN) == 0 || (fs->e2fs->e2fs_state & E2FS_ERRORS)) { if (mp->mnt_flag & MNT_FORCE) { printf( "WARNING: %s was not properly dismounted\n", fs->e2fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->e2fs_fsmnt); return (EPERM); } } fs->e2fs->e2fs_state &= ~E2FS_ISCLEAN; (void)ext2_cgupdate(ump, MNT_WAIT); fs->e2fs_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); } if (vfs_flagopt(opts, "export", NULL, 0)) { /* Process export requests in vfs_mount.c. */ return (error); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (fspec == NULL) return (EINVAL); NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(ndp)) != 0) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. * * XXXRW: VOP_ACCESS() enough? */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = ext2_mountfs(devvp, mp); } else { if (devvp != ump->um_devvp) { vput(devvp); return (EINVAL); /* needs translation */ } else vput(devvp); } if (error) { vrele(devvp); return (error); } ump = VFSTOEXT2(mp); fs = ump->um_e2fs; /* * Note that this strncpy() is ok because of a check at the start * of ext2_mount(). */ strncpy(fs->e2fs_fsmnt, path, MAXMNTLEN); fs->e2fs_fsmnt[MAXMNTLEN - 1] = '\0'; vfs_mountedfrom(mp, fspec); return (0); } static int ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly) { if (es->e2fs_magic != E2FS_MAGIC) { printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n", devtoname(dev), es->e2fs_magic, E2FS_MAGIC); return (1); } if (es->e2fs_rev > E2FS_REV0) { if (es->e2fs_features_incompat & ~(EXT2F_INCOMPAT_SUPP | EXT4F_RO_INCOMPAT_SUPP)) { printf( "WARNING: mount of %s denied due to unsupported optional features\n", devtoname(dev)); return (1); } if (!ronly && (es->e2fs_features_rocompat & ~EXT2F_ROCOMPAT_SUPP)) { printf("WARNING: R/W mount of %s denied due to " "unsupported optional features\n", devtoname(dev)); return (1); } } return (0); } /* * This computes the fields of the ext2_sb_info structure from the * data in the ext2_super_block structure read in. */ static int compute_sb_data(struct vnode *devvp, struct ext2fs *es, struct m_ext2fs *fs) { int db_count, error; int i; int logic_sb_block = 1; /* XXX for now */ struct buf *bp; uint32_t e2fs_descpb; fs->e2fs_bshift = EXT2_MIN_BLOCK_LOG_SIZE + es->e2fs_log_bsize; fs->e2fs_bsize = 1U << fs->e2fs_bshift; fs->e2fs_fsbtodb = es->e2fs_log_bsize + 1; fs->e2fs_qbmask = fs->e2fs_bsize - 1; fs->e2fs_fsize = EXT2_MIN_FRAG_SIZE << es->e2fs_log_fsize; if (fs->e2fs_fsize) fs->e2fs_fpb = fs->e2fs_bsize / fs->e2fs_fsize; fs->e2fs_bpg = es->e2fs_bpg; fs->e2fs_fpg = es->e2fs_fpg; fs->e2fs_ipg = es->e2fs_ipg; if (es->e2fs_rev == E2FS_REV0) { fs->e2fs_isize = E2FS_REV0_INODE_SIZE ; } else { fs->e2fs_isize = es->e2fs_inode_size; /* * Simple sanity check for superblock inode size value. */ if (EXT2_INODE_SIZE(fs) < E2FS_REV0_INODE_SIZE || EXT2_INODE_SIZE(fs) > fs->e2fs_bsize || (fs->e2fs_isize & (fs->e2fs_isize - 1)) != 0) { printf("ext2fs: invalid inode size %d\n", fs->e2fs_isize); return (EIO); } } /* Check for extra isize in big inodes. */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_EXTRA_ISIZE) && EXT2_INODE_SIZE(fs) < sizeof(struct ext2fs_dinode)) { printf("ext2fs: no space for extra inode timestamps\n"); return (EINVAL); } fs->e2fs_ipb = fs->e2fs_bsize / EXT2_INODE_SIZE(fs); fs->e2fs_itpg = fs->e2fs_ipg / fs->e2fs_ipb; /* s_resuid / s_resgid ? */ fs->e2fs_gcount = (es->e2fs_bcount - es->e2fs_first_dblock + EXT2_BLOCKS_PER_GROUP(fs) - 1) / EXT2_BLOCKS_PER_GROUP(fs); e2fs_descpb = fs->e2fs_bsize / sizeof(struct ext2_gd); db_count = (fs->e2fs_gcount + e2fs_descpb - 1) / e2fs_descpb; fs->e2fs_gdbcount = db_count; fs->e2fs_gd = malloc(db_count * fs->e2fs_bsize, M_EXT2MNT, M_WAITOK); fs->e2fs_contigdirs = malloc(fs->e2fs_gcount * sizeof(*fs->e2fs_contigdirs), M_EXT2MNT, M_WAITOK | M_ZERO); /* * Adjust logic_sb_block. * Godmar thinks: if the blocksize is greater than 1024, then * the superblock is logically part of block zero. */ if(fs->e2fs_bsize > SBSIZE) logic_sb_block = 0; for (i = 0; i < db_count; i++) { error = bread(devvp , fsbtodb(fs, logic_sb_block + i + 1 ), fs->e2fs_bsize, NOCRED, &bp); if (error) { free(fs->e2fs_contigdirs, M_EXT2MNT); free(fs->e2fs_gd, M_EXT2MNT); brelse(bp); return (error); } e2fs_cgload((struct ext2_gd *)bp->b_data, &fs->e2fs_gd[ i * fs->e2fs_bsize / sizeof(struct ext2_gd)], fs->e2fs_bsize); brelse(bp); bp = NULL; } /* Initialization for the ext2 Orlov allocator variant. */ fs->e2fs_total_dir = 0; for (i = 0; i < fs->e2fs_gcount; i++) fs->e2fs_total_dir += fs->e2fs_gd[i].ext2bgd_ndirs; if (es->e2fs_rev == E2FS_REV0 || !EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_LARGEFILE)) fs->e2fs_maxfilesize = 0x7fffffff; else fs->e2fs_maxfilesize = 0x7fffffffffffffff; return (0); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) invalidate all cluster summary information. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. * XXX we are missing some steps, in particular # 3, this has to be reviewed. */ static int ext2_reload(struct mount *mp, struct thread *td) { struct vnode *vp, *mvp, *devvp; struct inode *ip; struct buf *bp; struct ext2fs *es; struct m_ext2fs *fs; struct csum *sump; int error, i; int32_t *lp; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOEXT2(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ext2_reload: dirty1"); VOP_UNLOCK(devvp, 0); /* * Step 2: re-read superblock from disk. * constants have been adjusted for ext2 */ if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) return (error); es = (struct ext2fs *)bp->b_data; if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) { brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOEXT2(mp)->um_e2fs; bcopy(bp->b_data, fs->e2fs, sizeof(struct ext2fs)); if((error = compute_sb_data(devvp, es, fs)) != 0) { brelse(bp); return (error); } #ifdef UNKLAR if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; #endif brelse(bp); /* * Step 3: invalidate all cluster summary information. */ if (fs->e2fs_contigsumsize > 0) { lp = fs->e2fs_maxcluster; sump = fs->e2fs_clustersum; for (i = 0; i < fs->e2fs_gcount; i++, sump++) { *lp++ = fs->e2fs_contigsumsize; sump->cs_init = 0; bzero(sump->cs_sum, fs->e2fs_contigsumsize + 1); } } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ext2_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->e2fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0); vrele(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } ext2_ei2i((struct ext2fs_dinode *) ((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number)), ip); brelse(bp); VOP_UNLOCK(vp, 0); vrele(vp); } return (0); } /* * Common code for mount and mountroot. */ static int ext2_mountfs(struct vnode *devvp, struct mount *mp) { struct ext2mount *ump; struct buf *bp; struct m_ext2fs *fs; struct ext2fs *es; struct cdev *dev = devvp->v_rdev; struct g_consumer *cp; struct bufobj *bo; struct csum *sump; int error; int ronly; int i, size; int32_t *lp; int32_t e2fs_maxcontig; ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0); /* XXX: use VOP_ACESS to check FS perms */ DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0); if (error) return (error); /* XXX: should we check for some sectorsize or 512 instead? */ if (((SBSIZE % cp->provider->sectorsize) != 0) || (SBSIZE < cp->provider->sectorsize)) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); PICKUP_GIANT(); return (EINVAL); } bo = &devvp->v_bufobj; bo->bo_private = cp; bo->bo_ops = g_vfs_bufops; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; bp = NULL; ump = NULL; if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) goto out; es = (struct ext2fs *)bp->b_data; if (ext2_check_sb_compat(es, dev, ronly) != 0) { error = EINVAL; /* XXX needs translation */ goto out; } if ((es->e2fs_state & E2FS_ISCLEAN) == 0 || (es->e2fs_state & E2FS_ERRORS)) { if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf( "WARNING: Filesystem was not properly dismounted\n"); } else { printf( "WARNING: R/W mount denied. Filesystem is not clean - run fsck\n"); error = EPERM; goto out; } } ump = malloc(sizeof(*ump), M_EXT2MNT, M_WAITOK | M_ZERO); /* * I don't know whether this is the right strategy. Note that * we dynamically allocate both an ext2_sb_info and an ext2_super_block * while Linux keeps the super block in a locked buffer. */ ump->um_e2fs = malloc(sizeof(struct m_ext2fs), M_EXT2MNT, M_WAITOK); ump->um_e2fs->e2fs = malloc(sizeof(struct ext2fs), M_EXT2MNT, M_WAITOK); mtx_init(EXT2_MTX(ump), "EXT2FS", "EXT2FS Lock", MTX_DEF); bcopy(es, ump->um_e2fs->e2fs, (u_int)sizeof(struct ext2fs)); if ((error = compute_sb_data(devvp, ump->um_e2fs->e2fs, ump->um_e2fs))) goto out; /* * Calculate the maximum contiguous blocks and size of cluster summary * array. In FFS this is done by newfs; however, the superblock * in ext2fs doesn't have these variables, so we can calculate * them here. */ e2fs_maxcontig = MAX(1, MAXPHYS / ump->um_e2fs->e2fs_bsize); ump->um_e2fs->e2fs_contigsumsize = MIN(e2fs_maxcontig, EXT2_MAXCONTIG); if (ump->um_e2fs->e2fs_contigsumsize > 0) { size = ump->um_e2fs->e2fs_gcount * sizeof(int32_t); ump->um_e2fs->e2fs_maxcluster = malloc(size, M_EXT2MNT, M_WAITOK); size = ump->um_e2fs->e2fs_gcount * sizeof(struct csum); ump->um_e2fs->e2fs_clustersum = malloc(size, M_EXT2MNT, M_WAITOK); lp = ump->um_e2fs->e2fs_maxcluster; sump = ump->um_e2fs->e2fs_clustersum; for (i = 0; i < ump->um_e2fs->e2fs_gcount; i++, sump++) { *lp++ = ump->um_e2fs->e2fs_contigsumsize; sump->cs_init = 0; sump->cs_sum = malloc((ump->um_e2fs->e2fs_contigsumsize + 1) * sizeof(int32_t), M_EXT2MNT, M_WAITOK | M_ZERO); } } brelse(bp); bp = NULL; fs = ump->um_e2fs; fs->e2fs_ronly = ronly; /* ronly is set according to mnt_flags */ /* * If the fs is not mounted read-only, make sure the super block is * always written back on a sync(). */ fs->e2fs_wasvalid = fs->e2fs->e2fs_state & E2FS_ISCLEAN ? 1 : 0; if (ronly == 0) { fs->e2fs_fmod = 1; /* mark it modified */ fs->e2fs->e2fs_state &= ~E2FS_ISCLEAN; /* set fs invalid */ } mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_bo = &devvp->v_bufobj; ump->um_cp = cp; /* * Setting those two parameters allowed us to use * ufs_bmap w/o changse! */ ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs); ump->um_bptrtodb = fs->e2fs->e2fs_log_bsize + 1; ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs); if (ronly == 0) ext2_sbupdate(ump, MNT_WAIT); /* * Initialize filesystem stat information in mount struct. */ MNT_ILOCK(mp); - mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED; + mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | + MNTK_USES_BCACHE; MNT_IUNLOCK(mp); return (0); out: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); PICKUP_GIANT(); } if (ump) { mtx_destroy(EXT2_MTX(ump)); free(ump->um_e2fs->e2fs_gd, M_EXT2MNT); free(ump->um_e2fs->e2fs_contigdirs, M_EXT2MNT); free(ump->um_e2fs->e2fs, M_EXT2MNT); free(ump->um_e2fs, M_EXT2MNT); free(ump, M_EXT2MNT); mp->mnt_data = NULL; } return (error); } /* * Unmount system call. */ static int ext2_unmount(struct mount *mp, int mntflags) { struct ext2mount *ump; struct m_ext2fs *fs; struct csum *sump; int error, flags, i, ronly; flags = 0; if (mntflags & MNT_FORCE) { if (mp->mnt_flag & MNT_ROOTFS) return (EINVAL); flags |= FORCECLOSE; } if ((error = ext2_flushfiles(mp, flags, curthread)) != 0) return (error); ump = VFSTOEXT2(mp); fs = ump->um_e2fs; ronly = fs->e2fs_ronly; if (ronly == 0 && ext2_cgupdate(ump, MNT_WAIT) == 0) { if (fs->e2fs_wasvalid) fs->e2fs->e2fs_state |= E2FS_ISCLEAN; ext2_sbupdate(ump, MNT_WAIT); } DROP_GIANT(); g_topology_lock(); g_vfs_close(ump->um_cp); g_topology_unlock(); PICKUP_GIANT(); vrele(ump->um_devvp); sump = fs->e2fs_clustersum; for (i = 0; i < fs->e2fs_gcount; i++, sump++) free(sump->cs_sum, M_EXT2MNT); free(fs->e2fs_clustersum, M_EXT2MNT); free(fs->e2fs_maxcluster, M_EXT2MNT); free(fs->e2fs_gd, M_EXT2MNT); free(fs->e2fs_contigdirs, M_EXT2MNT); free(fs->e2fs, M_EXT2MNT); free(fs, M_EXT2MNT); free(ump, M_EXT2MNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } /* * Flush out all the files in a filesystem. */ static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td) { int error; error = vflush(mp, 0, flags, td); return (error); } /* * Get filesystem statistics. */ int ext2_statfs(struct mount *mp, struct statfs *sbp) { struct ext2mount *ump; struct m_ext2fs *fs; uint32_t overhead, overhead_per_group, ngdb; int i, ngroups; ump = VFSTOEXT2(mp); fs = ump->um_e2fs; if (fs->e2fs->e2fs_magic != E2FS_MAGIC) panic("ext2_statfs"); /* * Compute the overhead (FS structures) */ overhead_per_group = 1 /* block bitmap */ + 1 /* inode bitmap */ + fs->e2fs_itpg; overhead = fs->e2fs->e2fs_first_dblock + fs->e2fs_gcount * overhead_per_group; if (fs->e2fs->e2fs_rev > E2FS_REV0 && fs->e2fs->e2fs_features_rocompat & EXT2F_ROCOMPAT_SPARSESUPER) { for (i = 0, ngroups = 0; i < fs->e2fs_gcount; i++) { if (cg_has_sb(i)) ngroups++; } } else { ngroups = fs->e2fs_gcount; } ngdb = fs->e2fs_gdbcount; if (fs->e2fs->e2fs_rev > E2FS_REV0 && fs->e2fs->e2fs_features_compat & EXT2F_COMPAT_RESIZE) ngdb += fs->e2fs->e2fs_reserved_ngdb; overhead += ngroups * (1 /* superblock */ + ngdb); sbp->f_bsize = EXT2_FRAG_SIZE(fs); sbp->f_iosize = EXT2_BLOCK_SIZE(fs); sbp->f_blocks = fs->e2fs->e2fs_bcount - overhead; sbp->f_bfree = fs->e2fs->e2fs_fbcount; sbp->f_bavail = sbp->f_bfree - fs->e2fs->e2fs_rbcount; sbp->f_files = fs->e2fs->e2fs_icount; sbp->f_ffree = fs->e2fs->e2fs_ficount; return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ext2_sync(struct mount *mp, int waitfor) { struct vnode *mvp, *vp; struct thread *td; struct inode *ip; struct ext2mount *ump = VFSTOEXT2(mp); struct m_ext2fs *fs; int error, allerror = 0; td = curthread; fs = ump->um_e2fs; if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) { /* XXX */ printf("fs = %s\n", fs->e2fs_fsmnt); panic("ext2_sync: rofs mod"); } /* * Write back each (modified) inode. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && (vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY)) { VI_UNLOCK(vp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td); if (error) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } if ((error = VOP_FSYNC(vp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(vp, 0); vrele(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor != MNT_LAZY) { vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(ump->um_devvp, 0); } /* * Write back modified superblock. */ if (fs->e2fs_fmod != 0) { fs->e2fs_fmod = 0; fs->e2fs->e2fs_wtime = time_second; if ((error = ext2_cgupdate(ump, waitfor)) != 0) allerror = error; } return (allerror); } /* * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it * in from disk. If it is in core, wait for the lock bit to clear, then * return the inode locked. Detection and handling of mount points must be * done by the calling routine. */ static int ext2_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) { struct m_ext2fs *fs; struct inode *ip; struct ext2mount *ump; struct buf *bp; struct vnode *vp; struct thread *td; int i, error; int used_blocks; td = curthread; error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); ump = VFSTOEXT2(mp); ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) { *vpp = NULL; free(ip, M_EXT2NODE); return (error); } vp->v_data = ip; ip->i_vnode = vp; ip->i_e2fs = fs = ump->um_e2fs; ip->i_ump = ump; ip->i_number = ino; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); error = insmntque(vp, mp); if (error != 0) { free(ip, M_EXT2NODE); *vpp = NULL; return (error); } error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } /* convert ext2 inode to dinode */ ext2_ei2i((struct ext2fs_dinode *) ((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ino)), ip); ip->i_block_group = ino_to_cg(fs, ino); ip->i_next_alloc_block = 0; ip->i_next_alloc_goal = 0; /* * Now we want to make sure that block pointers for unused * blocks are zeroed out - ext2_balloc depends on this * although for regular files and directories only * * If IN_E4EXTENTS is enabled, unused blocks are not zeroed * out because we could corrupt the extent tree. */ if (!(ip->i_flag & IN_E4EXTENTS) && (S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode))) { used_blocks = (ip->i_size+fs->e2fs_bsize-1) / fs->e2fs_bsize; for (i = used_blocks; i < EXT2_NDIR_BLOCKS; i++) ip->i_db[i] = 0; } #ifdef EXT2FS_DEBUG ext2_print_inode(ip); #endif bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = random() + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) ip->i_flag |= IN_MODIFIED; } *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ext2_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ext2_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct inode *ip; struct ufid *ufhp; struct vnode *nvp; struct m_ext2fs *fs; int error; ufhp = (struct ufid *)fhp; fs = VFSTOEXT2(mp)->um_e2fs; if (ufhp->ufid_ino < EXT2_ROOTINO || ufhp->ufid_ino > fs->e2fs_gcount * fs->e2fs->e2fs_ipg) return (ESTALE); error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp); if (error) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen || ip->i_nlink <= 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, 0, curthread); return (0); } /* * Write a superblock and associated information back to disk. */ static int ext2_sbupdate(struct ext2mount *mp, int waitfor) { struct m_ext2fs *fs = mp->um_e2fs; struct ext2fs *es = fs->e2fs; struct buf *bp; int error = 0; bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0, 0); bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2fs)); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); /* * The buffers for group descriptors, inode bitmaps and block bitmaps * are not busy at this point and are (hopefully) written by the * usual sync mechanism. No need to write them here. */ return (error); } int ext2_cgupdate(struct ext2mount *mp, int waitfor) { struct m_ext2fs *fs = mp->um_e2fs; struct buf *bp; int i, error = 0, allerror = 0; allerror = ext2_sbupdate(mp, waitfor); for (i = 0; i < fs->e2fs_gdbcount; i++) { bp = getblk(mp->um_devvp, fsbtodb(fs, fs->e2fs->e2fs_first_dblock + 1 /* superblock */ + i), fs->e2fs_bsize, 0, 0, 0); e2fs_cgsave(&fs->e2fs_gd[ i * fs->e2fs_bsize / sizeof(struct ext2_gd)], (struct ext2_gd *)bp->b_data, fs->e2fs_bsize); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); } if (!allerror && error) allerror = error; return (allerror); } /* * Return the root of a filesystem. */ static int ext2_root(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *nvp; int error; error = VFS_VGET(mp, EXT2_ROOTINO, LK_EXCLUSIVE, &nvp); if (error) return (error); *vpp = nvp; return (0); } Index: stable/10/sys/fs/fuse/fuse_vfsops.c =================================================================== --- stable/10/sys/fs/fuse/fuse_vfsops.c (revision 282269) +++ stable/10/sys/fs/fuse/fuse_vfsops.c (revision 282270) @@ -1,533 +1,534 @@ /* * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_param.h" #include "fuse_node.h" #include "fuse_ipc.h" #include "fuse_internal.h" #include #include #define FUSE_DEBUG_MODULE VFSOPS #include "fuse_debug.h" /* This will do for privilege types for now */ #ifndef PRIV_VFS_FUSE_ALLOWOTHER #define PRIV_VFS_FUSE_ALLOWOTHER PRIV_VFS_MOUNT_NONUSER #endif #ifndef PRIV_VFS_FUSE_MOUNT_NONUSER #define PRIV_VFS_FUSE_MOUNT_NONUSER PRIV_VFS_MOUNT_NONUSER #endif #ifndef PRIV_VFS_FUSE_SYNC_UNMOUNT #define PRIV_VFS_FUSE_SYNC_UNMOUNT PRIV_VFS_MOUNT_NONUSER #endif static vfs_mount_t fuse_vfsop_mount; static vfs_unmount_t fuse_vfsop_unmount; static vfs_root_t fuse_vfsop_root; static vfs_statfs_t fuse_vfsop_statfs; struct vfsops fuse_vfsops = { .vfs_mount = fuse_vfsop_mount, .vfs_unmount = fuse_vfsop_unmount, .vfs_root = fuse_vfsop_root, .vfs_statfs = fuse_vfsop_statfs, }; SYSCTL_INT(_vfs_fuse, OID_AUTO, init_backgrounded, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 1, "indicate async handshake"); static int fuse_enforce_dev_perms = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, enforce_dev_perms, CTLFLAG_RW, &fuse_enforce_dev_perms, 0, "enforce fuse device permissions for secondary mounts"); static unsigned sync_unmount = 1; SYSCTL_UINT(_vfs_fuse, OID_AUTO, sync_unmount, CTLFLAG_RW, &sync_unmount, 0, "specify when to use synchronous unmount"); MALLOC_DEFINE(M_FUSEVFS, "fuse_filesystem", "buffer for fuse vfs layer"); static int fuse_getdevice(const char *fspec, struct thread *td, struct cdev **fdevp) { struct nameidata nd, *ndp = &nd; struct vnode *devvp; struct cdev *fdev; int err; /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, fspec, td); if ((err = namei(ndp)) != 0) return err; NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (devvp->v_type != VCHR) { vrele(devvp); return ENXIO; } fdev = devvp->v_rdev; dev_ref(fdev); if (fuse_enforce_dev_perms) { /* * Check if mounter can open the fuse device. * * This has significance only if we are doing a secondary mount * which doesn't involve actually opening fuse devices, but we * still want to enforce the permissions of the device (in * order to keep control over the circle of fuse users). * * (In case of primary mounts, we are either the superuser so * we can do anything anyway, or we can mount only if the * device is already opened by us, ie. we are permitted to open * the device.) */ #if 0 #ifdef MAC err = mac_check_vnode_open(td->td_ucred, devvp, VREAD | VWRITE); if (!err) #endif #endif /* 0 */ err = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (err) { vrele(devvp); dev_rel(fdev); return err; } } /* * according to coda code, no extra lock is needed -- * although in sys/vnode.h this field is marked "v" */ vrele(devvp); if (!fdev->si_devsw || strcmp("fuse", fdev->si_devsw->d_name)) { dev_rel(fdev); return ENXIO; } *fdevp = fdev; return 0; } #define FUSE_FLAGOPT(fnam, fval) do { \ vfs_flagopt(opts, #fnam, &mntopts, fval); \ vfs_flagopt(opts, "__" #fnam, &__mntopts, fval); \ } while (0) static int fuse_vfsop_mount(struct mount *mp) { int err; uint64_t mntopts, __mntopts; int max_read_set; uint32_t max_read; int daemon_timeout; int fd; size_t len; struct cdev *fdev; struct fuse_data *data; struct thread *td; struct file *fp, *fptmp; char *fspec, *subtype; struct vfsoptlist *opts; cap_rights_t rights; subtype = NULL; max_read_set = 0; max_read = ~0; err = 0; mntopts = 0; __mntopts = 0; td = curthread; fuse_trace_printf_vfsop(); if (mp->mnt_flag & MNT_UPDATE) return EOPNOTSUPP; MNT_ILOCK(mp); mp->mnt_flag |= MNT_SYNCHRONOUS; mp->mnt_data = NULL; MNT_IUNLOCK(mp); /* Get the new options passed to mount */ opts = mp->mnt_optnew; if (!opts) return EINVAL; /* `fspath' contains the mount point (eg. /mnt/fuse/sshfs); REQUIRED */ if (!vfs_getopts(opts, "fspath", &err)) return err; /* `from' contains the device name (eg. /dev/fuse0); REQUIRED */ fspec = vfs_getopts(opts, "from", &err); if (!fspec) return err; /* `fd' contains the filedescriptor for this session; REQUIRED */ if (vfs_scanopt(opts, "fd", "%d", &fd) != 1) return EINVAL; err = fuse_getdevice(fspec, td, &fdev); if (err != 0) return err; /* * With the help of underscored options the mount program * can inform us from the flags it sets by default */ FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY); FUSE_FLAGOPT(push_symlinks_in, FSESS_PUSH_SYMLINKS_IN); FUSE_FLAGOPT(default_permissions, FSESS_DEFAULT_PERMISSIONS); FUSE_FLAGOPT(no_attrcache, FSESS_NO_ATTRCACHE); FUSE_FLAGOPT(no_readahed, FSESS_NO_READAHEAD); FUSE_FLAGOPT(no_datacache, FSESS_NO_DATACACHE); FUSE_FLAGOPT(no_namecache, FSESS_NO_NAMECACHE); FUSE_FLAGOPT(no_mmap, FSESS_NO_MMAP); FUSE_FLAGOPT(brokenio, FSESS_BROKENIO); if (vfs_scanopt(opts, "max_read=", "%u", &max_read) == 1) max_read_set = 1; if (vfs_scanopt(opts, "timeout=", "%u", &daemon_timeout) == 1) { if (daemon_timeout < FUSE_MIN_DAEMON_TIMEOUT) daemon_timeout = FUSE_MIN_DAEMON_TIMEOUT; else if (daemon_timeout > FUSE_MAX_DAEMON_TIMEOUT) daemon_timeout = FUSE_MAX_DAEMON_TIMEOUT; } else { daemon_timeout = FUSE_DEFAULT_DAEMON_TIMEOUT; } subtype = vfs_getopts(opts, "subtype=", &err); FS_DEBUG2G("mntopts 0x%jx\n", (uintmax_t)mntopts); err = fget(td, fd, cap_rights_init(&rights, CAP_READ), &fp); if (err != 0) { FS_DEBUG("invalid or not opened device: data=%p\n", data); goto out; } fptmp = td->td_fpop; td->td_fpop = fp; err = devfs_get_cdevpriv((void **)&data); td->td_fpop = fptmp; fdrop(fp, td); FUSE_LOCK(); if (err != 0 || data == NULL || data->mp != NULL) { FS_DEBUG("invalid or not opened device: data=%p data.mp=%p\n", data, data != NULL ? data->mp : NULL); err = ENXIO; FUSE_UNLOCK(); goto out; } if (fdata_get_dead(data)) { FS_DEBUG("device is dead during mount: data=%p\n", data); err = ENOTCONN; FUSE_UNLOCK(); goto out; } /* Sanity + permission checks */ if (!data->daemoncred) panic("fuse daemon found, but identity unknown"); if (mntopts & FSESS_DAEMON_CAN_SPY) err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER); if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid) /* are we allowed to do the first mount? */ err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER); if (err) { FUSE_UNLOCK(); goto out; } data->ref++; data->mp = mp; data->dataflags |= mntopts; data->max_read = max_read; data->daemon_timeout = daemon_timeout; FUSE_UNLOCK(); vfs_getnewfsid(mp); MNT_ILOCK(mp); mp->mnt_data = data; mp->mnt_flag |= MNT_LOCAL; + mp->mnt_kern_flag |= MNTK_USES_BCACHE; MNT_IUNLOCK(mp); /* We need this here as this slot is used by getnewvnode() */ mp->mnt_stat.f_iosize = PAGE_SIZE; if (subtype) { strlcat(mp->mnt_stat.f_fstypename, ".", MFSNAMELEN); strlcat(mp->mnt_stat.f_fstypename, subtype, MFSNAMELEN); } copystr(fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &len); bzero(mp->mnt_stat.f_mntfromname + len, MNAMELEN - len); FS_DEBUG2G("mp %p: %s\n", mp, mp->mnt_stat.f_mntfromname); /* Now handshaking with daemon */ fuse_internal_send_init(data, td); out: if (err) { FUSE_LOCK(); if (data->mp == mp) { /* * Destroy device only if we acquired reference to * it */ FS_DEBUG("mount failed, destroy device: data=%p mp=%p" " err=%d\n", data, mp, err); data->mp = NULL; fdata_trydestroy(data); } FUSE_UNLOCK(); dev_rel(fdev); } return err; } static int fuse_vfsop_unmount(struct mount *mp, int mntflags) { int err = 0; int flags = 0; struct cdev *fdev; struct fuse_data *data; struct fuse_dispatcher fdi; struct thread *td = curthread; fuse_trace_printf_vfsop(); if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } data = fuse_get_mpdata(mp); if (!data) { panic("no private data for mount point?"); } /* There is 1 extra root vnode reference (mp->mnt_data). */ FUSE_LOCK(); if (data->vroot != NULL) { struct vnode *vroot = data->vroot; data->vroot = NULL; FUSE_UNLOCK(); vrele(vroot); } else FUSE_UNLOCK(); err = vflush(mp, 0, flags, td); if (err) { debug_printf("vflush failed"); return err; } if (fdata_get_dead(data)) { goto alreadydead; } fdisp_init(&fdi, 0); fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL); err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); fdata_set_dead(data); alreadydead: FUSE_LOCK(); data->mp = NULL; fdev = data->fdev; fdata_trydestroy(data); FUSE_UNLOCK(); MNT_ILOCK(mp); mp->mnt_data = NULL; mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); dev_rel(fdev); return 0; } static int fuse_vfsop_root(struct mount *mp, int lkflags, struct vnode **vpp) { struct fuse_data *data = fuse_get_mpdata(mp); int err = 0; if (data->vroot != NULL) { err = vget(data->vroot, lkflags, curthread); if (err == 0) *vpp = data->vroot; } else { err = fuse_vnode_get(mp, FUSE_ROOT_ID, NULL, vpp, NULL, VDIR); if (err == 0) { FUSE_LOCK(); MPASS(data->vroot == NULL || data->vroot == *vpp); if (data->vroot == NULL) { FS_DEBUG("new root vnode\n"); data->vroot = *vpp; FUSE_UNLOCK(); vref(*vpp); } else if (data->vroot != *vpp) { FS_DEBUG("root vnode race\n"); FUSE_UNLOCK(); VOP_UNLOCK(*vpp, 0); vrele(*vpp); vrecycle(*vpp); *vpp = data->vroot; } else FUSE_UNLOCK(); } } return err; } static int fuse_vfsop_statfs(struct mount *mp, struct statfs *sbp) { struct fuse_dispatcher fdi; int err = 0; struct fuse_statfs_out *fsfo; struct fuse_data *data; FS_DEBUG2G("mp %p: %s\n", mp, mp->mnt_stat.f_mntfromname); data = fuse_get_mpdata(mp); if (!(data->dataflags & FSESS_INITED)) goto fake; fdisp_init(&fdi, 0); fdisp_make(&fdi, FUSE_STATFS, mp, FUSE_ROOT_ID, NULL, NULL); err = fdisp_wait_answ(&fdi); if (err) { fdisp_destroy(&fdi); if (err == ENOTCONN) { /* * We want to seem a legitimate fs even if the daemon * is stiff dead... (so that, eg., we can still do path * based unmounting after the daemon dies). */ goto fake; } return err; } fsfo = fdi.answ; sbp->f_blocks = fsfo->st.blocks; sbp->f_bfree = fsfo->st.bfree; sbp->f_bavail = fsfo->st.bavail; sbp->f_files = fsfo->st.files; sbp->f_ffree = fsfo->st.ffree; /* cast from uint64_t to int64_t */ sbp->f_namemax = fsfo->st.namelen; sbp->f_bsize = fsfo->st.frsize; /* cast from uint32_t to uint64_t */ FS_DEBUG("fuse_statfs_out -- blocks: %llu, bfree: %llu, bavail: %llu, " "fil es: %llu, ffree: %llu, bsize: %i, namelen: %i\n", (unsigned long long)fsfo->st.blocks, (unsigned long long)fsfo->st.bfree, (unsigned long long)fsfo->st.bavail, (unsigned long long)fsfo->st.files, (unsigned long long)fsfo->st.ffree, fsfo->st.bsize, fsfo->st.namelen); fdisp_destroy(&fdi); return 0; fake: sbp->f_blocks = 0; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 0; sbp->f_ffree = 0; sbp->f_namemax = 0; sbp->f_bsize = FUSE_DEFAULT_BLOCKSIZE; return 0; } Index: stable/10/sys/fs/msdosfs/msdosfs_vfsops.c =================================================================== --- stable/10/sys/fs/msdosfs/msdosfs_vfsops.c (revision 282269) +++ stable/10/sys/fs/msdosfs/msdosfs_vfsops.c (revision 282270) @@ -1,1035 +1,1036 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static const char msdosfs_lock_msg[] = "fatlk"; /* Mount options that we support. */ static const char *msdosfs_opts[] = { "async", "noatime", "noclusterr", "noclusterw", "export", "force", "from", "sync", "cs_dos", "cs_local", "cs_win", "dirmask", "gid", "kiconv", "large", "longname", "longnames", "mask", "shortname", "shortnames", "uid", "win95", "nowin95", NULL }; #if 1 /*def PC98*/ /* * XXX - The boot signature formatted by NEC PC-98 DOS looks like a * garbage or a random value :-{ * If you want to use that broken-signatured media, define the * following symbol even though PC/AT. * (ex. mount PC-98 DOS formatted FD on PC/AT) */ #define MSDOSFS_NOCHECKSIG #endif MALLOC_DEFINE(M_MSDOSFSMNT, "msdosfs_mount", "MSDOSFS mount structure"); static MALLOC_DEFINE(M_MSDOSFSFAT, "msdosfs_fat", "MSDOSFS file allocation table"); struct iconv_functions *msdosfs_iconv; static int update_mp(struct mount *mp, struct thread *td); static int mountmsdosfs(struct vnode *devvp, struct mount *mp); static vfs_fhtovp_t msdosfs_fhtovp; static vfs_mount_t msdosfs_mount; static vfs_root_t msdosfs_root; static vfs_statfs_t msdosfs_statfs; static vfs_sync_t msdosfs_sync; static vfs_unmount_t msdosfs_unmount; /* Maximum length of a character set name (arbitrary). */ #define MAXCSLEN 64 static int update_mp(struct mount *mp, struct thread *td) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); void *dos, *win, *local; int error, v; if (!vfs_getopt(mp->mnt_optnew, "kiconv", NULL, NULL)) { if (msdosfs_iconv != NULL) { error = vfs_getopt(mp->mnt_optnew, "cs_win", &win, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_local", &local, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_dos", &dos, NULL); if (!error) { msdosfs_iconv->open(win, local, &pmp->pm_u2w); msdosfs_iconv->open(local, win, &pmp->pm_w2u); msdosfs_iconv->open(dos, local, &pmp->pm_u2d); msdosfs_iconv->open(local, dos, &pmp->pm_d2u); } if (error != 0) return (error); } else { pmp->pm_w2u = NULL; pmp->pm_u2w = NULL; pmp->pm_d2u = NULL; pmp->pm_u2d = NULL; } } if (vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v) == 1) pmp->pm_gid = v; if (vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v) == 1) pmp->pm_uid = v; if (vfs_scanopt(mp->mnt_optnew, "mask", "%d", &v) == 1) pmp->pm_mask = v & ALLPERMS; if (vfs_scanopt(mp->mnt_optnew, "dirmask", "%d", &v) == 1) pmp->pm_dirmask = v & ALLPERMS; vfs_flagopt(mp->mnt_optnew, "shortname", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "shortnames", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "longname", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "longnames", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "kiconv", &pmp->pm_flags, MSDOSFSMNT_KICONV); if (vfs_getopt(mp->mnt_optnew, "nowin95", NULL, NULL) == 0) pmp->pm_flags |= MSDOSFSMNT_NOWIN95; else pmp->pm_flags &= ~MSDOSFSMNT_NOWIN95; if (pmp->pm_flags & MSDOSFSMNT_NOWIN95) pmp->pm_flags |= MSDOSFSMNT_SHORTNAME; else if (!(pmp->pm_flags & (MSDOSFSMNT_SHORTNAME | MSDOSFSMNT_LONGNAME))) { struct vnode *rootvp; /* * Try to divine whether to support Win'95 long filenames */ if (FAT32(pmp)) pmp->pm_flags |= MSDOSFSMNT_LONGNAME; else { if ((error = msdosfs_root(mp, LK_EXCLUSIVE, &rootvp)) != 0) return error; pmp->pm_flags |= findwin95(VTODE(rootvp)) ? MSDOSFSMNT_LONGNAME : MSDOSFSMNT_SHORTNAME; vput(rootvp); } } return 0; } static int msdosfs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct msdosfs_args args; struct export_args exp; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); vfs_oexport_conv(&args.export, &exp); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &exp, sizeof(exp)); ma = mount_argf(ma, "uid", "%d", args.uid); ma = mount_argf(ma, "gid", "%d", args.gid); ma = mount_argf(ma, "mask", "%d", args.mask); ma = mount_argf(ma, "dirmask", "%d", args.dirmask); ma = mount_argb(ma, args.flags & MSDOSFSMNT_SHORTNAME, "noshortname"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_LONGNAME, "nolongname"); ma = mount_argb(ma, !(args.flags & MSDOSFSMNT_NOWIN95), "nowin95"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_KICONV, "nokiconv"); ma = mount_argsu(ma, "cs_win", args.cs_win, MAXCSLEN); ma = mount_argsu(ma, "cs_dos", args.cs_dos, MAXCSLEN); ma = mount_argsu(ma, "cs_local", args.cs_local, MAXCSLEN); error = kernel_mount(ma, flags); return (error); } /* * mp - path - addr in user space of mount point (ie /usr or whatever) * data - addr in user space of mount params including the name of the block * special file to treat as a filesystem. */ static int msdosfs_mount(struct mount *mp) { struct vnode *devvp; /* vnode for blk device to mount */ struct thread *td; /* msdosfs specific mount control block */ struct msdosfsmount *pmp = NULL; struct nameidata ndp; int error, flags; accmode_t accmode; char *from; td = curthread; if (vfs_filteropt(mp->mnt_optnew, msdosfs_opts)) return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { pmp = VFSTOMSDOSFS(mp); if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) { /* * Forbid export requests if filesystem has * MSDOSFS_LARGEFS flag set. */ if ((pmp->pm_flags & MSDOSFS_LARGEFS) != 0) { vfs_mount_error(mp, "MSDOSFS_LARGEFS flag set, cannot export"); return (EOPNOTSUPP); } } if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, 0, flags, td); if (error) return (error); /* * Now the volume is clean. Mark it so while the * device is still rw. */ error = markvoldirty(pmp, 0); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* Downgrade the device from rw to ro. */ DROP_GIANT(); g_topology_lock(); error = g_access(pmp->pm_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* * Backing out after an error was painful in the * above. Now we are committed to succeeding. */ pmp->pm_fmod = 0; pmp->pm_flags |= MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); } else if ((pmp->pm_flags & MSDOSFSMNT_RONLY) && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ devvp = pmp->pm_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0); return (error); } VOP_UNLOCK(devvp, 0); DROP_GIANT(); g_topology_lock(); error = g_access(pmp->pm_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); pmp->pm_fmod = 1; pmp->pm_flags &= ~MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); /* Now that the volume is modifiable, mark it dirty. */ error = markvoldirty(pmp, 1); if (error) return (error); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL)) return (EINVAL); NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td); error = namei(&ndp); if (error) return (error); devvp = ndp.ni_vp; NDFREE(&ndp, NDF_ONLY_PNBUF); if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = mountmsdosfs(devvp, mp); #ifdef MSDOSFS_DEBUG /* only needed for the printf below */ pmp = VFSTOMSDOSFS(mp); #endif } else { vput(devvp); if (devvp != pmp->pm_devvp) return (EINVAL); /* XXX needs translation */ } if (error) { vrele(devvp); return (error); } error = update_mp(mp, td); if (error) { if ((mp->mnt_flag & MNT_UPDATE) == 0) msdosfs_unmount(mp, MNT_FORCE); return error; } if (devvp->v_type == VCHR && devvp->v_rdev != NULL) devvp->v_rdev->si_mountpt = mp; vfs_mountedfrom(mp, from); #ifdef MSDOSFS_DEBUG printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap); #endif return (0); } static int mountmsdosfs(struct vnode *devvp, struct mount *mp) { struct msdosfsmount *pmp; struct buf *bp; struct cdev *dev; union bootsector *bsp; struct byte_bpb33 *b33; struct byte_bpb50 *b50; struct byte_bpb710 *b710; u_int8_t SecPerClust; u_long clusters; int ronly, error; struct g_consumer *cp; struct bufobj *bo; bp = NULL; /* This and pmp both used in error_exit. */ pmp = NULL; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; dev = devvp->v_rdev; dev_ref(dev); DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 0 : 1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0); if (error) goto error_exit; bo = &devvp->v_bufobj; /* * Read the boot sector of the filesystem, and then check the * boot signature. If not a dos boot sector then error out. * * NOTE: 8192 is a magic size that works for ffs. */ error = bread(devvp, 0, 8192, NOCRED, &bp); if (error) goto error_exit; bp->b_flags |= B_AGE; bsp = (union bootsector *)bp->b_data; b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB; b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB; b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB; #ifndef MSDOSFS_NOCHECKSIG if (bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) { error = EINVAL; goto error_exit; } #endif pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK | M_ZERO); pmp->pm_mountp = mp; pmp->pm_cp = cp; pmp->pm_bo = bo; lockinit(&pmp->pm_fatlock, 0, msdosfs_lock_msg, 0, 0); /* * Initialize ownerships and permissions, since nothing else will * initialize them iff we are mounting root. */ pmp->pm_uid = UID_ROOT; pmp->pm_gid = GID_WHEEL; pmp->pm_mask = pmp->pm_dirmask = S_IXUSR | S_IXGRP | S_IXOTH | S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR; /* * Experimental support for large MS-DOS filesystems. * WARNING: This uses at least 32 bytes of kernel memory (which is not * reclaimed until the FS is unmounted) for each file on disk to map * between the 32-bit inode numbers used by VFS and the 64-bit * pseudo-inode numbers used internally by msdosfs. This is only * safe to use in certain controlled situations (e.g. read-only FS * with less than 1 million files). * Since the mappings do not persist across unmounts (or reboots), these * filesystems are not suitable for exporting through NFS, or any other * application that requires fixed inode numbers. */ vfs_flagopt(mp->mnt_optnew, "large", &pmp->pm_flags, MSDOSFS_LARGEFS); /* * Compute several useful quantities from the bpb in the * bootsector. Copy in the dos 5 variant of the bpb then fix up * the fields that are different between dos 5 and dos 3.3. */ SecPerClust = b50->bpbSecPerClust; pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec); if (pmp->pm_BytesPerSec < DEV_BSIZE) { error = EINVAL; goto error_exit; } pmp->pm_ResSectors = getushort(b50->bpbResSectors); pmp->pm_FATs = b50->bpbFATs; pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts); pmp->pm_Sectors = getushort(b50->bpbSectors); pmp->pm_FATsecs = getushort(b50->bpbFATsecs); pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack); pmp->pm_Heads = getushort(b50->bpbHeads); pmp->pm_Media = b50->bpbMedia; /* calculate the ratio of sector size to DEV_BSIZE */ pmp->pm_BlkPerSec = pmp->pm_BytesPerSec / DEV_BSIZE; /* * We don't check pm_Heads nor pm_SecPerTrack, because * these may not be set for EFI file systems. We don't * use these anyway, so we're unaffected if they are * invalid. */ if (!pmp->pm_BytesPerSec || !SecPerClust) { error = EINVAL; goto error_exit; } if (pmp->pm_Sectors == 0) { pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs); pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors); } else { pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs); pmp->pm_HugeSectors = pmp->pm_Sectors; } if (!(pmp->pm_flags & MSDOSFS_LARGEFS)) { if (pmp->pm_HugeSectors > 0xffffffff / (pmp->pm_BytesPerSec / sizeof(struct direntry)) + 1) { /* * We cannot deal currently with this size of disk * due to fileid limitations (see msdosfs_getattr and * msdosfs_readdir) */ error = EINVAL; vfs_mount_error(mp, "Disk too big, try '-o large' mount option"); goto error_exit; } } if (pmp->pm_RootDirEnts == 0) { if (pmp->pm_FATsecs || getushort(b710->bpbFSVers)) { error = EINVAL; #ifdef MSDOSFS_DEBUG printf("mountmsdosfs(): bad FAT32 filesystem\n"); #endif goto error_exit; } pmp->pm_fatmask = FAT32_MASK; pmp->pm_fatmult = 4; pmp->pm_fatdiv = 1; pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs); if (getushort(b710->bpbExtFlags) & FATMIRROR) pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM; else pmp->pm_flags |= MSDOSFS_FATMIRROR; } else pmp->pm_flags |= MSDOSFS_FATMIRROR; /* * Check a few values (could do some more): * - logical sector size: power of 2, >= block size * - sectors per cluster: power of 2, >= 1 * - number of sectors: >= 1, <= size of partition * - number of FAT sectors: >= 1 */ if ( (SecPerClust == 0) || (SecPerClust & (SecPerClust - 1)) || (pmp->pm_BytesPerSec < DEV_BSIZE) || (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1)) || (pmp->pm_HugeSectors == 0) || (pmp->pm_FATsecs == 0) || (SecPerClust * pmp->pm_BlkPerSec > MAXBSIZE / DEV_BSIZE) ) { error = EINVAL; goto error_exit; } pmp->pm_HugeSectors *= pmp->pm_BlkPerSec; pmp->pm_HiddenSects *= pmp->pm_BlkPerSec; /* XXX not used? */ pmp->pm_FATsecs *= pmp->pm_BlkPerSec; SecPerClust *= pmp->pm_BlkPerSec; pmp->pm_fatblk = pmp->pm_ResSectors * pmp->pm_BlkPerSec; if (FAT32(pmp)) { pmp->pm_rootdirblk = getulong(b710->bpbRootClust); pmp->pm_firstcluster = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_fsinfo = getushort(b710->bpbFSInfo) * pmp->pm_BlkPerSec; } else { pmp->pm_rootdirblk = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_rootdirsize = (pmp->pm_RootDirEnts * sizeof(struct direntry) + DEV_BSIZE - 1) / DEV_BSIZE; /* in blocks */ pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize; } pmp->pm_maxcluster = (pmp->pm_HugeSectors - pmp->pm_firstcluster) / SecPerClust + 1; pmp->pm_fatsize = pmp->pm_FATsecs * DEV_BSIZE; /* XXX not used? */ if (pmp->pm_fatmask == 0) { if (pmp->pm_maxcluster <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) { /* * This will usually be a floppy disk. This size makes * sure that one fat entry will not be split across * multiple blocks. */ pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv; if (pmp->pm_maxcluster >= clusters) { #ifdef MSDOSFS_DEBUG printf("Warning: number of clusters (%ld) exceeds FAT " "capacity (%ld)\n", pmp->pm_maxcluster + 1, clusters); #endif pmp->pm_maxcluster = clusters - 1; } if (FAT12(pmp)) pmp->pm_fatblocksize = 3 * 512; else pmp->pm_fatblocksize = PAGE_SIZE; pmp->pm_fatblocksize = roundup(pmp->pm_fatblocksize, pmp->pm_BytesPerSec); pmp->pm_fatblocksec = pmp->pm_fatblocksize / DEV_BSIZE; pmp->pm_bnshift = ffs(DEV_BSIZE) - 1; /* * Compute mask and shift value for isolating cluster relative byte * offsets and cluster numbers from a file offset. */ pmp->pm_bpcluster = SecPerClust * DEV_BSIZE; pmp->pm_crbomask = pmp->pm_bpcluster - 1; pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1; /* * Check for valid cluster size * must be a power of 2 */ if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) { error = EINVAL; goto error_exit; } /* * Release the bootsector buffer. */ brelse(bp); bp = NULL; /* * Check the fsinfo sector if we have one. Silently fix up our * in-core copy of fp->fsinxtfree if it is unknown (0xffffffff) * or too large. Ignore fp->fsinfree for now, since we need to * read the entire FAT anyway to fill the inuse map. */ if (pmp->pm_fsinfo) { struct fsinfo *fp; if ((error = bread(devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp)) != 0) goto error_exit; fp = (struct fsinfo *)bp->b_data; if (!bcmp(fp->fsisig1, "RRaA", 4) && !bcmp(fp->fsisig2, "rrAa", 4) && !bcmp(fp->fsisig3, "\0\0\125\252", 4)) { pmp->pm_nxtfree = getulong(fp->fsinxtfree); if (pmp->pm_nxtfree > pmp->pm_maxcluster) pmp->pm_nxtfree = CLUST_FIRST; } else pmp->pm_fsinfo = 0; brelse(bp); bp = NULL; } /* * Finish initializing pmp->pm_nxtfree (just in case the first few * sectors aren't properly reserved in the FAT). This completes * the fixup for fp->fsinxtfree, and fixes up the zero-initialized * value if there is no fsinfo. We will use pmp->pm_nxtfree * internally even if there is no fsinfo. */ if (pmp->pm_nxtfree < CLUST_FIRST) pmp->pm_nxtfree = CLUST_FIRST; /* * Allocate memory for the bitmap of allocated clusters, and then * fill it in. */ pmp->pm_inusemap = malloc(howmany(pmp->pm_maxcluster + 1, N_INUSEBITS) * sizeof(*pmp->pm_inusemap), M_MSDOSFSFAT, M_WAITOK); /* * fillinusemap() needs pm_devvp. */ pmp->pm_devvp = devvp; pmp->pm_dev = dev; /* * Have the inuse map filled in. */ MSDOSFS_LOCK_MP(pmp); error = fillinusemap(pmp); MSDOSFS_UNLOCK_MP(pmp); if (error != 0) goto error_exit; /* * If they want fat updates to be synchronous then let them suffer * the performance degradation in exchange for the on disk copy of * the fat being correct just about all the time. I suppose this * would be a good thing to turn on if the kernel is still flakey. */ if (mp->mnt_flag & MNT_SYNCHRONOUS) pmp->pm_flags |= MSDOSFSMNT_WAITONFAT; /* * Finish up. */ if (ronly) pmp->pm_flags |= MSDOSFSMNT_RONLY; else { if ((error = markvoldirty(pmp, 1)) != 0) { (void)markvoldirty(pmp, 0); goto error_exit; } pmp->pm_fmod = 1; } mp->mnt_data = pmp; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; + mp->mnt_kern_flag |= MNTK_USES_BCACHE; MNT_IUNLOCK(mp); if (pmp->pm_flags & MSDOSFS_LARGEFS) msdosfs_fileno_init(mp); return 0; error_exit: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); PICKUP_GIANT(); } if (pmp) { lockdestroy(&pmp->pm_fatlock); if (pmp->pm_inusemap) free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; } dev_rel(dev); return (error); } /* * Unmount the filesystem described by mp. */ static int msdosfs_unmount(struct mount *mp, int mntflags) { struct msdosfsmount *pmp; int error, flags; error = flags = 0; pmp = VFSTOMSDOSFS(mp); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) error = msdosfs_sync(mp, MNT_WAIT); if ((mntflags & MNT_FORCE) != 0) flags |= FORCECLOSE; else if (error != 0) return (error); error = vflush(mp, 0, flags, curthread); if (error != 0 && error != ENXIO) return (error); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) { error = markvoldirty(pmp, 0); if (error && error != ENXIO) { (void)markvoldirty(pmp, 1); return (error); } } if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) { if (pmp->pm_w2u) msdosfs_iconv->close(pmp->pm_w2u); if (pmp->pm_u2w) msdosfs_iconv->close(pmp->pm_u2w); if (pmp->pm_d2u) msdosfs_iconv->close(pmp->pm_d2u); if (pmp->pm_u2d) msdosfs_iconv->close(pmp->pm_u2d); } #ifdef MSDOSFS_DEBUG { struct vnode *vp = pmp->pm_devvp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); VI_LOCK(vp); vn_printf(vp, "msdosfs_umount(): just before calling VOP_CLOSE()\n"); printf("freef %p, freeb %p, mount %p\n", TAILQ_NEXT(vp, v_actfreelist), vp->v_actfreelist.tqe_prev, vp->v_mount); printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n", TAILQ_FIRST(&vp->v_bufobj.bo_clean.bv_hd), TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd), vp->v_bufobj.bo_numoutput, vp->v_type); VI_UNLOCK(vp); BO_UNLOCK(bo); } #endif DROP_GIANT(); if (pmp->pm_devvp->v_type == VCHR && pmp->pm_devvp->v_rdev != NULL) pmp->pm_devvp->v_rdev->si_mountpt = NULL; g_topology_lock(); g_vfs_close(pmp->pm_cp); g_topology_unlock(); PICKUP_GIANT(); vrele(pmp->pm_devvp); dev_rel(pmp->pm_dev); free(pmp->pm_inusemap, M_MSDOSFSFAT); if (pmp->pm_flags & MSDOSFS_LARGEFS) msdosfs_fileno_free(mp); lockdestroy(&pmp->pm_fatlock); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } static int msdosfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct denode *ndep; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp); #endif error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep); if (error) return (error); *vpp = DETOV(ndep); return (0); } static int msdosfs_statfs(struct mount *mp, struct statfs *sbp) { struct msdosfsmount *pmp; pmp = VFSTOMSDOSFS(mp); sbp->f_bsize = pmp->pm_bpcluster; sbp->f_iosize = pmp->pm_bpcluster; sbp->f_blocks = pmp->pm_maxcluster + 1; sbp->f_bfree = pmp->pm_freeclustercount; sbp->f_bavail = pmp->pm_freeclustercount; sbp->f_files = pmp->pm_RootDirEnts; /* XXX */ sbp->f_ffree = 0; /* what to put in here? */ return (0); } /* * If we have an FSInfo block, update it. */ static int msdosfs_fsiflush(struct msdosfsmount *pmp, int waitfor) { struct fsinfo *fp; struct buf *bp; int error; MSDOSFS_LOCK_MP(pmp); if (pmp->pm_fsinfo == 0 || (pmp->pm_flags & MSDOSFS_FSIMOD) == 0) { error = 0; goto unlock; } error = bread(pmp->pm_devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp); if (error != 0) { brelse(bp); goto unlock; } fp = (struct fsinfo *)bp->b_data; putulong(fp->fsinfree, pmp->pm_freeclustercount); putulong(fp->fsinxtfree, pmp->pm_nxtfree); pmp->pm_flags &= ~MSDOSFS_FSIMOD; if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); unlock: MSDOSFS_UNLOCK_MP(pmp); return (error); } static int msdosfs_sync(struct mount *mp, int waitfor) { struct vnode *vp, *nvp; struct thread *td; struct denode *dep; struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error, allerror = 0; td = curthread; /* * If we ever switch to not updating all of the fats all the time, * this would be the place to update them from the first one. */ if (pmp->pm_fmod != 0) { if (pmp->pm_flags & MSDOSFSMNT_RONLY) panic("msdosfs_sync: rofs mod"); else { /* update fats here */ } } /* * Write back each (modified) denode. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, nvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } dep = VTODE(vp); if ((dep->de_flag & (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 && (vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY)) { VI_UNLOCK(vp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td); if (error) { if (error == ENOENT) goto loop; continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp, 0); vrele(vp); } /* * Flush filesystem control info. */ if (waitfor != MNT_LAZY) { vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(pmp->pm_devvp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(pmp->pm_devvp, 0); } error = msdosfs_fsiflush(pmp, waitfor); if (error != 0) allerror = error; return (allerror); } static int msdosfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct defid *defhp = (struct defid *) fhp; struct denode *dep; int error; error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep); if (error) { *vpp = NULLVP; return (error); } *vpp = DETOV(dep); vnode_create_vobject(*vpp, dep->de_FileSize, curthread); return (0); } static struct vfsops msdosfs_vfsops = { .vfs_fhtovp = msdosfs_fhtovp, .vfs_mount = msdosfs_mount, .vfs_cmount = msdosfs_cmount, .vfs_root = msdosfs_root, .vfs_statfs = msdosfs_statfs, .vfs_sync = msdosfs_sync, .vfs_unmount = msdosfs_unmount, }; VFS_SET(msdosfs_vfsops, msdosfs, 0); MODULE_VERSION(msdosfs, 1); Index: stable/10/sys/fs/nandfs/nandfs_vfsops.c =================================================================== --- stable/10/sys/fs/nandfs/nandfs_vfsops.c (revision 282269) +++ stable/10/sys/fs/nandfs/nandfs_vfsops.c (revision 282270) @@ -1,1591 +1,1592 @@ /*- * Copyright (c) 2010-2012 Semihalf * Copyright (c) 2008, 2009 Reinoud Zandijk * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * From: NetBSD: nilfs_vfsops.c,v 1.1 2009/07/18 16:31:42 reinoud Exp */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NANDFSMNT, "nandfs_mount", "NANDFS mount structure"); #define NANDFS_SET_SYSTEMFILE(vp) { \ (vp)->v_vflag |= VV_SYSTEM; \ vref(vp); \ vput(vp); } #define NANDFS_UNSET_SYSTEMFILE(vp) { \ VOP_LOCK(vp, LK_EXCLUSIVE); \ MPASS(vp->v_bufobj.bo_dirty.bv_cnt == 0); \ (vp)->v_vflag &= ~VV_SYSTEM; \ vgone(vp); \ vput(vp); } /* Globals */ struct _nandfs_devices nandfs_devices; /* Parameters */ int nandfs_verbose = 0; static void nandfs_tunable_init(void *arg) { TUNABLE_INT_FETCH("vfs.nandfs.verbose", &nandfs_verbose); } SYSINIT(nandfs_tunables, SI_SUB_VFS, SI_ORDER_ANY, nandfs_tunable_init, NULL); static SYSCTL_NODE(_vfs, OID_AUTO, nandfs, CTLFLAG_RD, 0, "NAND filesystem"); static SYSCTL_NODE(_vfs_nandfs, OID_AUTO, mount, CTLFLAG_RD, 0, "NANDFS mountpoints"); SYSCTL_INT(_vfs_nandfs, OID_AUTO, verbose, CTLFLAG_RW, &nandfs_verbose, 0, ""); #define NANDFS_CONSTR_INTERVAL 5 int nandfs_sync_interval = NANDFS_CONSTR_INTERVAL; /* sync every 5 seconds */ SYSCTL_UINT(_vfs_nandfs, OID_AUTO, sync_interval, CTLFLAG_RW, &nandfs_sync_interval, 0, ""); #define NANDFS_MAX_DIRTY_SEGS 5 int nandfs_max_dirty_segs = NANDFS_MAX_DIRTY_SEGS; /* sync when 5 dirty seg */ SYSCTL_UINT(_vfs_nandfs, OID_AUTO, max_dirty_segs, CTLFLAG_RW, &nandfs_max_dirty_segs, 0, ""); #define NANDFS_CPS_BETWEEN_SBLOCKS 5 int nandfs_cps_between_sblocks = NANDFS_CPS_BETWEEN_SBLOCKS; /* write superblock every 5 checkpoints */ SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cps_between_sblocks, CTLFLAG_RW, &nandfs_cps_between_sblocks, 0, ""); #define NANDFS_CLEANER_ENABLE 1 int nandfs_cleaner_enable = NANDFS_CLEANER_ENABLE; SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_enable, CTLFLAG_RW, &nandfs_cleaner_enable, 0, ""); #define NANDFS_CLEANER_INTERVAL 5 int nandfs_cleaner_interval = NANDFS_CLEANER_INTERVAL; SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_interval, CTLFLAG_RW, &nandfs_cleaner_interval, 0, ""); #define NANDFS_CLEANER_SEGMENTS 5 int nandfs_cleaner_segments = NANDFS_CLEANER_SEGMENTS; SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_segments, CTLFLAG_RW, &nandfs_cleaner_segments, 0, ""); static int nandfs_mountfs(struct vnode *devvp, struct mount *mp); static vfs_mount_t nandfs_mount; static vfs_root_t nandfs_root; static vfs_statfs_t nandfs_statfs; static vfs_unmount_t nandfs_unmount; static vfs_vget_t nandfs_vget; static vfs_sync_t nandfs_sync; static const char *nandfs_opts[] = { "snap", "from", "noatime", NULL }; /* System nodes */ static int nandfs_create_system_nodes(struct nandfs_device *nandfsdev) { int error; error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_DAT_INO, &nandfsdev->nd_super_root.sr_dat, &nandfsdev->nd_dat_node); if (error) goto errorout; error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_CPFILE_INO, &nandfsdev->nd_super_root.sr_cpfile, &nandfsdev->nd_cp_node); if (error) goto errorout; error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_SUFILE_INO, &nandfsdev->nd_super_root.sr_sufile, &nandfsdev->nd_su_node); if (error) goto errorout; error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_GC_INO, NULL, &nandfsdev->nd_gc_node); if (error) goto errorout; NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_dat_node)); NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_cp_node)); NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_su_node)); NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_gc_node)); DPRINTF(VOLUMES, ("System vnodes: dat: %p cp: %p su: %p\n", NTOV(nandfsdev->nd_dat_node), NTOV(nandfsdev->nd_cp_node), NTOV(nandfsdev->nd_su_node))); return (0); errorout: nandfs_dispose_node(&nandfsdev->nd_gc_node); nandfs_dispose_node(&nandfsdev->nd_dat_node); nandfs_dispose_node(&nandfsdev->nd_cp_node); nandfs_dispose_node(&nandfsdev->nd_su_node); return (error); } static void nandfs_release_system_nodes(struct nandfs_device *nandfsdev) { if (!nandfsdev) return; if (nandfsdev->nd_refcnt > 0) return; if (nandfsdev->nd_gc_node) NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_gc_node)); if (nandfsdev->nd_dat_node) NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_dat_node)); if (nandfsdev->nd_cp_node) NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_cp_node)); if (nandfsdev->nd_su_node) NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_su_node)); } static int nandfs_check_fsdata_crc(struct nandfs_fsdata *fsdata) { uint32_t fsdata_crc, comp_crc; if (fsdata->f_magic != NANDFS_FSDATA_MAGIC) return (0); /* Preserve CRC */ fsdata_crc = fsdata->f_sum; /* Calculate */ fsdata->f_sum = (0); comp_crc = crc32((uint8_t *)fsdata, fsdata->f_bytes); /* Restore */ fsdata->f_sum = fsdata_crc; /* Check CRC */ return (fsdata_crc == comp_crc); } static int nandfs_check_superblock_crc(struct nandfs_fsdata *fsdata, struct nandfs_super_block *super) { uint32_t super_crc, comp_crc; /* Check super block magic */ if (super->s_magic != NANDFS_SUPER_MAGIC) return (0); /* Preserve CRC */ super_crc = super->s_sum; /* Calculate */ super->s_sum = (0); comp_crc = crc32((uint8_t *)super, fsdata->f_sbbytes); /* Restore */ super->s_sum = super_crc; /* Check CRC */ return (super_crc == comp_crc); } static void nandfs_calc_superblock_crc(struct nandfs_fsdata *fsdata, struct nandfs_super_block *super) { uint32_t comp_crc; /* Calculate */ super->s_sum = 0; comp_crc = crc32((uint8_t *)super, fsdata->f_sbbytes); /* Restore */ super->s_sum = comp_crc; } static int nandfs_is_empty(u_char *area, int size) { int i; for (i = 0; i < size; i++) if (area[i] != 0xff) return (0); return (1); } static __inline int nandfs_sblocks_in_esize(struct nandfs_device *fsdev) { return ((fsdev->nd_erasesize - NANDFS_SBLOCK_OFFSET_BYTES) / sizeof(struct nandfs_super_block)); } static __inline int nandfs_max_sblocks(struct nandfs_device *fsdev) { return (NANDFS_NFSAREAS * nandfs_sblocks_in_esize(fsdev)); } static __inline int nandfs_sblocks_in_block(struct nandfs_device *fsdev) { return (fsdev->nd_devblocksize / sizeof(struct nandfs_super_block)); } #if 0 static __inline int nandfs_sblocks_in_first_block(struct nandfs_device *fsdev) { int n; n = nandfs_sblocks_in_block(fsdev) - NANDFS_SBLOCK_OFFSET_BYTES / sizeof(struct nandfs_super_block); if (n < 0) n = 0; return (n); } #endif static int nandfs_write_superblock_at(struct nandfs_device *fsdev, struct nandfs_fsarea *fstp) { struct nandfs_super_block *super, *supert; struct buf *bp; int sb_per_sector, sbs_in_fsd, read_block; int index, pos, error; off_t offset; DPRINTF(SYNC, ("%s: last_used %d nandfs_sblocks_in_esize %d\n", __func__, fstp->last_used, nandfs_sblocks_in_esize(fsdev))); if (fstp->last_used == nandfs_sblocks_in_esize(fsdev) - 1) index = 0; else index = fstp->last_used + 1; super = &fsdev->nd_super; supert = NULL; sb_per_sector = nandfs_sblocks_in_block(fsdev); sbs_in_fsd = sizeof(struct nandfs_fsdata) / sizeof(struct nandfs_super_block); index += sbs_in_fsd; offset = fstp->offset; DPRINTF(SYNC, ("%s: offset %#jx s_last_pseg %#jx s_last_cno %#jx " "s_last_seq %#jx wtime %jd index %d\n", __func__, offset, super->s_last_pseg, super->s_last_cno, super->s_last_seq, super->s_wtime, index)); read_block = btodb(offset + ((index / sb_per_sector) * sb_per_sector) * sizeof(struct nandfs_super_block)); DPRINTF(SYNC, ("%s: read_block %#x\n", __func__, read_block)); if (index == sbs_in_fsd) { error = nandfs_erase(fsdev, offset, fsdev->nd_erasesize); if (error) return (error); error = bread(fsdev->nd_devvp, btodb(offset), fsdev->nd_devblocksize, NOCRED, &bp); if (error) { printf("NANDFS: couldn't read initial data: %d\n", error); brelse(bp); return (error); } memcpy(bp->b_data, &fsdev->nd_fsdata, sizeof(fsdev->nd_fsdata)); /* * 0xff-out the rest. This bp could be cached, so potentially * b_data contains stale super blocks. * * We don't mind cached bp since most of the time we just add * super blocks to already 0xff-out b_data and don't need to * perform actual read. */ if (fsdev->nd_devblocksize > sizeof(fsdev->nd_fsdata)) memset(bp->b_data + sizeof(fsdev->nd_fsdata), 0xff, fsdev->nd_devblocksize - sizeof(fsdev->nd_fsdata)); error = bwrite(bp); if (error) { printf("NANDFS: cannot rewrite initial data at %jx\n", offset); return (error); } } error = bread(fsdev->nd_devvp, read_block, fsdev->nd_devblocksize, NOCRED, &bp); if (error) { brelse(bp); return (error); } supert = (struct nandfs_super_block *)(bp->b_data); pos = index % sb_per_sector; DPRINTF(SYNC, ("%s: storing at %d\n", __func__, pos)); memcpy(&supert[pos], super, sizeof(struct nandfs_super_block)); /* * See comment above in code that performs erase. */ if (pos == 0) memset(&supert[1], 0xff, (sb_per_sector - 1) * sizeof(struct nandfs_super_block)); error = bwrite(bp); if (error) { printf("NANDFS: cannot update superblock at %jx\n", offset); return (error); } DPRINTF(SYNC, ("%s: fstp->last_used %d -> %d\n", __func__, fstp->last_used, index - sbs_in_fsd)); fstp->last_used = index - sbs_in_fsd; return (0); } int nandfs_write_superblock(struct nandfs_device *fsdev) { struct nandfs_super_block *super; struct timespec ts; int error; int i, j; vfs_timestamp(&ts); super = &fsdev->nd_super; super->s_last_pseg = fsdev->nd_last_pseg; super->s_last_cno = fsdev->nd_last_cno; super->s_last_seq = fsdev->nd_seg_sequence; super->s_wtime = ts.tv_sec; nandfs_calc_superblock_crc(&fsdev->nd_fsdata, super); error = 0; for (i = 0, j = fsdev->nd_last_fsarea; i < NANDFS_NFSAREAS; i++, j = (j + 1 % NANDFS_NFSAREAS)) { if (fsdev->nd_fsarea[j].flags & NANDFS_FSSTOR_FAILED) { DPRINTF(SYNC, ("%s: skipping %d\n", __func__, j)); continue; } error = nandfs_write_superblock_at(fsdev, &fsdev->nd_fsarea[j]); if (error) { printf("NANDFS: writing superblock at offset %d failed:" "%d\n", j * fsdev->nd_erasesize, error); fsdev->nd_fsarea[j].flags |= NANDFS_FSSTOR_FAILED; } else break; } if (i == NANDFS_NFSAREAS) { printf("NANDFS: superblock was not written\n"); /* * TODO: switch to read-only? */ return (error); } else fsdev->nd_last_fsarea = (j + 1) % NANDFS_NFSAREAS; return (0); } static int nandfs_select_fsdata(struct nandfs_device *fsdev, struct nandfs_fsdata *fsdatat, struct nandfs_fsdata **fsdata, int nfsds) { int i; *fsdata = NULL; for (i = 0; i < nfsds; i++) { DPRINTF(VOLUMES, ("%s: i %d f_magic %x f_crc %x\n", __func__, i, fsdatat[i].f_magic, fsdatat[i].f_sum)); if (!nandfs_check_fsdata_crc(&fsdatat[i])) continue; *fsdata = &fsdatat[i]; break; } return (*fsdata != NULL ? 0 : EINVAL); } static int nandfs_select_sb(struct nandfs_device *fsdev, struct nandfs_super_block *supert, struct nandfs_super_block **super, int nsbs) { int i; *super = NULL; for (i = 0; i < nsbs; i++) { if (!nandfs_check_superblock_crc(&fsdev->nd_fsdata, &supert[i])) continue; DPRINTF(SYNC, ("%s: i %d s_last_cno %jx s_magic %x " "s_wtime %jd\n", __func__, i, supert[i].s_last_cno, supert[i].s_magic, supert[i].s_wtime)); if (*super == NULL || supert[i].s_last_cno > (*super)->s_last_cno) *super = &supert[i]; } return (*super != NULL ? 0 : EINVAL); } static int nandfs_read_structures_at(struct nandfs_device *fsdev, struct nandfs_fsarea *fstp, struct nandfs_fsdata *fsdata, struct nandfs_super_block *super) { struct nandfs_super_block *tsuper, *tsuperd; struct buf *bp; int error, read_size; int i; int offset; offset = fstp->offset; if (fsdev->nd_erasesize > MAXBSIZE) read_size = MAXBSIZE; else read_size = fsdev->nd_erasesize; error = bread(fsdev->nd_devvp, btodb(offset), read_size, NOCRED, &bp); if (error) { printf("couldn't read: %d\n", error); brelse(bp); fstp->flags |= NANDFS_FSSTOR_FAILED; return (error); } tsuper = super; memcpy(fsdata, bp->b_data, sizeof(struct nandfs_fsdata)); memcpy(tsuper, (bp->b_data + sizeof(struct nandfs_fsdata)), read_size - sizeof(struct nandfs_fsdata)); brelse(bp); tsuper += (read_size - sizeof(struct nandfs_fsdata)) / sizeof(struct nandfs_super_block); for (i = 1; i < fsdev->nd_erasesize / read_size; i++) { error = bread(fsdev->nd_devvp, btodb(offset + i * read_size), read_size, NOCRED, &bp); if (error) { printf("couldn't read: %d\n", error); brelse(bp); fstp->flags |= NANDFS_FSSTOR_FAILED; return (error); } memcpy(tsuper, bp->b_data, read_size); tsuper += read_size / sizeof(struct nandfs_super_block); brelse(bp); } tsuper -= 1; fstp->last_used = nandfs_sblocks_in_esize(fsdev) - 1; for (tsuperd = super - 1; (tsuper != tsuperd); tsuper -= 1) { if (nandfs_is_empty((u_char *)tsuper, sizeof(*tsuper))) fstp->last_used--; else break; } DPRINTF(VOLUMES, ("%s: last_used %d\n", __func__, fstp->last_used)); return (0); } static int nandfs_read_structures(struct nandfs_device *fsdev) { struct nandfs_fsdata *fsdata, *fsdatat; struct nandfs_super_block *sblocks, *ssblock; int nsbs, nfsds, i; int error = 0; int nrsbs; nfsds = NANDFS_NFSAREAS; nsbs = nandfs_max_sblocks(fsdev); fsdatat = malloc(sizeof(struct nandfs_fsdata) * nfsds, M_NANDFSTEMP, M_WAITOK | M_ZERO); sblocks = malloc(sizeof(struct nandfs_super_block) * nsbs, M_NANDFSTEMP, M_WAITOK | M_ZERO); nrsbs = 0; for (i = 0; i < NANDFS_NFSAREAS; i++) { fsdev->nd_fsarea[i].offset = i * fsdev->nd_erasesize; error = nandfs_read_structures_at(fsdev, &fsdev->nd_fsarea[i], &fsdatat[i], sblocks + nrsbs); if (error) continue; nrsbs += (fsdev->nd_fsarea[i].last_used + 1); if (fsdev->nd_fsarea[fsdev->nd_last_fsarea].last_used > fsdev->nd_fsarea[i].last_used) fsdev->nd_last_fsarea = i; } if (nrsbs == 0) { printf("nandfs: no valid superblocks found\n"); error = EINVAL; goto out; } error = nandfs_select_fsdata(fsdev, fsdatat, &fsdata, nfsds); if (error) goto out; memcpy(&fsdev->nd_fsdata, fsdata, sizeof(struct nandfs_fsdata)); error = nandfs_select_sb(fsdev, sblocks, &ssblock, nsbs); if (error) goto out; memcpy(&fsdev->nd_super, ssblock, sizeof(struct nandfs_super_block)); out: free(fsdatat, M_NANDFSTEMP); free(sblocks, M_NANDFSTEMP); if (error == 0) DPRINTF(VOLUMES, ("%s: selected sb with w_time %jd " "last_pseg %#jx\n", __func__, fsdev->nd_super.s_wtime, fsdev->nd_super.s_last_pseg)); return (error); } static void nandfs_unmount_base(struct nandfs_device *nandfsdev) { int error; if (!nandfsdev) return; /* Remove all our information */ error = vinvalbuf(nandfsdev->nd_devvp, V_SAVE, 0, 0); if (error) { /* * Flushing buffers failed when fs was umounting, can't do * much now, just printf error and continue with umount. */ nandfs_error("%s(): error:%d when umounting FS\n", __func__, error); } /* Release the device's system nodes */ nandfs_release_system_nodes(nandfsdev); } static void nandfs_get_ncleanseg(struct nandfs_device *nandfsdev) { struct nandfs_seg_stat nss; nandfs_get_seg_stat(nandfsdev, &nss); nandfsdev->nd_clean_segs = nss.nss_ncleansegs; DPRINTF(VOLUMES, ("nandfs_mount: clean segs: %jx\n", (uintmax_t)nandfsdev->nd_clean_segs)); } static int nandfs_mount_base(struct nandfs_device *nandfsdev, struct mount *mp, struct nandfs_args *args) { uint32_t log_blocksize; int error; /* Flush out any old buffers remaining from a previous use. */ if ((error = vinvalbuf(nandfsdev->nd_devvp, V_SAVE, 0, 0))) return (error); error = nandfs_read_structures(nandfsdev); if (error) { printf("nandfs: could not get valid filesystem structures\n"); return (error); } if (nandfsdev->nd_fsdata.f_rev_level != NANDFS_CURRENT_REV) { printf("nandfs: unsupported file system revision: %d " "(supported is %d).\n", nandfsdev->nd_fsdata.f_rev_level, NANDFS_CURRENT_REV); return (EINVAL); } if (nandfsdev->nd_fsdata.f_erasesize != nandfsdev->nd_erasesize) { printf("nandfs: erasesize mismatch (device %#x, fs %#x)\n", nandfsdev->nd_erasesize, nandfsdev->nd_fsdata.f_erasesize); return (EINVAL); } /* Get our blocksize */ log_blocksize = nandfsdev->nd_fsdata.f_log_block_size; nandfsdev->nd_blocksize = (uint64_t) 1 << (log_blocksize + 10); DPRINTF(VOLUMES, ("%s: blocksize:%x\n", __func__, nandfsdev->nd_blocksize)); DPRINTF(VOLUMES, ("%s: accepted super block with cp %#jx\n", __func__, (uintmax_t)nandfsdev->nd_super.s_last_cno)); /* Calculate dat structure parameters */ nandfs_calc_mdt_consts(nandfsdev, &nandfsdev->nd_dat_mdt, nandfsdev->nd_fsdata.f_dat_entry_size); nandfs_calc_mdt_consts(nandfsdev, &nandfsdev->nd_ifile_mdt, nandfsdev->nd_fsdata.f_inode_size); /* Search for the super root and roll forward when needed */ if (nandfs_search_super_root(nandfsdev)) { printf("Cannot find valid SuperRoot\n"); return (EINVAL); } nandfsdev->nd_mount_state = nandfsdev->nd_super.s_state; if (nandfsdev->nd_mount_state != NANDFS_VALID_FS) { printf("FS is seriously damaged, needs repairing\n"); printf("aborting mount\n"); return (EINVAL); } /* * FS should be ok now. The superblock and the last segsum could be * updated from the repair so extract running values again. */ nandfsdev->nd_last_pseg = nandfsdev->nd_super.s_last_pseg; nandfsdev->nd_seg_sequence = nandfsdev->nd_super.s_last_seq; nandfsdev->nd_seg_num = nandfs_get_segnum_of_block(nandfsdev, nandfsdev->nd_last_pseg); nandfsdev->nd_next_seg_num = nandfs_get_segnum_of_block(nandfsdev, nandfsdev->nd_last_segsum.ss_next); nandfsdev->nd_ts.tv_sec = nandfsdev->nd_last_segsum.ss_create; nandfsdev->nd_last_cno = nandfsdev->nd_super.s_last_cno; nandfsdev->nd_fakevblk = 1; nandfsdev->nd_last_ino = NANDFS_USER_INO; DPRINTF(VOLUMES, ("%s: last_pseg %#jx last_cno %#jx last_seq %#jx\n" "fsdev: last_seg: seq %#jx num %#jx, next_seg_num %#jx\n", __func__, (uintmax_t)nandfsdev->nd_last_pseg, (uintmax_t)nandfsdev->nd_last_cno, (uintmax_t)nandfsdev->nd_seg_sequence, (uintmax_t)nandfsdev->nd_seg_sequence, (uintmax_t)nandfsdev->nd_seg_num, (uintmax_t)nandfsdev->nd_next_seg_num)); DPRINTF(VOLUMES, ("nandfs_mount: accepted super root\n")); /* Create system vnodes for DAT, CP and SEGSUM */ error = nandfs_create_system_nodes(nandfsdev); if (error) nandfs_unmount_base(nandfsdev); nandfs_get_ncleanseg(nandfsdev); return (error); } static void nandfs_unmount_device(struct nandfs_device *nandfsdev) { /* Is there anything? */ if (nandfsdev == NULL) return; /* Remove the device only if we're the last reference */ nandfsdev->nd_refcnt--; if (nandfsdev->nd_refcnt >= 1) return; MPASS(nandfsdev->nd_syncer == NULL); MPASS(nandfsdev->nd_cleaner == NULL); MPASS(nandfsdev->nd_free_base == NULL); /* Unmount our base */ nandfs_unmount_base(nandfsdev); /* Remove from our device list */ SLIST_REMOVE(&nandfs_devices, nandfsdev, nandfs_device, nd_next_device); DROP_GIANT(); g_topology_lock(); g_vfs_close(nandfsdev->nd_gconsumer); g_topology_unlock(); PICKUP_GIANT(); DPRINTF(VOLUMES, ("closing device\n")); /* Clear our mount reference and release device node */ vrele(nandfsdev->nd_devvp); dev_rel(nandfsdev->nd_devvp->v_rdev); /* Free our device info */ cv_destroy(&nandfsdev->nd_sync_cv); mtx_destroy(&nandfsdev->nd_sync_mtx); cv_destroy(&nandfsdev->nd_clean_cv); mtx_destroy(&nandfsdev->nd_clean_mtx); mtx_destroy(&nandfsdev->nd_mutex); lockdestroy(&nandfsdev->nd_seg_const); free(nandfsdev, M_NANDFSMNT); } static int nandfs_check_mounts(struct nandfs_device *nandfsdev, struct mount *mp, struct nandfs_args *args) { struct nandfsmount *nmp; uint64_t last_cno; /* no double-mounting of the same checkpoint */ STAILQ_FOREACH(nmp, &nandfsdev->nd_mounts, nm_next_mount) { if (nmp->nm_mount_args.cpno == args->cpno) return (EBUSY); } /* Allow readonly mounts without questioning here */ if (mp->mnt_flag & MNT_RDONLY) return (0); /* Read/write mount */ STAILQ_FOREACH(nmp, &nandfsdev->nd_mounts, nm_next_mount) { /* Only one RW mount on this device! */ if ((nmp->nm_vfs_mountp->mnt_flag & MNT_RDONLY)==0) return (EROFS); /* RDONLY on last mountpoint is device busy */ last_cno = nmp->nm_nandfsdev->nd_super.s_last_cno; if (nmp->nm_mount_args.cpno == last_cno) return (EBUSY); } /* OK for now */ return (0); } static int nandfs_mount_device(struct vnode *devvp, struct mount *mp, struct nandfs_args *args, struct nandfs_device **nandfsdev_p) { struct nandfs_device *nandfsdev; struct g_provider *pp; struct g_consumer *cp; struct cdev *dev; uint32_t erasesize; int error, size; int ronly; DPRINTF(VOLUMES, ("Mounting NANDFS device\n")); ronly = (mp->mnt_flag & MNT_RDONLY) != 0; /* Look up device in our nandfs_mountpoints */ *nandfsdev_p = NULL; SLIST_FOREACH(nandfsdev, &nandfs_devices, nd_next_device) if (nandfsdev->nd_devvp == devvp) break; if (nandfsdev) { DPRINTF(VOLUMES, ("device already mounted\n")); error = nandfs_check_mounts(nandfsdev, mp, args); if (error) return error; nandfsdev->nd_refcnt++; *nandfsdev_p = nandfsdev; if (!ronly) { DROP_GIANT(); g_topology_lock(); error = g_access(nandfsdev->nd_gconsumer, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); } return (error); } vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); dev = devvp->v_rdev; dev_ref(dev); DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "nandfs", ronly ? 0 : 1); pp = g_dev_getprovider(dev); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0); if (error) { dev_rel(dev); return (error); } nandfsdev = malloc(sizeof(struct nandfs_device), M_NANDFSMNT, M_WAITOK | M_ZERO); /* Initialise */ nandfsdev->nd_refcnt = 1; nandfsdev->nd_devvp = devvp; nandfsdev->nd_syncing = 0; nandfsdev->nd_cleaning = 0; nandfsdev->nd_gconsumer = cp; cv_init(&nandfsdev->nd_sync_cv, "nandfssync"); mtx_init(&nandfsdev->nd_sync_mtx, "nffssyncmtx", NULL, MTX_DEF); cv_init(&nandfsdev->nd_clean_cv, "nandfsclean"); mtx_init(&nandfsdev->nd_clean_mtx, "nffscleanmtx", NULL, MTX_DEF); mtx_init(&nandfsdev->nd_mutex, "nandfsdev lock", NULL, MTX_DEF); lockinit(&nandfsdev->nd_seg_const, PVFS, "nffssegcon", VLKTIMEOUT, LK_CANRECURSE); STAILQ_INIT(&nandfsdev->nd_mounts); nandfsdev->nd_devsize = pp->mediasize; nandfsdev->nd_devblocksize = pp->sectorsize; size = sizeof(erasesize); error = g_io_getattr("NAND::blocksize", nandfsdev->nd_gconsumer, &size, &erasesize); if (error) { DPRINTF(VOLUMES, ("couldn't get erasesize: %d\n", error)); if (error == ENOIOCTL || error == EOPNOTSUPP) { /* * We conclude that this is not NAND storage */ nandfsdev->nd_erasesize = NANDFS_DEF_ERASESIZE; nandfsdev->nd_is_nand = 0; } else { DROP_GIANT(); g_topology_lock(); g_vfs_close(nandfsdev->nd_gconsumer); g_topology_unlock(); PICKUP_GIANT(); dev_rel(dev); free(nandfsdev, M_NANDFSMNT); return (error); } } else { nandfsdev->nd_erasesize = erasesize; nandfsdev->nd_is_nand = 1; } DPRINTF(VOLUMES, ("%s: erasesize %x\n", __func__, nandfsdev->nd_erasesize)); /* Register nandfs_device in list */ SLIST_INSERT_HEAD(&nandfs_devices, nandfsdev, nd_next_device); error = nandfs_mount_base(nandfsdev, mp, args); if (error) { /* Remove all our information */ nandfs_unmount_device(nandfsdev); return (EINVAL); } nandfsdev->nd_maxfilesize = nandfs_get_maxfilesize(nandfsdev); *nandfsdev_p = nandfsdev; DPRINTF(VOLUMES, ("NANDFS device mounted ok\n")); return (0); } static int nandfs_mount_checkpoint(struct nandfsmount *nmp) { struct nandfs_cpfile_header *cphdr; struct nandfs_checkpoint *cp; struct nandfs_inode ifile_inode; struct nandfs_node *cp_node; struct buf *bp; uint64_t ncp, nsn, cpno, fcpno, blocknr, last_cno; uint32_t off, dlen; int cp_per_block, error; cpno = nmp->nm_mount_args.cpno; if (cpno == 0) cpno = nmp->nm_nandfsdev->nd_super.s_last_cno; DPRINTF(VOLUMES, ("%s: trying to mount checkpoint number %"PRIu64"\n", __func__, cpno)); cp_node = nmp->nm_nandfsdev->nd_cp_node; VOP_LOCK(NTOV(cp_node), LK_SHARED); /* Get cpfile header from 1st block of cp file */ error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp); if (error) { brelse(bp); VOP_UNLOCK(NTOV(cp_node), 0); return (error); } cphdr = (struct nandfs_cpfile_header *) bp->b_data; ncp = cphdr->ch_ncheckpoints; nsn = cphdr->ch_nsnapshots; brelse(bp); DPRINTF(VOLUMES, ("mount_nandfs: checkpoint header read in\n")); DPRINTF(VOLUMES, ("\tNumber of checkpoints %"PRIu64"\n", ncp)); DPRINTF(VOLUMES, ("\tNumber of snapshots %"PRIu64"\n", nsn)); /* Read in our specified checkpoint */ dlen = nmp->nm_nandfsdev->nd_fsdata.f_checkpoint_size; cp_per_block = nmp->nm_nandfsdev->nd_blocksize / dlen; fcpno = cpno + NANDFS_CPFILE_FIRST_CHECKPOINT_OFFSET - 1; blocknr = fcpno / cp_per_block; off = (fcpno % cp_per_block) * dlen; error = nandfs_bread(cp_node, blocknr, NOCRED, 0, &bp); if (error) { brelse(bp); VOP_UNLOCK(NTOV(cp_node), 0); printf("mount_nandfs: couldn't read cp block %"PRIu64"\n", fcpno); return (EINVAL); } /* Needs to be a valid checkpoint */ cp = (struct nandfs_checkpoint *) ((uint8_t *) bp->b_data + off); if (cp->cp_flags & NANDFS_CHECKPOINT_INVALID) { printf("mount_nandfs: checkpoint marked invalid\n"); brelse(bp); VOP_UNLOCK(NTOV(cp_node), 0); return (EINVAL); } /* Is this really the checkpoint we want? */ if (cp->cp_cno != cpno) { printf("mount_nandfs: checkpoint file corrupt? " "expected cpno %"PRIu64", found cpno %"PRIu64"\n", cpno, cp->cp_cno); brelse(bp); VOP_UNLOCK(NTOV(cp_node), 0); return (EINVAL); } /* Check if it's a snapshot ! */ last_cno = nmp->nm_nandfsdev->nd_super.s_last_cno; if (cpno != last_cno) { /* Only allow snapshots if not mounting on the last cp */ if ((cp->cp_flags & NANDFS_CHECKPOINT_SNAPSHOT) == 0) { printf( "mount_nandfs: checkpoint %"PRIu64" is not a " "snapshot\n", cpno); brelse(bp); VOP_UNLOCK(NTOV(cp_node), 0); return (EINVAL); } } ifile_inode = cp->cp_ifile_inode; brelse(bp); /* Get ifile inode */ error = nandfs_get_node_raw(nmp->nm_nandfsdev, NULL, NANDFS_IFILE_INO, &ifile_inode, &nmp->nm_ifile_node); if (error) { printf("mount_nandfs: can't read ifile node\n"); VOP_UNLOCK(NTOV(cp_node), 0); return (EINVAL); } NANDFS_SET_SYSTEMFILE(NTOV(nmp->nm_ifile_node)); VOP_UNLOCK(NTOV(cp_node), 0); /* Get root node? */ return (0); } static void free_nandfs_mountinfo(struct mount *mp) { struct nandfsmount *nmp = VFSTONANDFS(mp); if (nmp == NULL) return; free(nmp, M_NANDFSMNT); } void nandfs_wakeup_wait_sync(struct nandfs_device *nffsdev, int reason) { char *reasons[] = { "umount", "vfssync", "bdflush", "fforce", "fsync", "ro_upd" }; DPRINTF(SYNC, ("%s: %s\n", __func__, reasons[reason])); mtx_lock(&nffsdev->nd_sync_mtx); if (nffsdev->nd_syncing) cv_wait(&nffsdev->nd_sync_cv, &nffsdev->nd_sync_mtx); if (reason == SYNCER_UMOUNT) nffsdev->nd_syncer_exit = 1; nffsdev->nd_syncing = 1; wakeup(&nffsdev->nd_syncing); cv_wait(&nffsdev->nd_sync_cv, &nffsdev->nd_sync_mtx); mtx_unlock(&nffsdev->nd_sync_mtx); } static void nandfs_gc_finished(struct nandfs_device *nffsdev, int exit) { int error; mtx_lock(&nffsdev->nd_sync_mtx); nffsdev->nd_syncing = 0; DPRINTF(SYNC, ("%s: cleaner finish\n", __func__)); cv_broadcast(&nffsdev->nd_sync_cv); mtx_unlock(&nffsdev->nd_sync_mtx); if (!exit) { error = tsleep(&nffsdev->nd_syncing, PRIBIO, "-", hz * nandfs_sync_interval); DPRINTF(SYNC, ("%s: cleaner waked up: %d\n", __func__, error)); } } static void nandfs_syncer(struct nandfsmount *nmp) { struct nandfs_device *nffsdev; struct mount *mp; int flags, error; mp = nmp->nm_vfs_mountp; nffsdev = nmp->nm_nandfsdev; tsleep(&nffsdev->nd_syncing, PRIBIO, "-", hz * nandfs_sync_interval); while (!nffsdev->nd_syncer_exit) { DPRINTF(SYNC, ("%s: syncer run\n", __func__)); nffsdev->nd_syncing = 1; flags = (nmp->nm_flags & (NANDFS_FORCE_SYNCER | NANDFS_UMOUNT)); error = nandfs_segment_constructor(nmp, flags); if (error) nandfs_error("%s: error:%d when creating segments\n", __func__, error); nmp->nm_flags &= ~flags; nandfs_gc_finished(nffsdev, 0); } MPASS(nffsdev->nd_cleaner == NULL); error = nandfs_segment_constructor(nmp, NANDFS_FORCE_SYNCER | NANDFS_UMOUNT); if (error) nandfs_error("%s: error:%d when creating segments\n", __func__, error); nandfs_gc_finished(nffsdev, 1); nffsdev->nd_syncer = NULL; MPASS(nffsdev->nd_free_base == NULL); DPRINTF(SYNC, ("%s: exiting\n", __func__)); kthread_exit(); } static int start_syncer(struct nandfsmount *nmp) { int error; MPASS(nmp->nm_nandfsdev->nd_syncer == NULL); DPRINTF(SYNC, ("%s: start syncer\n", __func__)); nmp->nm_nandfsdev->nd_syncer_exit = 0; error = kthread_add((void(*)(void *))nandfs_syncer, nmp, NULL, &nmp->nm_nandfsdev->nd_syncer, 0, 0, "nandfs_syncer"); if (error) printf("nandfs: could not start syncer: %d\n", error); return (error); } static int stop_syncer(struct nandfsmount *nmp) { MPASS(nmp->nm_nandfsdev->nd_syncer != NULL); nandfs_wakeup_wait_sync(nmp->nm_nandfsdev, SYNCER_UMOUNT); DPRINTF(SYNC, ("%s: stop syncer\n", __func__)); return (0); } /* * Mount null layer */ static int nandfs_mount(struct mount *mp) { struct nandfsmount *nmp; struct vnode *devvp; struct nameidata nd; struct vfsoptlist *opts; struct thread *td; char *from; int error = 0, flags; DPRINTF(VOLUMES, ("%s: mp = %p\n", __func__, (void *)mp)); td = curthread; opts = mp->mnt_optnew; if (vfs_filteropt(opts, nandfs_opts)) return (EINVAL); /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) { nmp = VFSTONANDFS(mp); if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) { return (error); } if (!(nmp->nm_ronly) && vfs_flagopt(opts, "ro", NULL, 0)) { vn_start_write(NULL, &mp, V_WAIT); error = VFS_SYNC(mp, MNT_WAIT); if (error) return (error); vn_finished_write(mp); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; nandfs_wakeup_wait_sync(nmp->nm_nandfsdev, SYNCER_ROUPD); error = vflush(mp, 0, flags, td); if (error) return (error); nandfs_stop_cleaner(nmp->nm_nandfsdev); stop_syncer(nmp); DROP_GIANT(); g_topology_lock(); g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); nmp->nm_ronly = 1; } else if ((nmp->nm_ronly) && !vfs_flagopt(opts, "ro", NULL, 0)) { /* * Don't allow read-write snapshots. */ if (nmp->nm_mount_args.cpno != 0) return (EROFS); /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ devvp = nmp->nm_nandfsdev->nd_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) { error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0); return (error); } } VOP_UNLOCK(devvp, 0); DROP_GIANT(); g_topology_lock(); error = g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); error = start_syncer(nmp); if (error == 0) error = nandfs_start_cleaner(nmp->nm_nandfsdev); if (error) { DROP_GIANT(); g_topology_lock(); g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); return (error); } nmp->nm_ronly = 0; } return (0); } from = vfs_getopts(opts, "from", &error); if (error) return (error); /* * Find device node */ NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, from, curthread); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); devvp = nd.ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* Check the access rights on the mount device */ error = VOP_ACCESS(devvp, VREAD, curthread->td_ucred, curthread); if (error) error = priv_check(curthread, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } vfs_getnewfsid(mp); error = nandfs_mountfs(devvp, mp); if (error) return (error); vfs_mountedfrom(mp, from); return (0); } static int nandfs_mountfs(struct vnode *devvp, struct mount *mp) { struct nandfsmount *nmp = NULL; struct nandfs_args *args = NULL; struct nandfs_device *nandfsdev; char *from; int error, ronly; char *cpno; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; VOP_UNLOCK(devvp, 0); if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; from = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) goto error; error = vfs_getopt(mp->mnt_optnew, "snap", (void **)&cpno, NULL); if (error == ENOENT) cpno = NULL; else if (error) goto error; args = (struct nandfs_args *)malloc(sizeof(struct nandfs_args), M_NANDFSMNT, M_WAITOK | M_ZERO); if (cpno != NULL) args->cpno = strtoul(cpno, (char **)NULL, 10); else args->cpno = 0; args->fspec = from; if (args->cpno != 0 && !ronly) { error = EROFS; goto error; } printf("WARNING: NANDFS is considered to be a highly experimental " "feature in FreeBSD.\n"); error = nandfs_mount_device(devvp, mp, args, &nandfsdev); if (error) goto error; nmp = (struct nandfsmount *) malloc(sizeof(struct nandfsmount), M_NANDFSMNT, M_WAITOK | M_ZERO); mp->mnt_data = nmp; nmp->nm_vfs_mountp = mp; nmp->nm_ronly = ronly; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; + mp->mnt_kern_flag |= MNTK_USES_BCACHE; MNT_IUNLOCK(mp); nmp->nm_nandfsdev = nandfsdev; /* Add our mountpoint */ STAILQ_INSERT_TAIL(&nandfsdev->nd_mounts, nmp, nm_next_mount); if (args->cpno > nandfsdev->nd_last_cno) { printf("WARNING: supplied checkpoint number (%jd) is greater " "than last known checkpoint on filesystem (%jd). Mounting" " checkpoint %jd\n", (uintmax_t)args->cpno, (uintmax_t)nandfsdev->nd_last_cno, (uintmax_t)nandfsdev->nd_last_cno); args->cpno = nandfsdev->nd_last_cno; } /* Setting up other parameters */ nmp->nm_mount_args = *args; free(args, M_NANDFSMNT); error = nandfs_mount_checkpoint(nmp); if (error) { nandfs_unmount(mp, MNT_FORCE); goto unmounted; } if (!ronly) { error = start_syncer(nmp); if (error == 0) error = nandfs_start_cleaner(nmp->nm_nandfsdev); if (error) nandfs_unmount(mp, MNT_FORCE); } return (0); error: if (args != NULL) free(args, M_NANDFSMNT); if (nmp != NULL) { free(nmp, M_NANDFSMNT); mp->mnt_data = NULL; } unmounted: return (error); } static int nandfs_unmount(struct mount *mp, int mntflags) { struct nandfs_device *nandfsdev; struct nandfsmount *nmp; int error; int flags = 0; DPRINTF(VOLUMES, ("%s: mp = %p\n", __func__, (void *)mp)); if (mntflags & MNT_FORCE) flags |= FORCECLOSE; nmp = mp->mnt_data; nandfsdev = nmp->nm_nandfsdev; error = vflush(mp, 0, flags | SKIPSYSTEM, curthread); if (error) return (error); if (!(nmp->nm_ronly)) { nandfs_stop_cleaner(nandfsdev); stop_syncer(nmp); } if (nmp->nm_ifile_node) NANDFS_UNSET_SYSTEMFILE(NTOV(nmp->nm_ifile_node)); /* Remove our mount point */ STAILQ_REMOVE(&nandfsdev->nd_mounts, nmp, nandfsmount, nm_next_mount); /* Unmount the device itself when we're the last one */ nandfs_unmount_device(nandfsdev); free_nandfs_mountinfo(mp); /* * Finally, throw away the null_mount structure */ mp->mnt_data = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (0); } static int nandfs_statfs(struct mount *mp, struct statfs *sbp) { struct nandfsmount *nmp; struct nandfs_device *nandfsdev; struct nandfs_fsdata *fsdata; struct nandfs_super_block *sb; struct nandfs_block_group_desc *groups; struct nandfs_node *ifile; struct nandfs_mdt *mdt; struct buf *bp; int i, error; uint32_t entries_per_group; uint64_t files = 0; nmp = mp->mnt_data; nandfsdev = nmp->nm_nandfsdev; fsdata = &nandfsdev->nd_fsdata; sb = &nandfsdev->nd_super; ifile = nmp->nm_ifile_node; mdt = &nandfsdev->nd_ifile_mdt; entries_per_group = mdt->entries_per_group; VOP_LOCK(NTOV(ifile), LK_SHARED); error = nandfs_bread(ifile, 0, NOCRED, 0, &bp); if (error) { brelse(bp); VOP_UNLOCK(NTOV(ifile), 0); return (error); } groups = (struct nandfs_block_group_desc *)bp->b_data; for (i = 0; i < mdt->groups_per_desc_block; i++) files += (entries_per_group - groups[i].bg_nfrees); brelse(bp); VOP_UNLOCK(NTOV(ifile), 0); sbp->f_bsize = nandfsdev->nd_blocksize; sbp->f_iosize = sbp->f_bsize; sbp->f_blocks = fsdata->f_blocks_per_segment * fsdata->f_nsegments; sbp->f_bfree = sb->s_free_blocks_count; sbp->f_bavail = sbp->f_bfree; sbp->f_files = files; sbp->f_ffree = 0; return (0); } static int nandfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct nandfsmount *nmp = VFSTONANDFS(mp); struct nandfs_node *node; int error; error = nandfs_get_node(nmp, NANDFS_ROOT_INO, &node); if (error) return (error); KASSERT(NTOV(node)->v_vflag & VV_ROOT, ("root_vp->v_vflag & VV_ROOT")); *vpp = NTOV(node); return (error); } static int nandfs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) { struct nandfsmount *nmp = VFSTONANDFS(mp); struct nandfs_node *node; int error; error = nandfs_get_node(nmp, ino, &node); if (node) *vpp = NTOV(node); return (error); } static int nandfs_sync(struct mount *mp, int waitfor) { struct nandfsmount *nmp = VFSTONANDFS(mp); DPRINTF(SYNC, ("%s: mp %p waitfor %d\n", __func__, mp, waitfor)); /* * XXX: A hack to be removed soon */ if (waitfor == MNT_LAZY) return (0); if (waitfor == MNT_SUSPEND) return (0); nandfs_wakeup_wait_sync(nmp->nm_nandfsdev, SYNCER_VFS_SYNC); return (0); } static struct vfsops nandfs_vfsops = { .vfs_init = nandfs_init, .vfs_mount = nandfs_mount, .vfs_root = nandfs_root, .vfs_statfs = nandfs_statfs, .vfs_uninit = nandfs_uninit, .vfs_unmount = nandfs_unmount, .vfs_vget = nandfs_vget, .vfs_sync = nandfs_sync, }; VFS_SET(nandfs_vfsops, nandfs, VFCF_LOOPBACK); Index: stable/10/sys/fs/nfsclient/nfs_clvfsops.c =================================================================== --- stable/10/sys/fs/nfsclient/nfs_clvfsops.c (revision 282269) +++ stable/10/sys/fs/nfsclient/nfs_clvfsops.c (revision 282270) @@ -1,1871 +1,1872 @@ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from nfs_vfsops.c 8.12 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_nfsroot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(nfscl, "NFSv4 client"); extern int nfscl_ticks; extern struct timeval nfsboottime; extern struct nfsstats newnfsstats; extern int nfsrv_useacl; extern int nfscl_debuglevel; extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON]; extern struct mtx ncl_iod_mutex; NFSCLSTATEMUTEX; MALLOC_DEFINE(M_NEWNFSREQ, "newnfsclient_req", "New NFS request header"); MALLOC_DEFINE(M_NEWNFSMNT, "newnfsmnt", "New NFS mount struct"); SYSCTL_DECL(_vfs_nfs); static int nfs_ip_paranoia = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_ip_paranoia, CTLFLAG_RW, &nfs_ip_paranoia, 0, ""); static int nfs_tprintf_initial_delay = NFS_TPRINTF_INITIAL_DELAY; SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_INITIAL_DELAY, downdelayinitial, CTLFLAG_RW, &nfs_tprintf_initial_delay, 0, ""); /* how long between console messages "nfs server foo not responding" */ static int nfs_tprintf_delay = NFS_TPRINTF_DELAY; SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_DELAY, downdelayinterval, CTLFLAG_RW, &nfs_tprintf_delay, 0, ""); static int nfs_mountroot(struct mount *); static void nfs_sec_name(char *, int *); static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp, const char *, struct ucred *, struct thread *); static int mountnfs(struct nfs_args *, struct mount *, struct sockaddr *, char *, u_char *, int, u_char *, int, u_char *, int, struct vnode **, struct ucred *, struct thread *, int, int, int); static void nfs_getnlminfo(struct vnode *, uint8_t *, size_t *, struct sockaddr_storage *, int *, off_t *, struct timeval *); static vfs_mount_t nfs_mount; static vfs_cmount_t nfs_cmount; static vfs_unmount_t nfs_unmount; static vfs_root_t nfs_root; static vfs_statfs_t nfs_statfs; static vfs_sync_t nfs_sync; static vfs_sysctl_t nfs_sysctl; static vfs_purge_t nfs_purge; /* * nfs vfs operations. */ static struct vfsops nfs_vfsops = { .vfs_init = ncl_init, .vfs_mount = nfs_mount, .vfs_cmount = nfs_cmount, .vfs_root = nfs_root, .vfs_statfs = nfs_statfs, .vfs_sync = nfs_sync, .vfs_uninit = ncl_uninit, .vfs_unmount = nfs_unmount, .vfs_sysctl = nfs_sysctl, .vfs_purge = nfs_purge, }; VFS_SET(nfs_vfsops, nfs, VFCF_NETWORK | VFCF_SBDRY); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_VERSION(nfs, 1); MODULE_DEPEND(nfs, nfscommon, 1, 1, 1); MODULE_DEPEND(nfs, krpc, 1, 1, 1); MODULE_DEPEND(nfs, nfssvc, 1, 1, 1); MODULE_DEPEND(nfs, nfslock, 1, 1, 1); /* * This structure is now defined in sys/nfs/nfs_diskless.c so that it * can be shared by both NFS clients. It is declared here so that it * will be defined for kernels built without NFS_ROOT, although it * isn't used in that case. */ #if !defined(NFS_ROOT) && !defined(NFSCLIENT) struct nfs_diskless nfs_diskless = { { { 0 } } }; struct nfsv3_diskless nfsv3_diskless = { { { 0 } } }; int nfs_diskless_valid = 0; #endif SYSCTL_INT(_vfs_nfs, OID_AUTO, diskless_valid, CTLFLAG_RD, &nfs_diskless_valid, 0, "Has the diskless struct been filled correctly"); SYSCTL_STRING(_vfs_nfs, OID_AUTO, diskless_rootpath, CTLFLAG_RD, nfsv3_diskless.root_hostnam, 0, "Path to nfs root"); SYSCTL_OPAQUE(_vfs_nfs, OID_AUTO, diskless_rootaddr, CTLFLAG_RD, &nfsv3_diskless.root_saddr, sizeof(nfsv3_diskless.root_saddr), "%Ssockaddr_in", "Diskless root nfs address"); void newnfsargs_ntoh(struct nfs_args *); static int nfs_mountdiskless(char *, struct sockaddr_in *, struct nfs_args *, struct thread *, struct vnode **, struct mount *); static void nfs_convert_diskless(void); static void nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs); int newnfs_iosize(struct nfsmount *nmp) { int iosize, maxio; /* First, set the upper limit for iosize */ if (nmp->nm_flag & NFSMNT_NFSV4) { maxio = NFS_MAXBSIZE; } else if (nmp->nm_flag & NFSMNT_NFSV3) { if (nmp->nm_sotype == SOCK_DGRAM) maxio = NFS_MAXDGRAMDATA; else maxio = NFS_MAXBSIZE; } else { maxio = NFS_V2MAXDATA; } if (nmp->nm_rsize > maxio || nmp->nm_rsize == 0) nmp->nm_rsize = maxio; if (nmp->nm_rsize > MAXBSIZE) nmp->nm_rsize = MAXBSIZE; if (nmp->nm_readdirsize > maxio || nmp->nm_readdirsize == 0) nmp->nm_readdirsize = maxio; if (nmp->nm_readdirsize > nmp->nm_rsize) nmp->nm_readdirsize = nmp->nm_rsize; if (nmp->nm_wsize > maxio || nmp->nm_wsize == 0) nmp->nm_wsize = maxio; if (nmp->nm_wsize > MAXBSIZE) nmp->nm_wsize = MAXBSIZE; /* * Calculate the size used for io buffers. Use the larger * of the two sizes to minimise nfs requests but make sure * that it is at least one VM page to avoid wasting buffer * space. */ iosize = imax(nmp->nm_rsize, nmp->nm_wsize); iosize = imax(iosize, PAGE_SIZE); nmp->nm_mountp->mnt_stat.f_iosize = iosize; return (iosize); } static void nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs) { args->version = NFS_ARGSVERSION; args->addr = oargs->addr; args->addrlen = oargs->addrlen; args->sotype = oargs->sotype; args->proto = oargs->proto; args->fh = oargs->fh; args->fhsize = oargs->fhsize; args->flags = oargs->flags; args->wsize = oargs->wsize; args->rsize = oargs->rsize; args->readdirsize = oargs->readdirsize; args->timeo = oargs->timeo; args->retrans = oargs->retrans; args->readahead = oargs->readahead; args->hostname = oargs->hostname; } static void nfs_convert_diskless(void) { bcopy(&nfs_diskless.myif, &nfsv3_diskless.myif, sizeof(struct ifaliasreq)); bcopy(&nfs_diskless.mygateway, &nfsv3_diskless.mygateway, sizeof(struct sockaddr_in)); nfs_convert_oargs(&nfsv3_diskless.root_args,&nfs_diskless.root_args); if (nfsv3_diskless.root_args.flags & NFSMNT_NFSV3) { nfsv3_diskless.root_fhsize = NFSX_MYFH; bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_MYFH); } else { nfsv3_diskless.root_fhsize = NFSX_V2FH; bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V2FH); } bcopy(&nfs_diskless.root_saddr,&nfsv3_diskless.root_saddr, sizeof(struct sockaddr_in)); bcopy(nfs_diskless.root_hostnam, nfsv3_diskless.root_hostnam, MNAMELEN); nfsv3_diskless.root_time = nfs_diskless.root_time; bcopy(nfs_diskless.my_hostnam, nfsv3_diskless.my_hostnam, MAXHOSTNAMELEN); nfs_diskless_valid = 3; } /* * nfs statfs call */ static int nfs_statfs(struct mount *mp, struct statfs *sbp) { struct vnode *vp; struct thread *td; struct nfsmount *nmp = VFSTONFS(mp); struct nfsvattr nfsva; struct nfsfsinfo fs; struct nfsstatfs sb; int error = 0, attrflag, gotfsinfo = 0, ret; struct nfsnode *np; td = curthread; error = vfs_busy(mp, MBF_NOWAIT); if (error) return (error); error = ncl_nget(mp, nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE); if (error) { vfs_unbusy(mp); return (error); } vp = NFSTOV(np); mtx_lock(&nmp->nm_mtx); if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp)) { mtx_unlock(&nmp->nm_mtx); error = nfsrpc_fsinfo(vp, &fs, td->td_ucred, td, &nfsva, &attrflag, NULL); if (!error) gotfsinfo = 1; } else mtx_unlock(&nmp->nm_mtx); if (!error) error = nfsrpc_statfs(vp, &sb, &fs, td->td_ucred, td, &nfsva, &attrflag, NULL); if (error != 0) NFSCL_DEBUG(2, "statfs=%d\n", error); if (attrflag == 0) { ret = nfsrpc_getattrnovp(nmp, nmp->nm_fh, nmp->nm_fhsize, 1, td->td_ucred, td, &nfsva, NULL, NULL); if (ret) { /* * Just set default values to get things going. */ NFSBZERO((caddr_t)&nfsva, sizeof (struct nfsvattr)); nfsva.na_vattr.va_type = VDIR; nfsva.na_vattr.va_mode = 0777; nfsva.na_vattr.va_nlink = 100; nfsva.na_vattr.va_uid = (uid_t)0; nfsva.na_vattr.va_gid = (gid_t)0; nfsva.na_vattr.va_fileid = 2; nfsva.na_vattr.va_gen = 1; nfsva.na_vattr.va_blocksize = NFS_FABLKSIZE; nfsva.na_vattr.va_size = 512 * 1024; } } (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (!error) { mtx_lock(&nmp->nm_mtx); if (gotfsinfo || (nmp->nm_flag & NFSMNT_NFSV4)) nfscl_loadfsinfo(nmp, &fs); nfscl_loadsbinfo(nmp, &sb, sbp); sbp->f_iosize = newnfs_iosize(nmp); mtx_unlock(&nmp->nm_mtx); if (sbp != &mp->mnt_stat) { bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } strncpy(&sbp->f_fstypename[0], mp->mnt_vfc->vfc_name, MFSNAMELEN); } else if (NFS_ISV4(vp)) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); } vput(vp); vfs_unbusy(mp); return (error); } /* * nfs version 3 fsinfo rpc call */ int ncl_fsinfo(struct nfsmount *nmp, struct vnode *vp, struct ucred *cred, struct thread *td) { struct nfsfsinfo fs; struct nfsvattr nfsva; int error, attrflag; error = nfsrpc_fsinfo(vp, &fs, cred, td, &nfsva, &attrflag, NULL); if (!error) { if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); mtx_lock(&nmp->nm_mtx); nfscl_loadfsinfo(nmp, &fs); mtx_unlock(&nmp->nm_mtx); } return (error); } /* * Mount a remote root fs via. nfs. This depends on the info in the * nfs_diskless structure that has been filled in properly by some primary * bootstrap. * It goes something like this: * - do enough of "ifconfig" by calling ifioctl() so that the system * can talk to the server * - If nfs_diskless.mygateway is filled in, use that address as * a default gateway. * - build the rootfs mount point and call mountnfs() to do the rest. * * It is assumed to be safe to read, modify, and write the nfsv3_diskless * structure, as well as other global NFS client variables here, as * nfs_mountroot() will be called once in the boot before any other NFS * client activity occurs. */ static int nfs_mountroot(struct mount *mp) { struct thread *td = curthread; struct nfsv3_diskless *nd = &nfsv3_diskless; struct socket *so; struct vnode *vp; struct ifreq ir; int error; u_long l; char buf[128]; char *cp; #if defined(BOOTP_NFSROOT) && defined(BOOTP) bootpc_init(); /* use bootp to get nfs_diskless filled in */ #elif defined(NFS_ROOT) nfs_setup_diskless(); #endif if (nfs_diskless_valid == 0) return (-1); if (nfs_diskless_valid == 1) nfs_convert_diskless(); /* * XXX splnet, so networks will receive... */ splnet(); /* * Do enough of ifconfig(8) so that the critical net interface can * talk to the server. */ error = socreate(nd->myif.ifra_addr.sa_family, &so, nd->root_args.sotype, 0, td->td_ucred, td); if (error) panic("nfs_mountroot: socreate(%04x): %d", nd->myif.ifra_addr.sa_family, error); #if 0 /* XXX Bad idea */ /* * We might not have been told the right interface, so we pass * over the first ten interfaces of the same kind, until we get * one of them configured. */ for (i = strlen(nd->myif.ifra_name) - 1; nd->myif.ifra_name[i] >= '0' && nd->myif.ifra_name[i] <= '9'; nd->myif.ifra_name[i] ++) { error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td); if(!error) break; } #endif error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td); if (error) panic("nfs_mountroot: SIOCAIFADDR: %d", error); if ((cp = getenv("boot.netif.mtu")) != NULL) { ir.ifr_mtu = strtol(cp, NULL, 10); bcopy(nd->myif.ifra_name, ir.ifr_name, IFNAMSIZ); freeenv(cp); error = ifioctl(so, SIOCSIFMTU, (caddr_t)&ir, td); if (error) printf("nfs_mountroot: SIOCSIFMTU: %d", error); } soclose(so); /* * If the gateway field is filled in, set it as the default route. * Note that pxeboot will set a default route of 0 if the route * is not set by the DHCP server. Check also for a value of 0 * to avoid panicking inappropriately in that situation. */ if (nd->mygateway.sin_len != 0 && nd->mygateway.sin_addr.s_addr != 0) { struct sockaddr_in mask, sin; bzero((caddr_t)&mask, sizeof(mask)); sin = mask; sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); /* XXX MRT use table 0 for this sort of thing */ CURVNET_SET(TD_TO_VNET(td)); error = rtrequest_fib(RTM_ADD, (struct sockaddr *)&sin, (struct sockaddr *)&nd->mygateway, (struct sockaddr *)&mask, RTF_UP | RTF_GATEWAY, NULL, RT_DEFAULT_FIB); CURVNET_RESTORE(); if (error) panic("nfs_mountroot: RTM_ADD: %d", error); } /* * Create the rootfs mount point. */ nd->root_args.fh = nd->root_fh; nd->root_args.fhsize = nd->root_fhsize; l = ntohl(nd->root_saddr.sin_addr.s_addr); snprintf(buf, sizeof(buf), "%ld.%ld.%ld.%ld:%s", (l >> 24) & 0xff, (l >> 16) & 0xff, (l >> 8) & 0xff, (l >> 0) & 0xff, nd->root_hostnam); printf("NFS ROOT: %s\n", buf); nd->root_args.hostname = buf; if ((error = nfs_mountdiskless(buf, &nd->root_saddr, &nd->root_args, td, &vp, mp)) != 0) { return (error); } /* * This is not really an nfs issue, but it is much easier to * set hostname here and then let the "/etc/rc.xxx" files * mount the right /var based upon its preset value. */ mtx_lock(&prison0.pr_mtx); strlcpy(prison0.pr_hostname, nd->my_hostnam, sizeof(prison0.pr_hostname)); mtx_unlock(&prison0.pr_mtx); inittodr(ntohl(nd->root_time)); return (0); } /* * Internal version of mount system call for diskless setup. */ static int nfs_mountdiskless(char *path, struct sockaddr_in *sin, struct nfs_args *args, struct thread *td, struct vnode **vpp, struct mount *mp) { struct sockaddr *nam; int dirlen, error; char *dirpath; /* * Find the directory path in "path", which also has the server's * name/ip address in it. */ dirpath = strchr(path, ':'); if (dirpath != NULL) dirlen = strlen(++dirpath); else dirlen = 0; nam = sodupsockaddr((struct sockaddr *)sin, M_WAITOK); if ((error = mountnfs(args, mp, nam, path, NULL, 0, dirpath, dirlen, NULL, 0, vpp, td->td_ucred, td, NFS_DEFAULT_NAMETIMEO, NFS_DEFAULT_NEGNAMETIMEO, 0)) != 0) { printf("nfs_mountroot: mount %s on /: %d\n", path, error); return (error); } return (0); } static void nfs_sec_name(char *sec, int *flagsp) { if (!strcmp(sec, "krb5")) *flagsp |= NFSMNT_KERB; else if (!strcmp(sec, "krb5i")) *flagsp |= (NFSMNT_KERB | NFSMNT_INTEGRITY); else if (!strcmp(sec, "krb5p")) *flagsp |= (NFSMNT_KERB | NFSMNT_PRIVACY); } static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp, const char *hostname, struct ucred *cred, struct thread *td) { int s; int adjsock; char *p; s = splnet(); /* * Set read-only flag if requested; otherwise, clear it if this is * an update. If this is not an update, then either the read-only * flag is already clear, or this is a root mount and it was set * intentionally at some previous point. */ if (vfs_getopt(mp->mnt_optnew, "ro", NULL, NULL) == 0) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); } else if (mp->mnt_flag & MNT_UPDATE) { MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); } /* * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes * no sense in that context. Also, set up appropriate retransmit * and soft timeout behavior. */ if (argp->sotype == SOCK_STREAM) { nmp->nm_flag &= ~NFSMNT_NOCONN; nmp->nm_timeo = NFS_MAXTIMEO; if ((argp->flags & NFSMNT_NFSV4) != 0) nmp->nm_retry = INT_MAX; else nmp->nm_retry = NFS_RETRANS_TCP; } /* Also clear RDIRPLUS if NFSv2, it crashes some servers */ if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) { argp->flags &= ~NFSMNT_RDIRPLUS; nmp->nm_flag &= ~NFSMNT_RDIRPLUS; } /* Re-bind if rsrvd port requested and wasn't on one */ adjsock = !(nmp->nm_flag & NFSMNT_RESVPORT) && (argp->flags & NFSMNT_RESVPORT); /* Also re-bind if we're switching to/from a connected UDP socket */ adjsock |= ((nmp->nm_flag & NFSMNT_NOCONN) != (argp->flags & NFSMNT_NOCONN)); /* Update flags atomically. Don't change the lock bits. */ nmp->nm_flag = argp->flags | nmp->nm_flag; splx(s); if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; if (nmp->nm_timeo < NFS_MINTIMEO) nmp->nm_timeo = NFS_MINTIMEO; else if (nmp->nm_timeo > NFS_MAXTIMEO) nmp->nm_timeo = NFS_MAXTIMEO; } if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { nmp->nm_retry = argp->retrans; if (nmp->nm_retry > NFS_MAXREXMIT) nmp->nm_retry = NFS_MAXREXMIT; } if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { nmp->nm_wsize = argp->wsize; /* * Clip at the power of 2 below the size. There is an * issue (not isolated) that causes intermittent page * faults if this is not done. */ if (nmp->nm_wsize > NFS_FABLKSIZE) nmp->nm_wsize = 1 << (fls(nmp->nm_wsize) - 1); else nmp->nm_wsize = NFS_FABLKSIZE; } if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { nmp->nm_rsize = argp->rsize; /* * Clip at the power of 2 below the size. There is an * issue (not isolated) that causes intermittent page * faults if this is not done. */ if (nmp->nm_rsize > NFS_FABLKSIZE) nmp->nm_rsize = 1 << (fls(nmp->nm_rsize) - 1); else nmp->nm_rsize = NFS_FABLKSIZE; } if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) { nmp->nm_readdirsize = argp->readdirsize; } if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0) nmp->nm_acregmin = argp->acregmin; else nmp->nm_acregmin = NFS_MINATTRTIMO; if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0) nmp->nm_acregmax = argp->acregmax; else nmp->nm_acregmax = NFS_MAXATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0) nmp->nm_acdirmin = argp->acdirmin; else nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0) nmp->nm_acdirmax = argp->acdirmax; else nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; if (nmp->nm_acdirmin > nmp->nm_acdirmax) nmp->nm_acdirmin = nmp->nm_acdirmax; if (nmp->nm_acregmin > nmp->nm_acregmax) nmp->nm_acregmin = nmp->nm_acregmax; if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0) { if (argp->readahead <= NFS_MAXRAHEAD) nmp->nm_readahead = argp->readahead; else nmp->nm_readahead = NFS_MAXRAHEAD; } if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) { if (argp->wcommitsize < nmp->nm_wsize) nmp->nm_wcommitsize = nmp->nm_wsize; else nmp->nm_wcommitsize = argp->wcommitsize; } adjsock |= ((nmp->nm_sotype != argp->sotype) || (nmp->nm_soproto != argp->proto)); if (nmp->nm_client != NULL && adjsock) { int haslock = 0, error = 0; if (nmp->nm_sotype == SOCK_STREAM) { error = newnfs_sndlock(&nmp->nm_sockreq.nr_lock); if (!error) haslock = 1; } if (!error) { newnfs_disconnect(&nmp->nm_sockreq); if (haslock) newnfs_sndunlock(&nmp->nm_sockreq.nr_lock); nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; if (nmp->nm_sotype == SOCK_DGRAM) while (newnfs_connect(nmp, &nmp->nm_sockreq, cred, td, 0)) { printf("newnfs_args: retrying connect\n"); (void) nfs_catnap(PSOCK, 0, "newnfscon"); } } } else { nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; } if (hostname != NULL) { strlcpy(nmp->nm_hostname, hostname, sizeof(nmp->nm_hostname)); p = strchr(nmp->nm_hostname, ':'); if (p != NULL) *p = '\0'; } } static const char *nfs_opts[] = { "from", "nfs_args", "noac", "noatime", "noexec", "suiddir", "nosuid", "nosymfollow", "union", "noclusterr", "noclusterw", "multilabel", "acls", "force", "update", "async", "noconn", "nolockd", "conn", "lockd", "intr", "rdirplus", "readdirsize", "soft", "hard", "mntudp", "tcp", "udp", "wsize", "rsize", "retrans", "actimeo", "acregmin", "acregmax", "acdirmin", "acdirmax", "resvport", "readahead", "hostname", "timeo", "timeout", "addr", "fh", "nfsv3", "sec", "principal", "nfsv4", "gssname", "allgssname", "dirpath", "minorversion", "nametimeo", "negnametimeo", "nocto", "noncontigwr", "pnfs", "wcommitsize", NULL }; /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the sockargs() call because sockargs() allocates an mbuf and * an error after that means that I have to release the mbuf. */ /* ARGSUSED */ static int nfs_mount(struct mount *mp) { struct nfs_args args = { .version = NFS_ARGSVERSION, .addr = NULL, .addrlen = sizeof (struct sockaddr_in), .sotype = SOCK_STREAM, .proto = 0, .fh = NULL, .fhsize = 0, .flags = NFSMNT_RESVPORT, .wsize = NFS_WSIZE, .rsize = NFS_RSIZE, .readdirsize = NFS_READDIRSIZE, .timeo = 10, .retrans = NFS_RETRANS, .readahead = NFS_DEFRAHEAD, .wcommitsize = 0, /* was: NQ_DEFLEASE */ .hostname = NULL, .acregmin = NFS_MINATTRTIMO, .acregmax = NFS_MAXATTRTIMO, .acdirmin = NFS_MINDIRATTRTIMO, .acdirmax = NFS_MAXDIRATTRTIMO, }; int error = 0, ret, len; struct sockaddr *nam = NULL; struct vnode *vp; struct thread *td; char hst[MNAMELEN]; u_char nfh[NFSX_FHMAX], krbname[100], dirpath[100], srvkrbname[100]; char *opt, *name, *secname; int nametimeo = NFS_DEFAULT_NAMETIMEO; int negnametimeo = NFS_DEFAULT_NEGNAMETIMEO; int minvers = 0; int dirlen, has_nfs_args_opt, krbnamelen, srvkrbnamelen; size_t hstlen; has_nfs_args_opt = 0; if (vfs_filteropt(mp->mnt_optnew, nfs_opts)) { error = EINVAL; goto out; } td = curthread; if ((mp->mnt_flag & (MNT_ROOTFS | MNT_UPDATE)) == MNT_ROOTFS) { error = nfs_mountroot(mp); goto out; } nfscl_init(); /* * The old mount_nfs program passed the struct nfs_args * from userspace to kernel. The new mount_nfs program * passes string options via nmount() from userspace to kernel * and we populate the struct nfs_args in the kernel. */ if (vfs_getopt(mp->mnt_optnew, "nfs_args", NULL, NULL) == 0) { error = vfs_copyopt(mp->mnt_optnew, "nfs_args", &args, sizeof(args)); if (error != 0) goto out; if (args.version != NFS_ARGSVERSION) { error = EPROGMISMATCH; goto out; } has_nfs_args_opt = 1; } /* Handle the new style options. */ if (vfs_getopt(mp->mnt_optnew, "noac", NULL, NULL) == 0) { args.acdirmin = args.acdirmax = args.acregmin = args.acregmax = 0; args.flags |= NFSMNT_ACDIRMIN | NFSMNT_ACDIRMAX | NFSMNT_ACREGMIN | NFSMNT_ACREGMAX; } if (vfs_getopt(mp->mnt_optnew, "noconn", NULL, NULL) == 0) args.flags |= NFSMNT_NOCONN; if (vfs_getopt(mp->mnt_optnew, "conn", NULL, NULL) == 0) args.flags &= ~NFSMNT_NOCONN; if (vfs_getopt(mp->mnt_optnew, "nolockd", NULL, NULL) == 0) args.flags |= NFSMNT_NOLOCKD; if (vfs_getopt(mp->mnt_optnew, "lockd", NULL, NULL) == 0) args.flags &= ~NFSMNT_NOLOCKD; if (vfs_getopt(mp->mnt_optnew, "intr", NULL, NULL) == 0) args.flags |= NFSMNT_INT; if (vfs_getopt(mp->mnt_optnew, "rdirplus", NULL, NULL) == 0) args.flags |= NFSMNT_RDIRPLUS; if (vfs_getopt(mp->mnt_optnew, "resvport", NULL, NULL) == 0) args.flags |= NFSMNT_RESVPORT; if (vfs_getopt(mp->mnt_optnew, "noresvport", NULL, NULL) == 0) args.flags &= ~NFSMNT_RESVPORT; if (vfs_getopt(mp->mnt_optnew, "soft", NULL, NULL) == 0) args.flags |= NFSMNT_SOFT; if (vfs_getopt(mp->mnt_optnew, "hard", NULL, NULL) == 0) args.flags &= ~NFSMNT_SOFT; if (vfs_getopt(mp->mnt_optnew, "mntudp", NULL, NULL) == 0) args.sotype = SOCK_DGRAM; if (vfs_getopt(mp->mnt_optnew, "udp", NULL, NULL) == 0) args.sotype = SOCK_DGRAM; if (vfs_getopt(mp->mnt_optnew, "tcp", NULL, NULL) == 0) args.sotype = SOCK_STREAM; if (vfs_getopt(mp->mnt_optnew, "nfsv3", NULL, NULL) == 0) args.flags |= NFSMNT_NFSV3; if (vfs_getopt(mp->mnt_optnew, "nfsv4", NULL, NULL) == 0) { args.flags |= NFSMNT_NFSV4; args.sotype = SOCK_STREAM; } if (vfs_getopt(mp->mnt_optnew, "allgssname", NULL, NULL) == 0) args.flags |= NFSMNT_ALLGSSNAME; if (vfs_getopt(mp->mnt_optnew, "nocto", NULL, NULL) == 0) args.flags |= NFSMNT_NOCTO; if (vfs_getopt(mp->mnt_optnew, "noncontigwr", NULL, NULL) == 0) args.flags |= NFSMNT_NONCONTIGWR; if (vfs_getopt(mp->mnt_optnew, "pnfs", NULL, NULL) == 0) args.flags |= NFSMNT_PNFS; if (vfs_getopt(mp->mnt_optnew, "readdirsize", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal readdirsize"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.readdirsize); if (ret != 1 || args.readdirsize <= 0) { vfs_mount_error(mp, "illegal readdirsize: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_READDIRSIZE; } if (vfs_getopt(mp->mnt_optnew, "readahead", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal readahead"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.readahead); if (ret != 1 || args.readahead <= 0) { vfs_mount_error(mp, "illegal readahead: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_READAHEAD; } if (vfs_getopt(mp->mnt_optnew, "wsize", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal wsize"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.wsize); if (ret != 1 || args.wsize <= 0) { vfs_mount_error(mp, "illegal wsize: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_WSIZE; } if (vfs_getopt(mp->mnt_optnew, "rsize", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal rsize"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.rsize); if (ret != 1 || args.rsize <= 0) { vfs_mount_error(mp, "illegal wsize: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_RSIZE; } if (vfs_getopt(mp->mnt_optnew, "retrans", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal retrans"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.retrans); if (ret != 1 || args.retrans <= 0) { vfs_mount_error(mp, "illegal retrans: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_RETRANS; } if (vfs_getopt(mp->mnt_optnew, "actimeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acregmin); if (ret != 1 || args.acregmin < 0) { vfs_mount_error(mp, "illegal actimeo: %s", opt); error = EINVAL; goto out; } args.acdirmin = args.acdirmax = args.acregmax = args.acregmin; args.flags |= NFSMNT_ACDIRMIN | NFSMNT_ACDIRMAX | NFSMNT_ACREGMIN | NFSMNT_ACREGMAX; } if (vfs_getopt(mp->mnt_optnew, "acregmin", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acregmin); if (ret != 1 || args.acregmin < 0) { vfs_mount_error(mp, "illegal acregmin: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACREGMIN; } if (vfs_getopt(mp->mnt_optnew, "acregmax", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acregmax); if (ret != 1 || args.acregmax < 0) { vfs_mount_error(mp, "illegal acregmax: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACREGMAX; } if (vfs_getopt(mp->mnt_optnew, "acdirmin", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acdirmin); if (ret != 1 || args.acdirmin < 0) { vfs_mount_error(mp, "illegal acdirmin: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACDIRMIN; } if (vfs_getopt(mp->mnt_optnew, "acdirmax", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acdirmax); if (ret != 1 || args.acdirmax < 0) { vfs_mount_error(mp, "illegal acdirmax: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACDIRMAX; } if (vfs_getopt(mp->mnt_optnew, "wcommitsize", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.wcommitsize); if (ret != 1 || args.wcommitsize < 0) { vfs_mount_error(mp, "illegal wcommitsize: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_WCOMMITSIZE; } if (vfs_getopt(mp->mnt_optnew, "timeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.timeo); if (ret != 1 || args.timeo <= 0) { vfs_mount_error(mp, "illegal timeo: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_TIMEO; } if (vfs_getopt(mp->mnt_optnew, "timeout", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.timeo); if (ret != 1 || args.timeo <= 0) { vfs_mount_error(mp, "illegal timeout: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_TIMEO; } if (vfs_getopt(mp->mnt_optnew, "nametimeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &nametimeo); if (ret != 1 || nametimeo < 0) { vfs_mount_error(mp, "illegal nametimeo: %s", opt); error = EINVAL; goto out; } } if (vfs_getopt(mp->mnt_optnew, "negnametimeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &negnametimeo); if (ret != 1 || negnametimeo < 0) { vfs_mount_error(mp, "illegal negnametimeo: %s", opt); error = EINVAL; goto out; } } if (vfs_getopt(mp->mnt_optnew, "minorversion", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &minvers); if (ret != 1 || minvers < 0 || minvers > 1 || (args.flags & NFSMNT_NFSV4) == 0) { vfs_mount_error(mp, "illegal minorversion: %s", opt); error = EINVAL; goto out; } } if (vfs_getopt(mp->mnt_optnew, "sec", (void **) &secname, NULL) == 0) nfs_sec_name(secname, &args.flags); if (mp->mnt_flag & MNT_UPDATE) { struct nfsmount *nmp = VFSTONFS(mp); if (nmp == NULL) { error = EIO; goto out; } /* * If a change from TCP->UDP is done and there are thread(s) * that have I/O RPC(s) in progress with a tranfer size * greater than NFS_MAXDGRAMDATA, those thread(s) will be * hung, retrying the RPC(s) forever. Usually these threads * will be seen doing an uninterruptible sleep on wait channel * "newnfsreq" (truncated to "newnfsre" by procstat). */ if (args.sotype == SOCK_DGRAM && nmp->nm_sotype == SOCK_STREAM) tprintf(td->td_proc, LOG_WARNING, "Warning: mount -u that changes TCP->UDP can result in hung threads\n"); /* * When doing an update, we can't change version, * security, switch lockd strategies or change cookie * translation */ args.flags = (args.flags & ~(NFSMNT_NFSV3 | NFSMNT_NFSV4 | NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)) | (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4 | NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)); nfs_decode_args(mp, nmp, &args, NULL, td->td_ucred, td); goto out; } /* * Make the nfs_ip_paranoia sysctl serve as the default connection * or no-connection mode for those protocols that support * no-connection mode (the flag will be cleared later for protocols * that do not support no-connection mode). This will allow a client * to receive replies from a different IP then the request was * sent to. Note: default value for nfs_ip_paranoia is 1 (paranoid), * not 0. */ if (nfs_ip_paranoia == 0) args.flags |= NFSMNT_NOCONN; if (has_nfs_args_opt != 0) { /* * In the 'nfs_args' case, the pointers in the args * structure are in userland - we copy them in here. */ if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX) { vfs_mount_error(mp, "Bad file handle"); error = EINVAL; goto out; } error = copyin((caddr_t)args.fh, (caddr_t)nfh, args.fhsize); if (error != 0) goto out; error = copyinstr(args.hostname, hst, MNAMELEN - 1, &hstlen); if (error != 0) goto out; bzero(&hst[hstlen], MNAMELEN - hstlen); args.hostname = hst; /* sockargs() call must be after above copyin() calls */ error = getsockaddr(&nam, (caddr_t)args.addr, args.addrlen); if (error != 0) goto out; } else { if (vfs_getopt(mp->mnt_optnew, "fh", (void **)&args.fh, &args.fhsize) == 0) { if (args.fhsize < 0 || args.fhsize > NFSX_FHMAX) { vfs_mount_error(mp, "Bad file handle"); error = EINVAL; goto out; } bcopy(args.fh, nfh, args.fhsize); } else { args.fhsize = 0; } (void) vfs_getopt(mp->mnt_optnew, "hostname", (void **)&args.hostname, &len); if (args.hostname == NULL) { vfs_mount_error(mp, "Invalid hostname"); error = EINVAL; goto out; } bcopy(args.hostname, hst, MNAMELEN); hst[MNAMELEN - 1] = '\0'; } if (vfs_getopt(mp->mnt_optnew, "principal", (void **)&name, NULL) == 0) strlcpy(srvkrbname, name, sizeof (srvkrbname)); else snprintf(srvkrbname, sizeof (srvkrbname), "nfs@%s", hst); srvkrbnamelen = strlen(srvkrbname); if (vfs_getopt(mp->mnt_optnew, "gssname", (void **)&name, NULL) == 0) strlcpy(krbname, name, sizeof (krbname)); else krbname[0] = '\0'; krbnamelen = strlen(krbname); if (vfs_getopt(mp->mnt_optnew, "dirpath", (void **)&name, NULL) == 0) strlcpy(dirpath, name, sizeof (dirpath)); else dirpath[0] = '\0'; dirlen = strlen(dirpath); if (has_nfs_args_opt == 0) { if (vfs_getopt(mp->mnt_optnew, "addr", (void **)&args.addr, &args.addrlen) == 0) { if (args.addrlen > SOCK_MAXADDRLEN) { error = ENAMETOOLONG; goto out; } nam = malloc(args.addrlen, M_SONAME, M_WAITOK); bcopy(args.addr, nam, args.addrlen); nam->sa_len = args.addrlen; } else { vfs_mount_error(mp, "No server address"); error = EINVAL; goto out; } } args.fh = nfh; error = mountnfs(&args, mp, nam, hst, krbname, krbnamelen, dirpath, dirlen, srvkrbname, srvkrbnamelen, &vp, td->td_ucred, td, nametimeo, negnametimeo, minvers); out: if (!error) { MNT_ILOCK(mp); - mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_NO_IOPF; + mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_NO_IOPF | + MNTK_USES_BCACHE; MNT_IUNLOCK(mp); } return (error); } /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the sockargs() call because sockargs() allocates an mbuf and * an error after that means that I have to release the mbuf. */ /* ARGSUSED */ static int nfs_cmount(struct mntarg *ma, void *data, uint64_t flags) { int error; struct nfs_args args; error = copyin(data, &args, sizeof (struct nfs_args)); if (error) return error; ma = mount_arg(ma, "nfs_args", &args, sizeof args); error = kernel_mount(ma, flags); return (error); } /* * Common code for mount and mountroot */ static int mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, char *hst, u_char *krbname, int krbnamelen, u_char *dirpath, int dirlen, u_char *srvkrbname, int srvkrbnamelen, struct vnode **vpp, struct ucred *cred, struct thread *td, int nametimeo, int negnametimeo, int minvers) { struct nfsmount *nmp; struct nfsnode *np; int error, trycnt, ret; struct nfsvattr nfsva; struct nfsclclient *clp; struct nfsclds *dsp, *tdsp; uint32_t lease; static u_int64_t clval = 0; NFSCL_DEBUG(3, "in mnt\n"); clp = NULL; if (mp->mnt_flag & MNT_UPDATE) { nmp = VFSTONFS(mp); printf("%s: MNT_UPDATE is no longer handled here\n", __func__); FREE(nam, M_SONAME); return (0); } else { MALLOC(nmp, struct nfsmount *, sizeof (struct nfsmount) + krbnamelen + dirlen + srvkrbnamelen + 2, M_NEWNFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&nmp->nm_bufq); if (clval == 0) clval = (u_int64_t)nfsboottime.tv_sec; nmp->nm_clval = clval++; nmp->nm_krbnamelen = krbnamelen; nmp->nm_dirpathlen = dirlen; nmp->nm_srvkrbnamelen = srvkrbnamelen; if (td->td_ucred->cr_uid != (uid_t)0) { /* * nm_uid is used to get KerberosV credentials for * the nfsv4 state handling operations if there is * no host based principal set. Use the uid of * this user if not root, since they are doing the * mount. I don't think setting this for root will * work, since root normally does not have user * credentials in a credentials cache. */ nmp->nm_uid = td->td_ucred->cr_uid; } else { /* * Just set to -1, so it won't be used. */ nmp->nm_uid = (uid_t)-1; } /* Copy and null terminate all the names */ if (nmp->nm_krbnamelen > 0) { bcopy(krbname, nmp->nm_krbname, nmp->nm_krbnamelen); nmp->nm_name[nmp->nm_krbnamelen] = '\0'; } if (nmp->nm_dirpathlen > 0) { bcopy(dirpath, NFSMNT_DIRPATH(nmp), nmp->nm_dirpathlen); nmp->nm_name[nmp->nm_krbnamelen + nmp->nm_dirpathlen + 1] = '\0'; } if (nmp->nm_srvkrbnamelen > 0) { bcopy(srvkrbname, NFSMNT_SRVKRBNAME(nmp), nmp->nm_srvkrbnamelen); nmp->nm_name[nmp->nm_krbnamelen + nmp->nm_dirpathlen + nmp->nm_srvkrbnamelen + 2] = '\0'; } nmp->nm_sockreq.nr_cred = crhold(cred); mtx_init(&nmp->nm_sockreq.nr_mtx, "nfssock", NULL, MTX_DEF); mp->mnt_data = nmp; nmp->nm_getinfo = nfs_getnlminfo; nmp->nm_vinvalbuf = ncl_vinvalbuf; } vfs_getnewfsid(mp); nmp->nm_mountp = mp; mtx_init(&nmp->nm_mtx, "NFSmount lock", NULL, MTX_DEF | MTX_DUPOK); /* * Since nfs_decode_args() might optionally set them, these * need to be set to defaults before the call, so that the * optional settings aren't overwritten. */ nmp->nm_nametimeo = nametimeo; nmp->nm_negnametimeo = negnametimeo; nmp->nm_timeo = NFS_TIMEO; nmp->nm_retry = NFS_RETRANS; nmp->nm_readahead = NFS_DEFRAHEAD; if (desiredvnodes >= 11000) nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000); else nmp->nm_wcommitsize = hibufspace / 10; if ((argp->flags & NFSMNT_NFSV4) != 0) nmp->nm_minorvers = minvers; else nmp->nm_minorvers = 0; nfs_decode_args(mp, nmp, argp, hst, cred, td); /* * V2 can only handle 32 bit filesizes. A 4GB-1 limit may be too * high, depending on whether we end up with negative offsets in * the client or server somewhere. 2GB-1 may be safer. * * For V3, ncl_fsinfo will adjust this as necessary. Assume maximum * that we can handle until we find out otherwise. */ if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) nmp->nm_maxfilesize = 0xffffffffLL; else nmp->nm_maxfilesize = OFF_MAX; if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) { nmp->nm_wsize = NFS_WSIZE; nmp->nm_rsize = NFS_RSIZE; nmp->nm_readdirsize = NFS_READDIRSIZE; } nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_tprintf_delay = nfs_tprintf_delay; if (nmp->nm_tprintf_delay < 0) nmp->nm_tprintf_delay = 0; nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay; if (nmp->nm_tprintf_initial_delay < 0) nmp->nm_tprintf_initial_delay = 0; nmp->nm_fhsize = argp->fhsize; if (nmp->nm_fhsize > 0) bcopy((caddr_t)argp->fh, (caddr_t)nmp->nm_fh, argp->fhsize); bcopy(hst, mp->mnt_stat.f_mntfromname, MNAMELEN); nmp->nm_nam = nam; /* Set up the sockets and per-host congestion */ nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; nmp->nm_sockreq.nr_prog = NFS_PROG; if ((argp->flags & NFSMNT_NFSV4)) nmp->nm_sockreq.nr_vers = NFS_VER4; else if ((argp->flags & NFSMNT_NFSV3)) nmp->nm_sockreq.nr_vers = NFS_VER3; else nmp->nm_sockreq.nr_vers = NFS_VER2; if ((error = newnfs_connect(nmp, &nmp->nm_sockreq, cred, td, 0))) goto bad; /* For NFSv4.1, get the clientid now. */ if (nmp->nm_minorvers > 0) { NFSCL_DEBUG(3, "at getcl\n"); error = nfscl_getcl(mp, cred, td, 0, &clp); NFSCL_DEBUG(3, "aft getcl=%d\n", error); if (error != 0) goto bad; } if (nmp->nm_fhsize == 0 && (nmp->nm_flag & NFSMNT_NFSV4) && nmp->nm_dirpathlen > 0) { NFSCL_DEBUG(3, "in dirp\n"); /* * If the fhsize on the mount point == 0 for V4, the mount * path needs to be looked up. */ trycnt = 3; do { error = nfsrpc_getdirpath(nmp, NFSMNT_DIRPATH(nmp), cred, td); NFSCL_DEBUG(3, "aft dirp=%d\n", error); if (error) (void) nfs_catnap(PZERO, error, "nfsgetdirp"); } while (error && --trycnt > 0); if (error) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); goto bad; } } /* * A reference count is needed on the nfsnode representing the * remote root. If this object is not persistent, then backward * traversals of the mount point (i.e. "..") will not work if * the nfsnode gets flushed out of the cache. Ufs does not have * this problem, because one can identify root inodes by their * number == ROOTINO (2). */ if (nmp->nm_fhsize > 0) { /* * Set f_iosize to NFS_DIRBLKSIZ so that bo_bsize gets set * non-zero for the root vnode. f_iosize will be set correctly * by nfs_statfs() before any I/O occurs. */ mp->mnt_stat.f_iosize = NFS_DIRBLKSIZ; error = ncl_nget(mp, nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE); if (error) goto bad; *vpp = NFSTOV(np); /* * Get file attributes and transfer parameters for the * mountpoint. This has the side effect of filling in * (*vpp)->v_type with the correct value. */ ret = nfsrpc_getattrnovp(nmp, nmp->nm_fh, nmp->nm_fhsize, 1, cred, td, &nfsva, NULL, &lease); if (ret) { /* * Just set default values to get things going. */ NFSBZERO((caddr_t)&nfsva, sizeof (struct nfsvattr)); nfsva.na_vattr.va_type = VDIR; nfsva.na_vattr.va_mode = 0777; nfsva.na_vattr.va_nlink = 100; nfsva.na_vattr.va_uid = (uid_t)0; nfsva.na_vattr.va_gid = (gid_t)0; nfsva.na_vattr.va_fileid = 2; nfsva.na_vattr.va_gen = 1; nfsva.na_vattr.va_blocksize = NFS_FABLKSIZE; nfsva.na_vattr.va_size = 512 * 1024; lease = 60; } (void) nfscl_loadattrcache(vpp, &nfsva, NULL, NULL, 0, 1); if (nmp->nm_minorvers > 0) { NFSCL_DEBUG(3, "lease=%d\n", (int)lease); NFSLOCKCLSTATE(); clp->nfsc_renew = NFSCL_RENEW(lease); clp->nfsc_expire = NFSD_MONOSEC + clp->nfsc_renew; clp->nfsc_clientidrev++; if (clp->nfsc_clientidrev == 0) clp->nfsc_clientidrev++; NFSUNLOCKCLSTATE(); /* * Mount will succeed, so the renew thread can be * started now. */ nfscl_start_renewthread(clp); nfscl_clientrelease(clp); } if (argp->flags & NFSMNT_NFSV3) ncl_fsinfo(nmp, *vpp, cred, td); /* Mark if the mount point supports NFSv4 ACLs. */ if ((argp->flags & NFSMNT_NFSV4) != 0 && nfsrv_useacl != 0 && ret == 0 && NFSISSET_ATTRBIT(&nfsva.na_suppattr, NFSATTRBIT_ACL)) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } /* * Lose the lock but keep the ref. */ NFSVOPUNLOCK(*vpp, 0); return (0); } error = EIO; bad: if (clp != NULL) nfscl_clientrelease(clp); newnfs_disconnect(&nmp->nm_sockreq); crfree(nmp->nm_sockreq.nr_cred); if (nmp->nm_sockreq.nr_auth != NULL) AUTH_DESTROY(nmp->nm_sockreq.nr_auth); mtx_destroy(&nmp->nm_sockreq.nr_mtx); mtx_destroy(&nmp->nm_mtx); if (nmp->nm_clp != NULL) { NFSLOCKCLSTATE(); LIST_REMOVE(nmp->nm_clp, nfsc_list); NFSUNLOCKCLSTATE(); free(nmp->nm_clp, M_NFSCLCLIENT); } TAILQ_FOREACH_SAFE(dsp, &nmp->nm_sess, nfsclds_list, tdsp) nfscl_freenfsclds(dsp); FREE(nmp, M_NEWNFSMNT); FREE(nam, M_SONAME); return (error); } /* * unmount system call */ static int nfs_unmount(struct mount *mp, int mntflags) { struct thread *td; struct nfsmount *nmp; int error, flags = 0, i, trycnt = 0; struct nfsclds *dsp, *tdsp; td = curthread; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; nmp = VFSTONFS(mp); /* * Goes something like this.. * - Call vflush() to clear out vnodes for this filesystem * - Close the socket * - Free up the data structures */ /* In the forced case, cancel any outstanding requests. */ if (mntflags & MNT_FORCE) { error = newnfs_nmcancelreqs(nmp); if (error) goto out; /* For a forced close, get rid of the renew thread now */ nfscl_umount(nmp, td); } /* We hold 1 extra ref on the root vnode; see comment in mountnfs(). */ do { error = vflush(mp, 1, flags, td); if ((mntflags & MNT_FORCE) && error != 0 && ++trycnt < 30) (void) nfs_catnap(PSOCK, error, "newndm"); } while ((mntflags & MNT_FORCE) && error != 0 && trycnt < 30); if (error) goto out; /* * We are now committed to the unmount. */ if ((mntflags & MNT_FORCE) == 0) nfscl_umount(nmp, td); /* Make sure no nfsiods are assigned to this mount. */ mtx_lock(&ncl_iod_mutex); for (i = 0; i < NFS_MAXASYNCDAEMON; i++) if (ncl_iodmount[i] == nmp) { ncl_iodwant[i] = NFSIOD_AVAILABLE; ncl_iodmount[i] = NULL; } mtx_unlock(&ncl_iod_mutex); newnfs_disconnect(&nmp->nm_sockreq); crfree(nmp->nm_sockreq.nr_cred); FREE(nmp->nm_nam, M_SONAME); if (nmp->nm_sockreq.nr_auth != NULL) AUTH_DESTROY(nmp->nm_sockreq.nr_auth); mtx_destroy(&nmp->nm_sockreq.nr_mtx); mtx_destroy(&nmp->nm_mtx); TAILQ_FOREACH_SAFE(dsp, &nmp->nm_sess, nfsclds_list, tdsp) nfscl_freenfsclds(dsp); FREE(nmp, M_NEWNFSMNT); out: return (error); } /* * Return root of a filesystem */ static int nfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; struct nfsmount *nmp; struct nfsnode *np; int error; nmp = VFSTONFS(mp); error = ncl_nget(mp, nmp->nm_fh, nmp->nm_fhsize, &np, flags); if (error) return error; vp = NFSTOV(np); /* * Get transfer parameters and attributes for root vnode once. */ mtx_lock(&nmp->nm_mtx); if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp)) { mtx_unlock(&nmp->nm_mtx); ncl_fsinfo(nmp, vp, curthread->td_ucred, curthread); } else mtx_unlock(&nmp->nm_mtx); if (vp->v_type == VNON) vp->v_type = VDIR; vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } /* * Flush out the buffer cache */ /* ARGSUSED */ static int nfs_sync(struct mount *mp, int waitfor) { struct vnode *vp, *mvp; struct thread *td; int error, allerror = 0; td = curthread; MNT_ILOCK(mp); /* * If a forced dismount is in progress, return from here so that * the umount(2) syscall doesn't get stuck in VFS_SYNC() before * calling VFS_UNMOUNT(). */ if ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0) { MNT_IUNLOCK(mp); return (EBADF); } MNT_IUNLOCK(mp); /* * Force stale buffer cache information to be flushed. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* XXX Racy bv_cnt check. */ if (NFSVOPISLOCKED(vp) || vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY) { VI_UNLOCK(vp); continue; } if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; NFSVOPUNLOCK(vp, 0); vrele(vp); } return (allerror); } static int nfs_sysctl(struct mount *mp, fsctlop_t op, struct sysctl_req *req) { struct nfsmount *nmp = VFSTONFS(mp); struct vfsquery vq; int error; bzero(&vq, sizeof(vq)); switch (op) { #if 0 case VFS_CTL_NOLOCKS: val = (nmp->nm_flag & NFSMNT_NOLOCKS) ? 1 : 0; if (req->oldptr != NULL) { error = SYSCTL_OUT(req, &val, sizeof(val)); if (error) return (error); } if (req->newptr != NULL) { error = SYSCTL_IN(req, &val, sizeof(val)); if (error) return (error); if (val) nmp->nm_flag |= NFSMNT_NOLOCKS; else nmp->nm_flag &= ~NFSMNT_NOLOCKS; } break; #endif case VFS_CTL_QUERY: mtx_lock(&nmp->nm_mtx); if (nmp->nm_state & NFSSTA_TIMEO) vq.vq_flags |= VQ_NOTRESP; mtx_unlock(&nmp->nm_mtx); #if 0 if (!(nmp->nm_flag & NFSMNT_NOLOCKS) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) vq.vq_flags |= VQ_NOTRESPLOCK; #endif error = SYSCTL_OUT(req, &vq, sizeof(vq)); break; case VFS_CTL_TIMEO: if (req->oldptr != NULL) { error = SYSCTL_OUT(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) return (error); } if (req->newptr != NULL) { error = vfs_suser(mp, req->td); if (error) return (error); error = SYSCTL_IN(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) return (error); if (nmp->nm_tprintf_initial_delay < 0) nmp->nm_tprintf_initial_delay = 0; } break; default: return (ENOTSUP); } return (0); } /* * Purge any RPCs in progress, so that they will all return errors. * This allows dounmount() to continue as far as VFS_UNMOUNT() for a * forced dismount. */ static void nfs_purge(struct mount *mp) { struct nfsmount *nmp = VFSTONFS(mp); newnfs_nmcancelreqs(nmp); } /* * Extract the information needed by the nlm from the nfs vnode. */ static void nfs_getnlminfo(struct vnode *vp, uint8_t *fhp, size_t *fhlenp, struct sockaddr_storage *sp, int *is_v3p, off_t *sizep, struct timeval *timeop) { struct nfsmount *nmp; struct nfsnode *np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); if (fhlenp != NULL) *fhlenp = (size_t)np->n_fhp->nfh_len; if (fhp != NULL) bcopy(np->n_fhp->nfh_fh, fhp, np->n_fhp->nfh_len); if (sp != NULL) bcopy(nmp->nm_nam, sp, min(nmp->nm_nam->sa_len, sizeof(*sp))); if (is_v3p != NULL) *is_v3p = NFS_ISV3(vp); if (sizep != NULL) *sizep = np->n_size; if (timeop != NULL) { timeop->tv_sec = nmp->nm_timeo / NFS_HZ; timeop->tv_usec = (nmp->nm_timeo % NFS_HZ) * (1000000 / NFS_HZ); } } /* * This function prints out an option name, based on the conditional * argument. */ static __inline void nfscl_printopt(struct nfsmount *nmp, int testval, char *opt, char **buf, size_t *blen) { int len; if (testval != 0 && *blen > strlen(opt)) { len = snprintf(*buf, *blen, "%s", opt); if (len != strlen(opt)) printf("EEK!!\n"); *buf += len; *blen -= len; } } /* * This function printf out an options integer value. */ static __inline void nfscl_printoptval(struct nfsmount *nmp, int optval, char *opt, char **buf, size_t *blen) { int len; if (*blen > strlen(opt) + 1) { /* Could result in truncated output string. */ len = snprintf(*buf, *blen, "%s=%d", opt, optval); if (len < *blen) { *buf += len; *blen -= len; } } } /* * Load the option flags and values into the buffer. */ void nfscl_retopts(struct nfsmount *nmp, char *buffer, size_t buflen) { char *buf; size_t blen; buf = buffer; blen = buflen; nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NFSV4) != 0, "nfsv4", &buf, &blen); if ((nmp->nm_flag & NFSMNT_NFSV4) != 0) { nfscl_printoptval(nmp, nmp->nm_minorvers, ",minorversion", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_PNFS) != 0, ",pnfs", &buf, &blen); } nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NFSV3) != 0, "nfsv3", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0, "nfsv2", &buf, &blen); nfscl_printopt(nmp, nmp->nm_sotype == SOCK_STREAM, ",tcp", &buf, &blen); nfscl_printopt(nmp, nmp->nm_sotype != SOCK_STREAM, ",udp", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_RESVPORT) != 0, ",resvport", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCONN) != 0, ",noconn", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_SOFT) == 0, ",hard", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_SOFT) != 0, ",soft", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_INT) != 0, ",intr", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCTO) == 0, ",cto", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCTO) != 0, ",nocto", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NONCONTIGWR) != 0, ",noncontigwr", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NOLOCKD | NFSMNT_NFSV4)) == 0, ",lockd", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NOLOCKD | NFSMNT_NFSV4)) == NFSMNT_NOLOCKD, ",nolockd", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_RDIRPLUS) != 0, ",rdirplus", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_KERB) == 0, ",sec=sys", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY)) == NFSMNT_KERB, ",sec=krb5", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY)) == (NFSMNT_KERB | NFSMNT_INTEGRITY), ",sec=krb5i", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY)) == (NFSMNT_KERB | NFSMNT_PRIVACY), ",sec=krb5p", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acdirmin, ",acdirmin", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acdirmax, ",acdirmax", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acregmin, ",acregmin", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acregmax, ",acregmax", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_nametimeo, ",nametimeo", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_negnametimeo, ",negnametimeo", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_rsize, ",rsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_wsize, ",wsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_readdirsize, ",readdirsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_readahead, ",readahead", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_wcommitsize, ",wcommitsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_timeo, ",timeout", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_retry, ",retrans", &buf, &blen); } Index: stable/10/sys/fs/nfsserver/nfs_nfsdport.c =================================================================== --- stable/10/sys/fs/nfsserver/nfs_nfsdport.c (revision 282269) +++ stable/10/sys/fs/nfsserver/nfs_nfsdport.c (revision 282270) @@ -1,3403 +1,3406 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include /* * Functions that perform the vfs operations required by the routines in * nfsd_serv.c. It is hoped that this change will make the server more * portable. */ #include #include #include #include #include FEATURE(nfsd, "NFSv4 server"); extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1; extern int nfsrv_useacl; extern int newnfs_numnfsd; extern struct mount nfsv4root_mnt; extern struct nfsrv_stablefirst nfsrv_stablefirst; extern void (*nfsd_call_servertimer)(void); extern SVCPOOL *nfsrvd_pool; extern struct nfsv4lock nfsd_suspend_lock; extern struct nfssessionhash nfssessionhash[NFSSESSIONHASHSIZE]; struct vfsoptlist nfsv4root_opt, nfsv4root_newopt; NFSDLOCKMUTEX; struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE]; struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE]; struct mtx nfsrc_udpmtx; struct mtx nfs_v4root_mutex; struct nfsrvfh nfs_rootfh, nfs_pubfh; int nfs_pubfhset = 0, nfs_rootfhset = 0; struct proc *nfsd_master_proc = NULL; int nfsd_debuglevel = 0; static pid_t nfsd_master_pid = (pid_t)-1; static char nfsd_master_comm[MAXCOMLEN + 1]; static struct timeval nfsd_master_start; static uint32_t nfsv4_sysid = 0; static int nfssvc_srvcall(struct thread *, struct nfssvc_args *, struct ucred *); int nfsrv_enable_crossmntpt = 1; static int nfs_commit_blks; static int nfs_commit_miss; extern int nfsrv_issuedelegs; extern int nfsrv_dolocallocks; extern int nfsd_enable_stringtouid; SYSCTL_NODE(_vfs, OID_AUTO, nfsd, CTLFLAG_RW, 0, "New NFS server"); SYSCTL_INT(_vfs_nfsd, OID_AUTO, mirrormnt, CTLFLAG_RW, &nfsrv_enable_crossmntpt, 0, "Enable nfsd to cross mount points"); SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks, 0, ""); SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss, 0, ""); SYSCTL_INT(_vfs_nfsd, OID_AUTO, issue_delegations, CTLFLAG_RW, &nfsrv_issuedelegs, 0, "Enable nfsd to issue delegations"); SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLFLAG_RW, &nfsrv_dolocallocks, 0, "Enable nfsd to acquire local locks on files"); SYSCTL_INT(_vfs_nfsd, OID_AUTO, debuglevel, CTLFLAG_RW, &nfsd_debuglevel, 0, "Debug level for new nfs server"); SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_stringtouid, CTLFLAG_RW, &nfsd_enable_stringtouid, 0, "Enable nfsd to accept numeric owner_names"); #define MAX_REORDERED_RPC 16 #define NUM_HEURISTIC 1031 #define NHUSE_INIT 64 #define NHUSE_INC 16 #define NHUSE_MAX 2048 static struct nfsheur { struct vnode *nh_vp; /* vp to match (unreferenced pointer) */ off_t nh_nextoff; /* next offset for sequential detection */ int nh_use; /* use count for selection */ int nh_seqcount; /* heuristic */ } nfsheur[NUM_HEURISTIC]; /* * Heuristic to detect sequential operation. */ static struct nfsheur * nfsrv_sequential_heuristic(struct uio *uio, struct vnode *vp) { struct nfsheur *nh; int hi, try; /* Locate best candidate. */ try = 32; hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC; nh = &nfsheur[hi]; while (try--) { if (nfsheur[hi].nh_vp == vp) { nh = &nfsheur[hi]; break; } if (nfsheur[hi].nh_use > 0) --nfsheur[hi].nh_use; hi = (hi + 1) % NUM_HEURISTIC; if (nfsheur[hi].nh_use < nh->nh_use) nh = &nfsheur[hi]; } /* Initialize hint if this is a new file. */ if (nh->nh_vp != vp) { nh->nh_vp = vp; nh->nh_nextoff = uio->uio_offset; nh->nh_use = NHUSE_INIT; if (uio->uio_offset == 0) nh->nh_seqcount = 4; else nh->nh_seqcount = 1; } /* Calculate heuristic. */ if ((uio->uio_offset == 0 && nh->nh_seqcount > 0) || uio->uio_offset == nh->nh_nextoff) { /* See comments in vfs_vnops.c:sequential_heuristic(). */ nh->nh_seqcount += howmany(uio->uio_resid, 16384); if (nh->nh_seqcount > IO_SEQMAX) nh->nh_seqcount = IO_SEQMAX; } else if (qabs(uio->uio_offset - nh->nh_nextoff) <= MAX_REORDERED_RPC * imax(vp->v_mount->mnt_stat.f_iosize, uio->uio_resid)) { /* Probably a reordered RPC, leave seqcount alone. */ } else if (nh->nh_seqcount > 1) { nh->nh_seqcount /= 2; } else { nh->nh_seqcount = 0; } nh->nh_use += NHUSE_INC; if (nh->nh_use > NHUSE_MAX) nh->nh_use = NHUSE_MAX; return (nh); } /* * Get attributes into nfsvattr structure. */ int nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred, struct thread *p, int vpislocked) { int error, lockedit = 0; if (vpislocked == 0) { /* * When vpislocked == 0, the vnode is either exclusively * locked by this thread or not locked by this thread. * As such, shared lock it, if not exclusively locked. */ if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) { lockedit = 1; NFSVOPLOCK(vp, LK_SHARED | LK_RETRY); } } error = VOP_GETATTR(vp, &nvap->na_vattr, cred); if (lockedit != 0) NFSVOPUNLOCK(vp, 0); NFSEXITCODE(error); return (error); } /* * Get a file handle for a vnode. */ int nfsvno_getfh(struct vnode *vp, fhandle_t *fhp, struct thread *p) { int error; NFSBZERO((caddr_t)fhp, sizeof(fhandle_t)); fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fhp->fh_fid); NFSEXITCODE(error); return (error); } /* * Perform access checking for vnodes obtained from file handles that would * refer to files already opened by a Unix client. You cannot just use * vn_writechk() and VOP_ACCESSX() for two reasons. * 1 - You must check for exported rdonly as well as MNT_RDONLY for the write * case. * 2 - The owner is to be given access irrespective of mode bits for some * operations, so that processes that chmod after opening a file don't * break. */ int nfsvno_accchk(struct vnode *vp, accmode_t accmode, struct ucred *cred, struct nfsexstuff *exp, struct thread *p, int override, int vpislocked, u_int32_t *supportedtypep) { struct vattr vattr; int error = 0, getret = 0; if (vpislocked == 0) { if (NFSVOPLOCK(vp, LK_SHARED) != 0) { error = EPERM; goto out; } } if (accmode & VWRITE) { /* Just vn_writechk() changed to check rdonly */ /* * Disallow write attempts on read-only file systems; * unless the file is a socket or a block or character * device resident on the file system. */ if (NFSVNO_EXRDONLY(exp) || (vp->v_mount->mnt_flag & MNT_RDONLY)) { switch (vp->v_type) { case VREG: case VDIR: case VLNK: error = EROFS; default: break; } } /* * If there's shared text associated with * the inode, try to free it up once. If * we fail, we can't allow writing. */ if (VOP_IS_TEXT(vp) && error == 0) error = ETXTBSY; } if (error != 0) { if (vpislocked == 0) NFSVOPUNLOCK(vp, 0); goto out; } /* * Should the override still be applied when ACLs are enabled? */ error = VOP_ACCESSX(vp, accmode, cred, p); if (error != 0 && (accmode & (VDELETE | VDELETE_CHILD))) { /* * Try again with VEXPLICIT_DENY, to see if the test for * deletion is supported. */ error = VOP_ACCESSX(vp, accmode | VEXPLICIT_DENY, cred, p); if (error == 0) { if (vp->v_type == VDIR) { accmode &= ~(VDELETE | VDELETE_CHILD); accmode |= VWRITE; error = VOP_ACCESSX(vp, accmode, cred, p); } else if (supportedtypep != NULL) { *supportedtypep &= ~NFSACCESS_DELETE; } } } /* * Allow certain operations for the owner (reads and writes * on files that are already open). */ if (override != NFSACCCHK_NOOVERRIDE && (error == EPERM || error == EACCES)) { if (cred->cr_uid == 0 && (override & NFSACCCHK_ALLOWROOT)) error = 0; else if (override & NFSACCCHK_ALLOWOWNER) { getret = VOP_GETATTR(vp, &vattr, cred); if (getret == 0 && cred->cr_uid == vattr.va_uid) error = 0; } } if (vpislocked == 0) NFSVOPUNLOCK(vp, 0); out: NFSEXITCODE(error); return (error); } /* * Set attribute(s) vnop. */ int nfsvno_setattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { int error; error = VOP_SETATTR(vp, &nvap->na_vattr, cred); NFSEXITCODE(error); return (error); } /* * Set up nameidata for a lookup() call and do it. */ int nfsvno_namei(struct nfsrv_descript *nd, struct nameidata *ndp, struct vnode *dp, int islocked, struct nfsexstuff *exp, struct thread *p, struct vnode **retdirp) { struct componentname *cnp = &ndp->ni_cnd; int i; struct iovec aiov; struct uio auio; int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0, linklen; int error = 0, crossmnt; char *cp; *retdirp = NULL; cnp->cn_nameptr = cnp->cn_pnbuf; ndp->ni_strictrelative = 0; /* * Extract and set starting directory. */ if (dp->v_type != VDIR) { if (islocked) vput(dp); else vrele(dp); nfsvno_relpathbuf(ndp); error = ENOTDIR; goto out1; } if (islocked) NFSVOPUNLOCK(dp, 0); VREF(dp); *retdirp = dp; if (NFSVNO_EXRDONLY(exp)) cnp->cn_flags |= RDONLY; ndp->ni_segflg = UIO_SYSSPACE; crossmnt = 1; if (nd->nd_flag & ND_PUBLOOKUP) { ndp->ni_loopcnt = 0; if (cnp->cn_pnbuf[0] == '/') { vrele(dp); /* * Check for degenerate pathnames here, since lookup() * panics on them. */ for (i = 1; i < ndp->ni_pathlen; i++) if (cnp->cn_pnbuf[i] != '/') break; if (i == ndp->ni_pathlen) { error = NFSERR_ACCES; goto out; } dp = rootvnode; VREF(dp); } } else if ((nfsrv_enable_crossmntpt == 0 && NFSVNO_EXPORTED(exp)) || (nd->nd_flag & ND_NFSV4) == 0) { /* * Only cross mount points for NFSv4 when doing a * mount while traversing the file system above * the mount point, unless nfsrv_enable_crossmntpt is set. */ cnp->cn_flags |= NOCROSSMOUNT; crossmnt = 0; } /* * Initialize for scan, set ni_startdir and bump ref on dp again * because lookup() will dereference ni_startdir. */ cnp->cn_thread = p; ndp->ni_startdir = dp; ndp->ni_rootdir = rootvnode; ndp->ni_topdir = NULL; if (!lockleaf) cnp->cn_flags |= LOCKLEAF; for (;;) { cnp->cn_nameptr = cnp->cn_pnbuf; /* * Call lookup() to do the real work. If an error occurs, * ndp->ni_vp and ni_dvp are left uninitialized or NULL and * we do not have to dereference anything before returning. * In either case ni_startdir will be dereferenced and NULLed * out. */ error = lookup(ndp); if (error) break; /* * Check for encountering a symbolic link. Trivial * termination occurs if no symlink encountered. */ if ((cnp->cn_flags & ISSYMLINK) == 0) { if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) nfsvno_relpathbuf(ndp); if (ndp->ni_vp && !lockleaf) NFSVOPUNLOCK(ndp->ni_vp, 0); break; } /* * Validate symlink */ if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) NFSVOPUNLOCK(ndp->ni_dvp, 0); if (!(nd->nd_flag & ND_PUBLOOKUP)) { error = EINVAL; goto badlink2; } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; goto badlink2; } if (ndp->ni_pathlen > 1) cp = uma_zalloc(namei_zone, M_WAITOK); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = NULL; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { badlink1: if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); badlink2: vrele(ndp->ni_dvp); vput(ndp->ni_vp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { error = ENOENT; goto badlink1; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { error = ENAMETOOLONG; goto badlink1; } /* * Adjust or replace path */ if (ndp->ni_pathlen > 1) { NFSBCOPY(ndp->ni_next, cp + linklen, ndp->ni_pathlen); uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; /* * Cleanup refs for next loop and check if root directory * should replace current directory. Normally ni_dvp * becomes the new base directory and is cleaned up when * we loop. Explicitly null pointers after invalidation * to clarify operation. */ vput(ndp->ni_vp); ndp->ni_vp = NULL; if (cnp->cn_pnbuf[0] == '/') { vrele(ndp->ni_dvp); ndp->ni_dvp = ndp->ni_rootdir; VREF(ndp->ni_dvp); } ndp->ni_startdir = ndp->ni_dvp; ndp->ni_dvp = NULL; } if (!lockleaf) cnp->cn_flags &= ~LOCKLEAF; out: if (error) { nfsvno_relpathbuf(ndp); ndp->ni_vp = NULL; ndp->ni_dvp = NULL; ndp->ni_startdir = NULL; } else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) { ndp->ni_dvp = NULL; } out1: NFSEXITCODE2(error, nd); return (error); } /* * Set up a pathname buffer and return a pointer to it and, optionally * set a hash pointer. */ void nfsvno_setpathbuf(struct nameidata *ndp, char **bufpp, u_long **hashpp) { struct componentname *cnp = &ndp->ni_cnd; cnp->cn_flags |= (NOMACCHECK | HASBUF); cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); if (hashpp != NULL) *hashpp = NULL; *bufpp = cnp->cn_pnbuf; } /* * Release the above path buffer, if not released by nfsvno_namei(). */ void nfsvno_relpathbuf(struct nameidata *ndp) { if ((ndp->ni_cnd.cn_flags & HASBUF) == 0) panic("nfsrelpath"); uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); ndp->ni_cnd.cn_flags &= ~HASBUF; } /* * Readlink vnode op into an mbuf list. */ int nfsvno_readlink(struct vnode *vp, struct ucred *cred, struct thread *p, struct mbuf **mpp, struct mbuf **mpendp, int *lenp) { struct iovec iv[(NFS_MAXPATHLEN+MLEN-1)/MLEN]; struct iovec *ivp = iv; struct uio io, *uiop = &io; struct mbuf *mp, *mp2 = NULL, *mp3 = NULL; int i, len, tlen, error = 0; len = 0; i = 0; while (len < NFS_MAXPATHLEN) { NFSMGET(mp); MCLGET(mp, M_WAITOK); mp->m_len = NFSMSIZ(mp); if (len == 0) { mp3 = mp2 = mp; } else { mp2->m_next = mp; mp2 = mp; } if ((len + mp->m_len) > NFS_MAXPATHLEN) { mp->m_len = NFS_MAXPATHLEN - len; len = NFS_MAXPATHLEN; } else { len += mp->m_len; } ivp->iov_base = mtod(mp, caddr_t); ivp->iov_len = mp->m_len; i++; ivp++; } uiop->uio_iov = iv; uiop->uio_iovcnt = i; uiop->uio_offset = 0; uiop->uio_resid = len; uiop->uio_rw = UIO_READ; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = NULL; error = VOP_READLINK(vp, uiop, cred); if (error) { m_freem(mp3); *lenp = 0; goto out; } if (uiop->uio_resid > 0) { len -= uiop->uio_resid; tlen = NFSM_RNDUP(len); nfsrv_adj(mp3, NFS_MAXPATHLEN - tlen, tlen - len); } *lenp = len; *mpp = mp3; *mpendp = mp; out: NFSEXITCODE(error); return (error); } /* * Read vnode op call into mbuf list. */ int nfsvno_read(struct vnode *vp, off_t off, int cnt, struct ucred *cred, struct thread *p, struct mbuf **mpp, struct mbuf **mpendp) { struct mbuf *m; int i; struct iovec *iv; struct iovec *iv2; int error = 0, len, left, siz, tlen, ioflag = 0; struct mbuf *m2 = NULL, *m3; struct uio io, *uiop = &io; struct nfsheur *nh; len = left = NFSM_RNDUP(cnt); m3 = NULL; /* * Generate the mbuf list with the uio_iov ref. to it. */ i = 0; while (left > 0) { NFSMGET(m); MCLGET(m, M_WAITOK); m->m_len = 0; siz = min(M_TRAILINGSPACE(m), left); left -= siz; i++; if (m3) m2->m_next = m; else m3 = m; m2 = m; } MALLOC(iv, struct iovec *, i * sizeof (struct iovec), M_TEMP, M_WAITOK); uiop->uio_iov = iv2 = iv; m = m3; left = len; i = 0; while (left > 0) { if (m == NULL) panic("nfsvno_read iov"); siz = min(M_TRAILINGSPACE(m), left); if (siz > 0) { iv->iov_base = mtod(m, caddr_t) + m->m_len; iv->iov_len = siz; m->m_len += siz; left -= siz; iv++; i++; } m = m->m_next; } uiop->uio_iovcnt = i; uiop->uio_offset = off; uiop->uio_resid = len; uiop->uio_rw = UIO_READ; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = NULL; nh = nfsrv_sequential_heuristic(uiop, vp); ioflag |= nh->nh_seqcount << IO_SEQSHIFT; error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred); FREE((caddr_t)iv2, M_TEMP); if (error) { m_freem(m3); *mpp = NULL; goto out; } nh->nh_nextoff = uiop->uio_offset; tlen = len - uiop->uio_resid; cnt = cnt < tlen ? cnt : tlen; tlen = NFSM_RNDUP(cnt); if (tlen == 0) { m_freem(m3); m3 = NULL; } else if (len != tlen || tlen != cnt) nfsrv_adj(m3, len - tlen, tlen - cnt); *mpp = m3; *mpendp = m2; out: NFSEXITCODE(error); return (error); } /* * Write vnode op from an mbuf list. */ int nfsvno_write(struct vnode *vp, off_t off, int retlen, int cnt, int stable, struct mbuf *mp, char *cp, struct ucred *cred, struct thread *p) { struct iovec *ivp; int i, len; struct iovec *iv; int ioflags, error; struct uio io, *uiop = &io; struct nfsheur *nh; MALLOC(ivp, struct iovec *, cnt * sizeof (struct iovec), M_TEMP, M_WAITOK); uiop->uio_iov = iv = ivp; uiop->uio_iovcnt = cnt; i = mtod(mp, caddr_t) + mp->m_len - cp; len = retlen; while (len > 0) { if (mp == NULL) panic("nfsvno_write"); if (i > 0) { i = min(i, len); ivp->iov_base = cp; ivp->iov_len = i; ivp++; len -= i; } mp = mp->m_next; if (mp) { i = mp->m_len; cp = mtod(mp, caddr_t); } } if (stable == NFSWRITE_UNSTABLE) ioflags = IO_NODELOCKED; else ioflags = (IO_SYNC | IO_NODELOCKED); uiop->uio_resid = retlen; uiop->uio_rw = UIO_WRITE; uiop->uio_segflg = UIO_SYSSPACE; NFSUIOPROC(uiop, p); uiop->uio_offset = off; nh = nfsrv_sequential_heuristic(uiop, vp); ioflags |= nh->nh_seqcount << IO_SEQSHIFT; error = VOP_WRITE(vp, uiop, ioflags, cred); if (error == 0) nh->nh_nextoff = uiop->uio_offset; FREE((caddr_t)iv, M_TEMP); NFSEXITCODE(error); return (error); } /* * Common code for creating a regular file (plus special files for V2). */ int nfsvno_createsub(struct nfsrv_descript *nd, struct nameidata *ndp, struct vnode **vpp, struct nfsvattr *nvap, int *exclusive_flagp, int32_t *cverf, NFSDEV_T rdev, struct thread *p, struct nfsexstuff *exp) { u_quad_t tempsize; int error; error = nd->nd_repstat; if (!error && ndp->ni_vp == NULL) { if (nvap->na_type == VREG || nvap->na_type == VSOCK) { vrele(ndp->ni_startdir); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); vput(ndp->ni_dvp); nfsvno_relpathbuf(ndp); if (!error) { if (*exclusive_flagp) { *exclusive_flagp = 0; NFSVNO_ATTRINIT(nvap); nvap->na_atime.tv_sec = cverf[0]; nvap->na_atime.tv_nsec = cverf[1]; error = VOP_SETATTR(ndp->ni_vp, &nvap->na_vattr, nd->nd_cred); } } /* * NFS V2 Only. nfsrvd_mknod() does this for V3. * (This implies, just get out on an error.) */ } else if (nvap->na_type == VCHR || nvap->na_type == VBLK || nvap->na_type == VFIFO) { if (nvap->na_type == VCHR && rdev == 0xffffffff) nvap->na_type = VFIFO; if (nvap->na_type != VFIFO && (error = priv_check_cred(nd->nd_cred, PRIV_VFS_MKNOD_DEV, 0))) { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vput(ndp->ni_dvp); goto out; } nvap->na_rdev = rdev; error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); vput(ndp->ni_dvp); nfsvno_relpathbuf(ndp); vrele(ndp->ni_startdir); if (error) goto out; } else { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vput(ndp->ni_dvp); error = ENXIO; goto out; } *vpp = ndp->ni_vp; } else { /* * Handle cases where error is already set and/or * the file exists. * 1 - clean up the lookup * 2 - iff !error and na_size set, truncate it */ vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); *vpp = ndp->ni_vp; if (ndp->ni_dvp == *vpp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); if (!error && nvap->na_size != VNOVAL) { error = nfsvno_accchk(*vpp, VWRITE, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, NULL); if (!error) { tempsize = nvap->na_size; NFSVNO_ATTRINIT(nvap); nvap->na_size = tempsize; error = VOP_SETATTR(*vpp, &nvap->na_vattr, nd->nd_cred); } } if (error) vput(*vpp); } out: NFSEXITCODE(error); return (error); } /* * Do a mknod vnode op. */ int nfsvno_mknod(struct nameidata *ndp, struct nfsvattr *nvap, struct ucred *cred, struct thread *p) { int error = 0; enum vtype vtyp; vtyp = nvap->na_type; /* * Iff doesn't exist, create it. */ if (ndp->ni_vp) { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vput(ndp->ni_dvp); vrele(ndp->ni_vp); error = EEXIST; goto out; } if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vput(ndp->ni_dvp); error = NFSERR_BADTYPE; goto out; } if (vtyp == VSOCK) { vrele(ndp->ni_startdir); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); vput(ndp->ni_dvp); nfsvno_relpathbuf(ndp); } else { if (nvap->na_type != VFIFO && (error = priv_check_cred(cred, PRIV_VFS_MKNOD_DEV, 0))) { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vput(ndp->ni_dvp); goto out; } error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); vput(ndp->ni_dvp); nfsvno_relpathbuf(ndp); vrele(ndp->ni_startdir); /* * Since VOP_MKNOD returns the ni_vp, I can't * see any reason to do the lookup. */ } out: NFSEXITCODE(error); return (error); } /* * Mkdir vnode op. */ int nfsvno_mkdir(struct nameidata *ndp, struct nfsvattr *nvap, uid_t saved_uid, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { int error = 0; if (ndp->ni_vp != NULL) { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); vrele(ndp->ni_vp); nfsvno_relpathbuf(ndp); error = EEXIST; goto out; } error = VOP_MKDIR(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); vput(ndp->ni_dvp); nfsvno_relpathbuf(ndp); out: NFSEXITCODE(error); return (error); } /* * symlink vnode op. */ int nfsvno_symlink(struct nameidata *ndp, struct nfsvattr *nvap, char *pathcp, int pathlen, int not_v2, uid_t saved_uid, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { int error = 0; if (ndp->ni_vp) { vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); vrele(ndp->ni_vp); error = EEXIST; goto out; } error = VOP_SYMLINK(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr, pathcp); vput(ndp->ni_dvp); vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); /* * Although FreeBSD still had the lookup code in * it for 7/current, there doesn't seem to be any * point, since VOP_SYMLINK() returns the ni_vp. * Just vput it for v2. */ if (!not_v2 && !error) vput(ndp->ni_vp); out: NFSEXITCODE(error); return (error); } /* * Parse symbolic link arguments. * This function has an ugly side effect. It will MALLOC() an area for * the symlink and set iov_base to point to it, only if it succeeds. * So, if it returns with uiop->uio_iov->iov_base != NULL, that must * be FREE'd later. */ int nfsvno_getsymlink(struct nfsrv_descript *nd, struct nfsvattr *nvap, struct thread *p, char **pathcpp, int *lenp) { u_int32_t *tl; char *pathcp = NULL; int error = 0, len; struct nfsv2_sattr *sp; *pathcpp = NULL; *lenp = 0; if ((nd->nd_flag & ND_NFSV3) && (error = nfsrv_sattr(nd, nvap, NULL, NULL, p))) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); if (len > NFS_MAXPATHLEN || len <= 0) { error = EBADRPC; goto nfsmout; } MALLOC(pathcp, caddr_t, len + 1, M_TEMP, M_WAITOK); error = nfsrv_mtostr(nd, pathcp, len); if (error) goto nfsmout; if (nd->nd_flag & ND_NFSV2) { NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR); nvap->na_mode = fxdr_unsigned(u_int16_t, sp->sa_mode); } *pathcpp = pathcp; *lenp = len; NFSEXITCODE2(0, nd); return (0); nfsmout: if (pathcp) free(pathcp, M_TEMP); NFSEXITCODE2(error, nd); return (error); } /* * Remove a non-directory object. */ int nfsvno_removesub(struct nameidata *ndp, int is_v4, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { struct vnode *vp; int error = 0; vp = ndp->ni_vp; if (vp->v_type == VDIR) error = NFSERR_ISDIR; else if (is_v4) error = nfsrv_checkremove(vp, 1, p); if (!error) error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd); if (ndp->ni_dvp == vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); vput(vp); if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0) nfsvno_relpathbuf(ndp); NFSEXITCODE(error); return (error); } /* * Remove a directory. */ int nfsvno_rmdirsub(struct nameidata *ndp, int is_v4, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { struct vnode *vp; int error = 0; vp = ndp->ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (ndp->ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) error = EBUSY; out: if (!error) error = VOP_RMDIR(ndp->ni_dvp, vp, &ndp->ni_cnd); if (ndp->ni_dvp == vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); vput(vp); if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0) nfsvno_relpathbuf(ndp); NFSEXITCODE(error); return (error); } /* * Rename vnode op. */ int nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, u_int32_t ndstat, u_int32_t ndflag, struct ucred *cred, struct thread *p) { struct vnode *fvp, *tvp, *tdvp; int error = 0; fvp = fromndp->ni_vp; if (ndstat) { vrele(fromndp->ni_dvp); vrele(fvp); error = ndstat; goto out1; } tdvp = tondp->ni_dvp; tvp = tondp->ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = (ndflag & ND_NFSV2) ? EISDIR : EEXIST; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = (ndflag & ND_NFSV2) ? ENOTDIR : EEXIST; goto out; } if (tvp->v_type == VDIR && tvp->v_mountedhere) { error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV; goto out; } /* * A rename to '.' or '..' results in a prematurely * unlocked vnode on FreeBSD5, so I'm just going to fail that * here. */ if ((tondp->ni_cnd.cn_namelen == 1 && tondp->ni_cnd.cn_nameptr[0] == '.') || (tondp->ni_cnd.cn_namelen == 2 && tondp->ni_cnd.cn_nameptr[0] == '.' && tondp->ni_cnd.cn_nameptr[1] == '.')) { error = EINVAL; goto out; } } if (fvp->v_type == VDIR && fvp->v_mountedhere) { error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV; goto out; } if (fvp->v_mount != tdvp->v_mount) { error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV; goto out; } if (fvp == tdvp) { error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EINVAL; goto out; } if (fvp == tvp) { /* * If source and destination are the same, there is nothing to * do. Set error to -1 to indicate this. */ error = -1; goto out; } if (ndflag & ND_NFSV4) { if (NFSVOPLOCK(fvp, LK_EXCLUSIVE) == 0) { error = nfsrv_checkremove(fvp, 0, p); NFSVOPUNLOCK(fvp, 0); } else error = EPERM; if (tvp && !error) error = nfsrv_checkremove(tvp, 1, p); } else { /* * For NFSv2 and NFSv3, try to get rid of the delegation, so * that the NFSv4 client won't be confused by the rename. * Since nfsd_recalldelegation() can only be called on an * unlocked vnode at this point and fvp is the file that will * still exist after the rename, just do fvp. */ nfsd_recalldelegation(fvp, p); } out: if (!error) { error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp, &fromndp->ni_cnd, tondp->ni_dvp, tondp->ni_vp, &tondp->ni_cnd); } else { if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fromndp->ni_dvp); vrele(fvp); if (error == -1) error = 0; } vrele(tondp->ni_startdir); nfsvno_relpathbuf(tondp); out1: vrele(fromndp->ni_startdir); nfsvno_relpathbuf(fromndp); NFSEXITCODE(error); return (error); } /* * Link vnode op. */ int nfsvno_link(struct nameidata *ndp, struct vnode *vp, struct ucred *cred, struct thread *p, struct nfsexstuff *exp) { struct vnode *xp; int error = 0; xp = ndp->ni_vp; if (xp != NULL) { error = EEXIST; } else { xp = ndp->ni_dvp; if (vp->v_mount != xp->v_mount) error = EXDEV; } if (!error) { NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY); if ((vp->v_iflag & VI_DOOMED) == 0) error = VOP_LINK(ndp->ni_dvp, vp, &ndp->ni_cnd); else error = EPERM; if (ndp->ni_dvp == vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); NFSVOPUNLOCK(vp, 0); } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); if (ndp->ni_vp) vrele(ndp->ni_vp); } nfsvno_relpathbuf(ndp); NFSEXITCODE(error); return (error); } /* * Do the fsync() appropriate for the commit. */ int nfsvno_fsync(struct vnode *vp, u_int64_t off, int cnt, struct ucred *cred, struct thread *td) { int error = 0; /* * RFC 1813 3.3.21: if count is 0, a flush from offset to the end of * file is done. At this time VOP_FSYNC does not accept offset and * byte count parameters so call VOP_FSYNC the whole file for now. * The same is true for NFSv4: RFC 3530 Sec. 14.2.3. + * File systems that do not use the buffer cache (as indicated + * by MNTK_USES_BCACHE not being set) must use VOP_FSYNC(). */ - if (cnt == 0 || cnt > MAX_COMMIT_COUNT) { + if (cnt == 0 || cnt > MAX_COMMIT_COUNT || + (vp->v_mount->mnt_kern_flag & MNTK_USES_BCACHE) == 0) { /* * Give up and do the whole thing */ if (vp->v_object && (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); } else { /* * Locate and synchronously write any buffers that fall * into the requested range. Note: we are assuming that * f_iosize is a power of 2. */ int iosize = vp->v_mount->mnt_stat.f_iosize; int iomask = iosize - 1; struct bufobj *bo; daddr_t lblkno; /* * Align to iosize boundry, super-align to page boundry. */ if (off & iomask) { cnt += off & iomask; off &= ~(u_quad_t)iomask; } if (off & PAGE_MASK) { cnt += off & PAGE_MASK; off &= ~(u_quad_t)PAGE_MASK; } lblkno = off / iosize; if (vp->v_object && (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, off, off + cnt, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_object); } bo = &vp->v_bufobj; BO_LOCK(bo); while (cnt > 0) { struct buf *bp; /* * If we have a buffer and it is marked B_DELWRI we * have to lock and write it. Otherwise the prior * write is assumed to have already been committed. * * gbincore() can return invalid buffers now so we * have to check that bit as well (though B_DELWRI * should not be set if B_INVAL is set there could be * a race here since we haven't locked the buffer). */ if ((bp = gbincore(&vp->v_bufobj, lblkno)) != NULL) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); continue; /* retry */ } if ((bp->b_flags & (B_DELWRI|B_INVAL)) == B_DELWRI) { bremfree(bp); bp->b_flags &= ~B_ASYNC; bwrite(bp); ++nfs_commit_miss; } else BUF_UNLOCK(bp); BO_LOCK(bo); } ++nfs_commit_blks; if (cnt < iosize) break; cnt -= iosize; ++lblkno; } BO_UNLOCK(bo); } NFSEXITCODE(error); return (error); } /* * Statfs vnode op. */ int nfsvno_statfs(struct vnode *vp, struct statfs *sf) { int error; error = VFS_STATFS(vp->v_mount, sf); if (error == 0) { /* * Since NFS handles these values as unsigned on the * wire, there is no way to represent negative values, * so set them to 0. Without this, they will appear * to be very large positive values for clients like * Solaris10. */ if (sf->f_bavail < 0) sf->f_bavail = 0; if (sf->f_ffree < 0) sf->f_ffree = 0; } NFSEXITCODE(error); return (error); } /* * Do the vnode op stuff for Open. Similar to nfsvno_createsub(), but * must handle nfsrv_opencheck() calls after any other access checks. */ void nfsvno_open(struct nfsrv_descript *nd, struct nameidata *ndp, nfsquad_t clientid, nfsv4stateid_t *stateidp, struct nfsstate *stp, int *exclusive_flagp, struct nfsvattr *nvap, int32_t *cverf, int create, NFSACL_T *aclp, nfsattrbit_t *attrbitp, struct ucred *cred, struct thread *p, struct nfsexstuff *exp, struct vnode **vpp) { struct vnode *vp = NULL; u_quad_t tempsize; struct nfsexstuff nes; if (ndp->ni_vp == NULL) nd->nd_repstat = nfsrv_opencheck(clientid, stateidp, stp, NULL, nd, p, nd->nd_repstat); if (!nd->nd_repstat) { if (ndp->ni_vp == NULL) { vrele(ndp->ni_startdir); nd->nd_repstat = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr); vput(ndp->ni_dvp); nfsvno_relpathbuf(ndp); if (!nd->nd_repstat) { if (*exclusive_flagp) { *exclusive_flagp = 0; NFSVNO_ATTRINIT(nvap); nvap->na_atime.tv_sec = cverf[0]; nvap->na_atime.tv_nsec = cverf[1]; nd->nd_repstat = VOP_SETATTR(ndp->ni_vp, &nvap->na_vattr, cred); } else { nfsrv_fixattr(nd, ndp->ni_vp, nvap, aclp, p, attrbitp, exp); } } vp = ndp->ni_vp; } else { if (ndp->ni_startdir) vrele(ndp->ni_startdir); nfsvno_relpathbuf(ndp); vp = ndp->ni_vp; if (create == NFSV4OPEN_CREATE) { if (ndp->ni_dvp == vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); } if (NFSVNO_ISSETSIZE(nvap) && vp->v_type == VREG) { if (ndp->ni_cnd.cn_flags & RDONLY) NFSVNO_SETEXRDONLY(&nes); else NFSVNO_EXINIT(&nes); nd->nd_repstat = nfsvno_accchk(vp, VWRITE, cred, &nes, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, NULL); nd->nd_repstat = nfsrv_opencheck(clientid, stateidp, stp, vp, nd, p, nd->nd_repstat); if (!nd->nd_repstat) { tempsize = nvap->na_size; NFSVNO_ATTRINIT(nvap); nvap->na_size = tempsize; nd->nd_repstat = VOP_SETATTR(vp, &nvap->na_vattr, cred); } } else if (vp->v_type == VREG) { nd->nd_repstat = nfsrv_opencheck(clientid, stateidp, stp, vp, nd, p, nd->nd_repstat); } } } else { if (ndp->ni_cnd.cn_flags & HASBUF) nfsvno_relpathbuf(ndp); if (ndp->ni_startdir && create == NFSV4OPEN_CREATE) { vrele(ndp->ni_startdir); if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); if (ndp->ni_vp) vput(ndp->ni_vp); } } *vpp = vp; NFSEXITCODE2(0, nd); } /* * Updates the file rev and sets the mtime and ctime * to the current clock time, returning the va_filerev and va_Xtime * values. * Return ESTALE to indicate the vnode is VI_DOOMED. */ int nfsvno_updfilerev(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred, struct thread *p) { struct vattr va; VATTR_NULL(&va); vfs_timestamp(&va.va_mtime); if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) { NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); if ((vp->v_iflag & VI_DOOMED) != 0) return (ESTALE); } (void) VOP_SETATTR(vp, &va, cred); (void) nfsvno_getattr(vp, nvap, cred, p, 1); return (0); } /* * Glue routine to nfsv4_fillattr(). */ int nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp, struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp, struct ucred *cred, struct thread *p, int isdgram, int reterr, int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno) { int error; error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror, attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root, mounted_on_fileno); NFSEXITCODE2(0, nd); return (error); } /* Since the Readdir vnode ops vary, put the entire functions in here. */ /* * nfs readdir service * - mallocs what it thinks is enough to read * count rounded up to a multiple of DIRBLKSIZ <= NFS_MAXREADDIR * - calls VOP_READDIR() * - loops around building the reply * if the output generated exceeds count break out of loop * The NFSM_CLGET macro is used here so that the reply will be packed * tightly in mbuf clusters. * - it trims out records with d_fileno == 0 * this doesn't matter for Unix clients, but they might confuse clients * for other os'. * - it trims out records with d_type == DT_WHT * these cannot be seen through NFS (unless we extend the protocol) * The alternate call nfsrvd_readdirplus() does lookups as well. * PS: The NFS protocol spec. does not clarify what the "count" byte * argument is a count of.. just name strings and file id's or the * entire reply rpc or ... * I tried just file name and id sizes and it confused the Sun client, * so I am using the full rpc size now. The "paranoia.." comment refers * to including the status longwords that are not a part of the dir. * "entry" structures, but are in the rpc. */ int nfsrvd_readdir(struct nfsrv_descript *nd, int isdgram, struct vnode *vp, struct thread *p, struct nfsexstuff *exp) { struct dirent *dp; u_int32_t *tl; int dirlen; char *cpos, *cend, *rbuf; struct nfsvattr at; int nlen, error = 0, getret = 1; int siz, cnt, fullsiz, eofflag, ncookies; u_int64_t off, toff, verf; u_long *cookies = NULL, *cookiep; struct uio io; struct iovec iv; int is_ufs; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &at); goto out; } if (nd->nd_flag & ND_NFSV2) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); off = fxdr_unsigned(u_quad_t, *tl++); } else { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED); off = fxdr_hyper(tl); tl += 2; verf = fxdr_hyper(tl); tl += 2; } toff = off; cnt = fxdr_unsigned(int, *tl); if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0) cnt = NFS_SRVMAXDATA(nd); siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1)); fullsiz = siz; if (nd->nd_flag & ND_NFSV3) { nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1); #if 0 /* * va_filerev is not sufficient as a cookie verifier, * since it is not supposed to change when entries are * removed/added unless that offset cookies returned to * the client are no longer valid. */ if (!nd->nd_repstat && toff && verf != at.na_filerev) nd->nd_repstat = NFSERR_BAD_COOKIE; #endif } if (!nd->nd_repstat && vp->v_type != VDIR) nd->nd_repstat = NFSERR_NOTDIR; if (nd->nd_repstat == 0 && cnt == 0) { if (nd->nd_flag & ND_NFSV2) /* NFSv2 does not have NFSERR_TOOSMALL */ nd->nd_repstat = EPERM; else nd->nd_repstat = NFSERR_TOOSMALL; } if (!nd->nd_repstat) nd->nd_repstat = nfsvno_accchk(vp, VEXEC, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat) { vput(vp); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); goto out; } is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0; MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); again: eofflag = 0; if (cookies) { free((caddr_t)cookies, M_TEMP); cookies = NULL; } iv.iov_base = rbuf; iv.iov_len = siz; io.uio_iov = &iv; io.uio_iovcnt = 1; io.uio_offset = (off_t)off; io.uio_resid = siz; io.uio_segflg = UIO_SYSSPACE; io.uio_rw = UIO_READ; io.uio_td = NULL; nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies, &cookies); off = (u_int64_t)io.uio_offset; if (io.uio_resid) siz -= io.uio_resid; if (!cookies && !nd->nd_repstat) nd->nd_repstat = NFSERR_PERM; if (nd->nd_flag & ND_NFSV3) { getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1); if (!nd->nd_repstat) nd->nd_repstat = getret; } /* * Handles the failed cases. nd->nd_repstat == 0 past here. */ if (nd->nd_repstat) { vput(vp); free((caddr_t)rbuf, M_TEMP); if (cookies) free((caddr_t)cookies, M_TEMP); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); goto out; } /* * If nothing read, return eof * rpc reply */ if (siz == 0) { vput(vp); if (nd->nd_flag & ND_NFSV2) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); } else { nfsrv_postopattr(nd, getret, &at); NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED); txdr_hyper(at.na_filerev, tl); tl += 2; } *tl++ = newnfs_false; *tl = newnfs_true; FREE((caddr_t)rbuf, M_TEMP); FREE((caddr_t)cookies, M_TEMP); goto out; } /* * Check for degenerate cases of nothing useful read. * If so go try again */ cpos = rbuf; cend = rbuf + siz; dp = (struct dirent *)cpos; cookiep = cookies; /* * For some reason FreeBSD's ufs_readdir() chooses to back the * directory offset up to a block boundary, so it is necessary to * skip over the records that precede the requested offset. This * requires the assumption that file offset cookies monotonically * increase. */ while (cpos < cend && ncookies > 0 && (dp->d_fileno == 0 || dp->d_type == DT_WHT || (is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff))) { cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } if (cpos >= cend || ncookies == 0) { siz = fullsiz; toff = off; goto again; } vput(vp); /* * dirlen is the size of the reply, including all XDR and must * not exceed cnt. For NFSv2, RFC1094 didn't clearly indicate * if the XDR should be included in "count", but to be safe, we do. * (Include the two booleans at the end of the reply in dirlen now.) */ if (nd->nd_flag & ND_NFSV3) { nfsrv_postopattr(nd, getret, &at); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); txdr_hyper(at.na_filerev, tl); dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED; } else { dirlen = 2 * NFSX_UNSIGNED; } /* Loop through the records and build reply */ while (cpos < cend && ncookies > 0) { nlen = dp->d_namlen; if (dp->d_fileno != 0 && dp->d_type != DT_WHT && nlen <= NFS_MAXNAMLEN) { if (nd->nd_flag & ND_NFSV3) dirlen += (6*NFSX_UNSIGNED + NFSM_RNDUP(nlen)); else dirlen += (4*NFSX_UNSIGNED + NFSM_RNDUP(nlen)); if (dirlen > cnt) { eofflag = 0; break; } /* * Build the directory record xdr from * the dirent entry. */ if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = newnfs_true; *tl++ = 0; } else { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = newnfs_true; } *tl = txdr_unsigned(dp->d_fileno); (void) nfsm_strtom(nd, dp->d_name, nlen); if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = 0; } else NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(*cookiep); } cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } if (cpos < cend) eofflag = 0; NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = newnfs_false; if (eofflag) *tl = newnfs_true; else *tl = newnfs_false; FREE((caddr_t)rbuf, M_TEMP); FREE((caddr_t)cookies, M_TEMP); out: NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * Readdirplus for V3 and Readdir for V4. */ int nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram, struct vnode *vp, struct thread *p, struct nfsexstuff *exp) { struct dirent *dp; u_int32_t *tl; int dirlen; char *cpos, *cend, *rbuf; struct vnode *nvp; fhandle_t nfh; struct nfsvattr nva, at, *nvap = &nva; struct mbuf *mb0, *mb1; struct nfsreferral *refp; int nlen, r, error = 0, getret = 1, usevget = 1; int siz, cnt, fullsiz, eofflag, ncookies, entrycnt; caddr_t bpos0, bpos1; u_int64_t off, toff, verf; u_long *cookies = NULL, *cookiep; nfsattrbit_t attrbits, rderrbits, savbits; struct uio io; struct iovec iv; struct componentname cn; int at_root, is_ufs, is_zfs, needs_unbusy, supports_nfsv4acls; struct mount *mp, *new_mp; uint64_t mounted_on_fileno; if (nd->nd_repstat) { nfsrv_postopattr(nd, getret, &at); goto out; } NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED); off = fxdr_hyper(tl); toff = off; tl += 2; verf = fxdr_hyper(tl); tl += 2; siz = fxdr_unsigned(int, *tl++); cnt = fxdr_unsigned(int, *tl); /* * Use the server's maximum data transfer size as the upper bound * on reply datalen. */ if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0) cnt = NFS_SRVMAXDATA(nd); /* * siz is a "hint" of how much directory information (name, fileid, * cookie) should be in the reply. At least one client "hints" 0, * so I set it to cnt for that case. I also round it up to the * next multiple of DIRBLKSIZ. */ if (siz <= 0) siz = cnt; siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1)); if (nd->nd_flag & ND_NFSV4) { error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (error) goto nfsmout; NFSSET_ATTRBIT(&savbits, &attrbits); NFSCLRNOTFILLABLE_ATTRBIT(&attrbits); NFSZERO_ATTRBIT(&rderrbits); NFSSETBIT_ATTRBIT(&rderrbits, NFSATTRBIT_RDATTRERROR); } else { NFSZERO_ATTRBIT(&attrbits); } fullsiz = siz; nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1); if (!nd->nd_repstat) { if (off && verf != at.na_filerev) { /* * va_filerev is not sufficient as a cookie verifier, * since it is not supposed to change when entries are * removed/added unless that offset cookies returned to * the client are no longer valid. */ #if 0 if (nd->nd_flag & ND_NFSV4) { nd->nd_repstat = NFSERR_NOTSAME; } else { nd->nd_repstat = NFSERR_BAD_COOKIE; } #endif } else if ((nd->nd_flag & ND_NFSV4) && off == 0 && verf != 0) { nd->nd_repstat = NFSERR_BAD_COOKIE; } } if (!nd->nd_repstat && vp->v_type != VDIR) nd->nd_repstat = NFSERR_NOTDIR; if (!nd->nd_repstat && cnt == 0) nd->nd_repstat = NFSERR_TOOSMALL; if (!nd->nd_repstat) nd->nd_repstat = nfsvno_accchk(vp, VEXEC, nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE, NFSACCCHK_VPISLOCKED, NULL); if (nd->nd_repstat) { vput(vp); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); goto out; } is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0; is_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs") == 0; MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); again: eofflag = 0; if (cookies) { free((caddr_t)cookies, M_TEMP); cookies = NULL; } iv.iov_base = rbuf; iv.iov_len = siz; io.uio_iov = &iv; io.uio_iovcnt = 1; io.uio_offset = (off_t)off; io.uio_resid = siz; io.uio_segflg = UIO_SYSSPACE; io.uio_rw = UIO_READ; io.uio_td = NULL; nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies, &cookies); off = (u_int64_t)io.uio_offset; if (io.uio_resid) siz -= io.uio_resid; getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1); if (!cookies && !nd->nd_repstat) nd->nd_repstat = NFSERR_PERM; if (!nd->nd_repstat) nd->nd_repstat = getret; if (nd->nd_repstat) { vput(vp); if (cookies) free((caddr_t)cookies, M_TEMP); free((caddr_t)rbuf, M_TEMP); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); goto out; } /* * If nothing read, return eof * rpc reply */ if (siz == 0) { vput(vp); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED); txdr_hyper(at.na_filerev, tl); tl += 2; *tl++ = newnfs_false; *tl = newnfs_true; free((caddr_t)cookies, M_TEMP); free((caddr_t)rbuf, M_TEMP); goto out; } /* * Check for degenerate cases of nothing useful read. * If so go try again */ cpos = rbuf; cend = rbuf + siz; dp = (struct dirent *)cpos; cookiep = cookies; /* * For some reason FreeBSD's ufs_readdir() chooses to back the * directory offset up to a block boundary, so it is necessary to * skip over the records that precede the requested offset. This * requires the assumption that file offset cookies monotonically * increase. */ while (cpos < cend && ncookies > 0 && (dp->d_fileno == 0 || dp->d_type == DT_WHT || (is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff) || ((nd->nd_flag & ND_NFSV4) && ((dp->d_namlen == 1 && dp->d_name[0] == '.') || (dp->d_namlen==2 && dp->d_name[0]=='.' && dp->d_name[1]=='.'))))) { cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } if (cpos >= cend || ncookies == 0) { siz = fullsiz; toff = off; goto again; } /* * Busy the file system so that the mount point won't go away * and, as such, VFS_VGET() can be used safely. */ mp = vp->v_mount; vfs_ref(mp); NFSVOPUNLOCK(vp, 0); nd->nd_repstat = vfs_busy(mp, 0); vfs_rel(mp); if (nd->nd_repstat != 0) { vrele(vp); free(cookies, M_TEMP); free(rbuf, M_TEMP); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); goto out; } /* * Check to see if entries in this directory can be safely acquired * via VFS_VGET() or if a switch to VOP_LOOKUP() is required. * ZFS snapshot directories need VOP_LOOKUP(), so that any * automount of the snapshot directory that is required will * be done. * This needs to be done here for NFSv4, since NFSv4 never does * a VFS_VGET() for "." or "..". */ if (is_zfs == 1) { r = VFS_VGET(mp, at.na_fileid, LK_SHARED, &nvp); if (r == EOPNOTSUPP) { usevget = 0; cn.cn_nameiop = LOOKUP; cn.cn_lkflags = LK_SHARED | LK_RETRY; cn.cn_cred = nd->nd_cred; cn.cn_thread = p; } else if (r == 0) vput(nvp); } /* * Save this position, in case there is an error before one entry * is created. */ mb0 = nd->nd_mb; bpos0 = nd->nd_bpos; /* * Fill in the first part of the reply. * dirlen is the reply length in bytes and cannot exceed cnt. * (Include the two booleans at the end of the reply in dirlen now, * so we recognize when we have exceeded cnt.) */ if (nd->nd_flag & ND_NFSV3) { dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED; nfsrv_postopattr(nd, getret, &at); } else { dirlen = NFSX_VERF + 2 * NFSX_UNSIGNED; } NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); txdr_hyper(at.na_filerev, tl); /* * Save this position, in case there is an empty reply needed. */ mb1 = nd->nd_mb; bpos1 = nd->nd_bpos; /* Loop through the records and build reply */ entrycnt = 0; while (cpos < cend && ncookies > 0 && dirlen < cnt) { nlen = dp->d_namlen; if (dp->d_fileno != 0 && dp->d_type != DT_WHT && nlen <= NFS_MAXNAMLEN && ((nd->nd_flag & ND_NFSV3) || nlen > 2 || (nlen==2 && (dp->d_name[0]!='.' || dp->d_name[1]!='.')) || (nlen == 1 && dp->d_name[0] != '.'))) { /* * Save the current position in the reply, in case * this entry exceeds cnt. */ mb1 = nd->nd_mb; bpos1 = nd->nd_bpos; /* * For readdir_and_lookup get the vnode using * the file number. */ nvp = NULL; refp = NULL; r = 0; at_root = 0; needs_unbusy = 0; new_mp = mp; mounted_on_fileno = (uint64_t)dp->d_fileno; if ((nd->nd_flag & ND_NFSV3) || NFSNONZERO_ATTRBIT(&savbits)) { if (nd->nd_flag & ND_NFSV4) refp = nfsv4root_getreferral(NULL, vp, dp->d_fileno); if (refp == NULL) { if (usevget) r = VFS_VGET(mp, dp->d_fileno, LK_SHARED, &nvp); else r = EOPNOTSUPP; if (r == EOPNOTSUPP) { if (usevget) { usevget = 0; cn.cn_nameiop = LOOKUP; cn.cn_lkflags = LK_SHARED | LK_RETRY; cn.cn_cred = nd->nd_cred; cn.cn_thread = p; } cn.cn_nameptr = dp->d_name; cn.cn_namelen = nlen; cn.cn_flags = ISLASTCN | NOFOLLOW | LOCKLEAF; if (nlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.') cn.cn_flags |= ISDOTDOT; if (NFSVOPLOCK(vp, LK_SHARED) != 0) { nd->nd_repstat = EPERM; break; } if ((vp->v_vflag & VV_ROOT) != 0 && (cn.cn_flags & ISDOTDOT) != 0) { vref(vp); nvp = vp; r = 0; } else { r = VOP_LOOKUP(vp, &nvp, &cn); if (vp != nvp) NFSVOPUNLOCK(vp, 0); } } /* * For NFSv4, check to see if nvp is * a mount point and get the mount * point vnode, as required. */ if (r == 0 && nfsrv_enable_crossmntpt != 0 && (nd->nd_flag & ND_NFSV4) != 0 && nvp->v_type == VDIR && nvp->v_mountedhere != NULL) { new_mp = nvp->v_mountedhere; r = vfs_busy(new_mp, 0); vput(nvp); nvp = NULL; if (r == 0) { r = VFS_ROOT(new_mp, LK_SHARED, &nvp); needs_unbusy = 1; if (r == 0) at_root = 1; } } } if (!r) { if (refp == NULL && ((nd->nd_flag & ND_NFSV3) || NFSNONZERO_ATTRBIT(&attrbits))) { r = nfsvno_getfh(nvp, &nfh, p); if (!r) r = nfsvno_getattr(nvp, nvap, nd->nd_cred, p, 1); if (r == 0 && is_zfs == 1 && nfsrv_enable_crossmntpt != 0 && (nd->nd_flag & ND_NFSV4) != 0 && nvp->v_type == VDIR && vp->v_mount != nvp->v_mount) { /* * For a ZFS snapshot, there is a * pseudo mount that does not set * v_mountedhere, so it needs to * be detected via a different * mount structure. */ at_root = 1; if (new_mp == mp) new_mp = nvp->v_mount; } } } else { nvp = NULL; } if (r) { if (!NFSISSET_ATTRBIT(&attrbits, NFSATTRBIT_RDATTRERROR)) { if (nvp != NULL) vput(nvp); if (needs_unbusy != 0) vfs_unbusy(new_mp); nd->nd_repstat = r; break; } } } /* * Build the directory record xdr */ if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = newnfs_true; *tl++ = 0; *tl = txdr_unsigned(dp->d_fileno); dirlen += nfsm_strtom(nd, dp->d_name, nlen); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = 0; *tl = txdr_unsigned(*cookiep); nfsrv_postopattr(nd, 0, nvap); dirlen += nfsm_fhtom(nd,(u_int8_t *)&nfh,0,1); dirlen += (5*NFSX_UNSIGNED+NFSX_V3POSTOPATTR); if (nvp != NULL) vput(nvp); } else { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = newnfs_true; *tl++ = 0; *tl = txdr_unsigned(*cookiep); dirlen += nfsm_strtom(nd, dp->d_name, nlen); if (nvp != NULL) { supports_nfsv4acls = nfs_supportsnfsv4acls(nvp); NFSVOPUNLOCK(nvp, 0); } else supports_nfsv4acls = 0; if (refp != NULL) { dirlen += nfsrv_putreferralattr(nd, &savbits, refp, 0, &nd->nd_repstat); if (nd->nd_repstat) { if (nvp != NULL) vrele(nvp); if (needs_unbusy != 0) vfs_unbusy(new_mp); break; } } else if (r) { dirlen += nfsvno_fillattr(nd, new_mp, nvp, nvap, &nfh, r, &rderrbits, nd->nd_cred, p, isdgram, 0, supports_nfsv4acls, at_root, mounted_on_fileno); } else { dirlen += nfsvno_fillattr(nd, new_mp, nvp, nvap, &nfh, r, &attrbits, nd->nd_cred, p, isdgram, 0, supports_nfsv4acls, at_root, mounted_on_fileno); } if (nvp != NULL) vrele(nvp); dirlen += (3 * NFSX_UNSIGNED); } if (needs_unbusy != 0) vfs_unbusy(new_mp); if (dirlen <= cnt) entrycnt++; } cpos += dp->d_reclen; dp = (struct dirent *)cpos; cookiep++; ncookies--; } vrele(vp); vfs_unbusy(mp); /* * If dirlen > cnt, we must strip off the last entry. If that * results in an empty reply, report NFSERR_TOOSMALL. */ if (dirlen > cnt || nd->nd_repstat) { if (!nd->nd_repstat && entrycnt == 0) nd->nd_repstat = NFSERR_TOOSMALL; if (nd->nd_repstat) { newnfs_trimtrailing(nd, mb0, bpos0); if (nd->nd_flag & ND_NFSV3) nfsrv_postopattr(nd, getret, &at); } else newnfs_trimtrailing(nd, mb1, bpos1); eofflag = 0; } else if (cpos < cend) eofflag = 0; if (!nd->nd_repstat) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = newnfs_false; if (eofflag) *tl = newnfs_true; else *tl = newnfs_false; } FREE((caddr_t)cookies, M_TEMP); FREE((caddr_t)rbuf, M_TEMP); out: NFSEXITCODE2(0, nd); return (0); nfsmout: vput(vp); NFSEXITCODE2(error, nd); return (error); } /* * Get the settable attributes out of the mbuf list. * (Return 0 or EBADRPC) */ int nfsrv_sattr(struct nfsrv_descript *nd, struct nfsvattr *nvap, nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p) { u_int32_t *tl; struct nfsv2_sattr *sp; int error = 0, toclient = 0; switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) { case ND_NFSV2: NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR); /* * Some old clients didn't fill in the high order 16bits. * --> check the low order 2 bytes for 0xffff */ if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff) nvap->na_mode = nfstov_mode(sp->sa_mode); if (sp->sa_uid != newnfs_xdrneg1) nvap->na_uid = fxdr_unsigned(uid_t, sp->sa_uid); if (sp->sa_gid != newnfs_xdrneg1) nvap->na_gid = fxdr_unsigned(gid_t, sp->sa_gid); if (sp->sa_size != newnfs_xdrneg1) nvap->na_size = fxdr_unsigned(u_quad_t, sp->sa_size); if (sp->sa_atime.nfsv2_sec != newnfs_xdrneg1) { #ifdef notyet fxdr_nfsv2time(&sp->sa_atime, &nvap->na_atime); #else nvap->na_atime.tv_sec = fxdr_unsigned(u_int32_t,sp->sa_atime.nfsv2_sec); nvap->na_atime.tv_nsec = 0; #endif } if (sp->sa_mtime.nfsv2_sec != newnfs_xdrneg1) fxdr_nfsv2time(&sp->sa_mtime, &nvap->na_mtime); break; case ND_NFSV3: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (*tl == newnfs_true) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); nvap->na_mode = nfstov_mode(*tl); } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (*tl == newnfs_true) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); nvap->na_uid = fxdr_unsigned(uid_t, *tl); } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (*tl == newnfs_true) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); nvap->na_gid = fxdr_unsigned(gid_t, *tl); } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (*tl == newnfs_true) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); nvap->na_size = fxdr_hyper(tl); } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); switch (fxdr_unsigned(int, *tl)) { case NFSV3SATTRTIME_TOCLIENT: NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); fxdr_nfsv3time(tl, &nvap->na_atime); toclient = 1; break; case NFSV3SATTRTIME_TOSERVER: vfs_timestamp(&nvap->na_atime); nvap->na_vaflags |= VA_UTIMES_NULL; break; }; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); switch (fxdr_unsigned(int, *tl)) { case NFSV3SATTRTIME_TOCLIENT: NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); fxdr_nfsv3time(tl, &nvap->na_mtime); nvap->na_vaflags &= ~VA_UTIMES_NULL; break; case NFSV3SATTRTIME_TOSERVER: vfs_timestamp(&nvap->na_mtime); if (!toclient) nvap->na_vaflags |= VA_UTIMES_NULL; break; }; break; case ND_NFSV4: error = nfsv4_sattr(nd, nvap, attrbitp, aclp, p); }; nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * Handle the setable attributes for V4. * Returns NFSERR_BADXDR if it can't be parsed, 0 otherwise. */ int nfsv4_sattr(struct nfsrv_descript *nd, struct nfsvattr *nvap, nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p) { u_int32_t *tl; int attrsum = 0; int i, j; int error, attrsize, bitpos, aclsize, aceerr, retnotsup = 0; int toclient = 0; u_char *cp, namestr[NFSV4_SMALLSTR + 1]; uid_t uid; gid_t gid; error = nfsrv_getattrbits(nd, attrbitp, NULL, &retnotsup); if (error) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); attrsize = fxdr_unsigned(int, *tl); /* * Loop around getting the setable attributes. If an unsupported * one is found, set nd_repstat == NFSERR_ATTRNOTSUPP and return. */ if (retnotsup) { nd->nd_repstat = NFSERR_ATTRNOTSUPP; bitpos = NFSATTRBIT_MAX; } else { bitpos = 0; } for (; bitpos < NFSATTRBIT_MAX; bitpos++) { if (attrsum > attrsize) { error = NFSERR_BADXDR; goto nfsmout; } if (NFSISSET_ATTRBIT(attrbitp, bitpos)) switch (bitpos) { case NFSATTRBIT_SIZE: NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); nvap->na_size = fxdr_hyper(tl); attrsum += NFSX_HYPER; break; case NFSATTRBIT_ACL: error = nfsrv_dissectacl(nd, aclp, &aceerr, &aclsize, p); if (error) goto nfsmout; if (aceerr && !nd->nd_repstat) nd->nd_repstat = aceerr; attrsum += aclsize; break; case NFSATTRBIT_ARCHIVE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (!nd->nd_repstat) nd->nd_repstat = NFSERR_ATTRNOTSUPP; attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_HIDDEN: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (!nd->nd_repstat) nd->nd_repstat = NFSERR_ATTRNOTSUPP; attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_MIMETYPE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); i = fxdr_unsigned(int, *tl); error = nfsm_advance(nd, NFSM_RNDUP(i), -1); if (error) goto nfsmout; if (!nd->nd_repstat) nd->nd_repstat = NFSERR_ATTRNOTSUPP; attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i)); break; case NFSATTRBIT_MODE: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); nvap->na_mode = nfstov_mode(*tl); attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_OWNER: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); j = fxdr_unsigned(int, *tl); if (j < 0) { error = NFSERR_BADXDR; goto nfsmout; } if (j > NFSV4_SMALLSTR) cp = malloc(j + 1, M_NFSSTRING, M_WAITOK); else cp = namestr; error = nfsrv_mtostr(nd, cp, j); if (error) { if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); goto nfsmout; } if (!nd->nd_repstat) { nd->nd_repstat = nfsv4_strtouid(nd, cp, j, &uid, p); if (!nd->nd_repstat) nvap->na_uid = uid; } if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j)); break; case NFSATTRBIT_OWNERGROUP: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); j = fxdr_unsigned(int, *tl); if (j < 0) { error = NFSERR_BADXDR; goto nfsmout; } if (j > NFSV4_SMALLSTR) cp = malloc(j + 1, M_NFSSTRING, M_WAITOK); else cp = namestr; error = nfsrv_mtostr(nd, cp, j); if (error) { if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); goto nfsmout; } if (!nd->nd_repstat) { nd->nd_repstat = nfsv4_strtogid(nd, cp, j, &gid, p); if (!nd->nd_repstat) nvap->na_gid = gid; } if (j > NFSV4_SMALLSTR) free(cp, M_NFSSTRING); attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j)); break; case NFSATTRBIT_SYSTEM: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (!nd->nd_repstat) nd->nd_repstat = NFSERR_ATTRNOTSUPP; attrsum += NFSX_UNSIGNED; break; case NFSATTRBIT_TIMEACCESSSET: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); attrsum += NFSX_UNSIGNED; if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); fxdr_nfsv4time(tl, &nvap->na_atime); toclient = 1; attrsum += NFSX_V4TIME; } else { vfs_timestamp(&nvap->na_atime); nvap->na_vaflags |= VA_UTIMES_NULL; } break; case NFSATTRBIT_TIMEBACKUP: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); if (!nd->nd_repstat) nd->nd_repstat = NFSERR_ATTRNOTSUPP; attrsum += NFSX_V4TIME; break; case NFSATTRBIT_TIMECREATE: NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); if (!nd->nd_repstat) nd->nd_repstat = NFSERR_ATTRNOTSUPP; attrsum += NFSX_V4TIME; break; case NFSATTRBIT_TIMEMODIFYSET: NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); attrsum += NFSX_UNSIGNED; if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME); fxdr_nfsv4time(tl, &nvap->na_mtime); nvap->na_vaflags &= ~VA_UTIMES_NULL; attrsum += NFSX_V4TIME; } else { vfs_timestamp(&nvap->na_mtime); if (!toclient) nvap->na_vaflags |= VA_UTIMES_NULL; } break; default: nd->nd_repstat = NFSERR_ATTRNOTSUPP; /* * set bitpos so we drop out of the loop. */ bitpos = NFSATTRBIT_MAX; break; }; } /* * some clients pad the attrlist, so we need to skip over the * padding. */ if (attrsum > attrsize) { error = NFSERR_BADXDR; } else { attrsize = NFSM_RNDUP(attrsize); if (attrsum < attrsize) error = nfsm_advance(nd, attrsize - attrsum, -1); } nfsmout: NFSEXITCODE2(error, nd); return (error); } /* * Check/setup export credentials. */ int nfsd_excred(struct nfsrv_descript *nd, struct nfsexstuff *exp, struct ucred *credanon) { int error = 0; /* * Check/setup credentials. */ if (nd->nd_flag & ND_GSS) exp->nes_exflag &= ~MNT_EXPORTANON; /* * Check to see if the operation is allowed for this security flavor. * RFC2623 suggests that the NFSv3 Fsinfo RPC be allowed to * AUTH_NONE or AUTH_SYS for file systems requiring RPCSEC_GSS. * Also, allow Secinfo, so that it can acquire the correct flavor(s). */ if (nfsvno_testexp(nd, exp) && nd->nd_procnum != NFSV4OP_SECINFO && nd->nd_procnum != NFSPROC_FSINFO) { if (nd->nd_flag & ND_NFSV4) error = NFSERR_WRONGSEC; else error = (NFSERR_AUTHERR | AUTH_TOOWEAK); goto out; } /* * Check to see if the file system is exported V4 only. */ if (NFSVNO_EXV4ONLY(exp) && !(nd->nd_flag & ND_NFSV4)) { error = NFSERR_PROGNOTV4; goto out; } /* * Now, map the user credentials. * (Note that ND_AUTHNONE will only be set for an NFSv3 * Fsinfo RPC. If set for anything else, this code might need * to change.) */ if (NFSVNO_EXPORTED(exp) && ((!(nd->nd_flag & ND_GSS) && nd->nd_cred->cr_uid == 0) || NFSVNO_EXPORTANON(exp) || (nd->nd_flag & ND_AUTHNONE))) { nd->nd_cred->cr_uid = credanon->cr_uid; nd->nd_cred->cr_gid = credanon->cr_gid; crsetgroups(nd->nd_cred, credanon->cr_ngroups, credanon->cr_groups); } out: NFSEXITCODE2(error, nd); return (error); } /* * Check exports. */ int nfsvno_checkexp(struct mount *mp, struct sockaddr *nam, struct nfsexstuff *exp, struct ucred **credp) { int i, error, *secflavors; error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp, &exp->nes_numsecflavor, &secflavors); if (error) { if (nfs_rootfhset) { exp->nes_exflag = 0; exp->nes_numsecflavor = 0; error = 0; } } else { /* Copy the security flavors. */ for (i = 0; i < exp->nes_numsecflavor; i++) exp->nes_secflavors[i] = secflavors[i]; } NFSEXITCODE(error); return (error); } /* * Get a vnode for a file handle and export stuff. */ int nfsvno_fhtovp(struct mount *mp, fhandle_t *fhp, struct sockaddr *nam, int lktype, struct vnode **vpp, struct nfsexstuff *exp, struct ucred **credp) { int i, error, *secflavors; *credp = NULL; exp->nes_numsecflavor = 0; error = VFS_FHTOVP(mp, &fhp->fh_fid, lktype, vpp); if (error != 0) /* Make sure the server replies ESTALE to the client. */ error = ESTALE; if (nam && !error) { error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp, &exp->nes_numsecflavor, &secflavors); if (error) { if (nfs_rootfhset) { exp->nes_exflag = 0; exp->nes_numsecflavor = 0; error = 0; } else { vput(*vpp); } } else { /* Copy the security flavors. */ for (i = 0; i < exp->nes_numsecflavor; i++) exp->nes_secflavors[i] = secflavors[i]; } } NFSEXITCODE(error); return (error); } /* * nfsd_fhtovp() - convert a fh to a vnode ptr * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling nfsvno_fhtovp() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * for AUTH_SYS * - if mpp != NULL, return the mount point so that it can * be used for vn_finished_write() by the caller */ void nfsd_fhtovp(struct nfsrv_descript *nd, struct nfsrvfh *nfp, int lktype, struct vnode **vpp, struct nfsexstuff *exp, struct mount **mpp, int startwrite, struct thread *p) { struct mount *mp; struct ucred *credanon; fhandle_t *fhp; fhp = (fhandle_t *)nfp->nfsrvfh_data; /* * Check for the special case of the nfsv4root_fh. */ mp = vfs_busyfs(&fhp->fh_fsid); if (mpp != NULL) *mpp = mp; if (mp == NULL) { *vpp = NULL; nd->nd_repstat = ESTALE; goto out; } if (startwrite) { vn_start_write(NULL, mpp, V_WAIT); if (lktype == LK_SHARED && !(MNT_SHARED_WRITES(mp))) lktype = LK_EXCLUSIVE; } nd->nd_repstat = nfsvno_fhtovp(mp, fhp, nd->nd_nam, lktype, vpp, exp, &credanon); vfs_unbusy(mp); /* * For NFSv4 without a pseudo root fs, unexported file handles * can be returned, so that Lookup works everywhere. */ if (!nd->nd_repstat && exp->nes_exflag == 0 && !(nd->nd_flag & ND_NFSV4)) { vput(*vpp); nd->nd_repstat = EACCES; } /* * Personally, I've never seen any point in requiring a * reserved port#, since only in the rare case where the * clients are all boxes with secure system priviledges, * does it provide any enhanced security, but... some people * believe it to be useful and keep putting this code back in. * (There is also some "security checker" out there that * complains if the nfs server doesn't enforce this.) * However, note the following: * RFC3530 (NFSv4) specifies that a reserved port# not be * required. * RFC2623 recommends that, if a reserved port# is checked for, * that there be a way to turn that off--> ifdef'd. */ #ifdef NFS_REQRSVPORT if (!nd->nd_repstat) { struct sockaddr_in *saddr; struct sockaddr_in6 *saddr6; saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *); saddr6 = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in6 *); if (!(nd->nd_flag & ND_NFSV4) && ((saddr->sin_family == AF_INET && ntohs(saddr->sin_port) >= IPPORT_RESERVED) || (saddr6->sin6_family == AF_INET6 && ntohs(saddr6->sin6_port) >= IPPORT_RESERVED))) { vput(*vpp); nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK); } } #endif /* NFS_REQRSVPORT */ /* * Check/setup credentials. */ if (!nd->nd_repstat) { nd->nd_saveduid = nd->nd_cred->cr_uid; nd->nd_repstat = nfsd_excred(nd, exp, credanon); if (nd->nd_repstat) vput(*vpp); } if (credanon != NULL) crfree(credanon); if (nd->nd_repstat) { if (startwrite) vn_finished_write(mp); *vpp = NULL; if (mpp != NULL) *mpp = NULL; } out: NFSEXITCODE2(0, nd); } /* * glue for fp. */ static int fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp) { struct filedesc *fdp; struct file *fp; int error = 0; fdp = p->td_proc->p_fd; if (fd < 0 || fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd].fde_file) == NULL) { error = EBADF; goto out; } *fpp = fp; out: NFSEXITCODE(error); return (error); } /* * Called from nfssvc() to update the exports list. Just call * vfs_export(). This has to be done, since the v4 root fake fs isn't * in the mount list. */ int nfsrv_v4rootexport(void *argp, struct ucred *cred, struct thread *p) { struct nfsex_args *nfsexargp = (struct nfsex_args *)argp; int error = 0; struct nameidata nd; fhandle_t fh; error = vfs_export(&nfsv4root_mnt, &nfsexargp->export); if ((nfsexargp->export.ex_flags & MNT_DELEXPORT) != 0) nfs_rootfhset = 0; else if (error == 0) { if (nfsexargp->fspec == NULL) { error = EPERM; goto out; } /* * If fspec != NULL, this is the v4root path. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, nfsexargp->fspec, p); if ((error = namei(&nd)) != 0) goto out; error = nfsvno_getfh(nd.ni_vp, &fh, p); vrele(nd.ni_vp); if (!error) { nfs_rootfh.nfsrvfh_len = NFSX_MYFH; NFSBCOPY((caddr_t)&fh, nfs_rootfh.nfsrvfh_data, sizeof (fhandle_t)); nfs_rootfhset = 1; } } out: NFSEXITCODE(error); return (error); } /* * This function needs to test to see if the system is near its limit * for memory allocation via malloc() or mget() and return True iff * either of these resources are near their limit. * XXX (For now, this is just a stub.) */ int nfsrv_testmalloclimit = 0; int nfsrv_mallocmget_limit(void) { static int printmesg = 0; static int testval = 1; if (nfsrv_testmalloclimit && (testval++ % 1000) == 0) { if ((printmesg++ % 100) == 0) printf("nfsd: malloc/mget near limit\n"); return (1); } return (0); } /* * BSD specific initialization of a mount point. */ void nfsd_mntinit(void) { static int inited = 0; if (inited) return; inited = 1; nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED); TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist); TAILQ_INIT(&nfsv4root_mnt.mnt_activevnodelist); nfsv4root_mnt.mnt_export = NULL; TAILQ_INIT(&nfsv4root_opt); TAILQ_INIT(&nfsv4root_newopt); nfsv4root_mnt.mnt_opt = &nfsv4root_opt; nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt; nfsv4root_mnt.mnt_nvnodelistsize = 0; nfsv4root_mnt.mnt_activevnodelistsize = 0; } /* * Get a vnode for a file handle, without checking exports, etc. */ struct vnode * nfsvno_getvp(fhandle_t *fhp) { struct mount *mp; struct vnode *vp; int error; mp = vfs_busyfs(&fhp->fh_fsid); if (mp == NULL) return (NULL); error = VFS_FHTOVP(mp, &fhp->fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error) return (NULL); return (vp); } /* * Do a local VOP_ADVLOCK(). */ int nfsvno_advlock(struct vnode *vp, int ftype, u_int64_t first, u_int64_t end, struct thread *td) { int error = 0; struct flock fl; u_int64_t tlen; if (nfsrv_dolocallocks == 0) goto out; ASSERT_VOP_UNLOCKED(vp, "nfsvno_advlock: vp locked"); fl.l_whence = SEEK_SET; fl.l_type = ftype; fl.l_start = (off_t)first; if (end == NFS64BITSSET) { fl.l_len = 0; } else { tlen = end - first; fl.l_len = (off_t)tlen; } /* * For FreeBSD8, the l_pid and l_sysid must be set to the same * values for all calls, so that all locks will be held by the * nfsd server. (The nfsd server handles conflicts between the * various clients.) * Since an NFSv4 lockowner is a ClientID plus an array of up to 1024 * bytes, so it can't be put in l_sysid. */ if (nfsv4_sysid == 0) nfsv4_sysid = nlm_acquire_next_sysid(); fl.l_pid = (pid_t)0; fl.l_sysid = (int)nfsv4_sysid; if (ftype == F_UNLCK) error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_UNLCK, &fl, (F_POSIX | F_REMOTE)); else error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_SETLK, &fl, (F_POSIX | F_REMOTE)); out: NFSEXITCODE(error); return (error); } /* * Check the nfsv4 root exports. */ int nfsvno_v4rootexport(struct nfsrv_descript *nd) { struct ucred *credanon; int exflags, error = 0, numsecflavor, *secflavors, i; error = vfs_stdcheckexp(&nfsv4root_mnt, nd->nd_nam, &exflags, &credanon, &numsecflavor, &secflavors); if (error) { error = NFSERR_PROGUNAVAIL; goto out; } if (credanon != NULL) crfree(credanon); for (i = 0; i < numsecflavor; i++) { if (secflavors[i] == AUTH_SYS) nd->nd_flag |= ND_EXAUTHSYS; else if (secflavors[i] == RPCSEC_GSS_KRB5) nd->nd_flag |= ND_EXGSS; else if (secflavors[i] == RPCSEC_GSS_KRB5I) nd->nd_flag |= ND_EXGSSINTEGRITY; else if (secflavors[i] == RPCSEC_GSS_KRB5P) nd->nd_flag |= ND_EXGSSPRIVACY; } out: NFSEXITCODE(error); return (error); } /* * Nfs server psuedo system call for the nfsd's */ /* * MPSAFE */ static int nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap) { struct file *fp; struct nfsd_addsock_args sockarg; struct nfsd_nfsd_args nfsdarg; cap_rights_t rights; int error; if (uap->flag & NFSSVC_NFSDADDSOCK) { error = copyin(uap->argp, (caddr_t)&sockarg, sizeof (sockarg)); if (error) goto out; /* * Since we don't know what rights might be required, * pretend that we need them all. It is better to be too * careful than too reckless. */ error = fget(td, sockarg.sock, cap_rights_init(&rights, CAP_SOCK_SERVER), &fp); if (error != 0) goto out; if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); error = EPERM; goto out; } error = nfsrvd_addsock(fp); fdrop(fp, td); } else if (uap->flag & NFSSVC_NFSDNFSD) { if (uap->argp == NULL) { error = EINVAL; goto out; } error = copyin(uap->argp, (caddr_t)&nfsdarg, sizeof (nfsdarg)); if (error) goto out; error = nfsrvd_nfsd(td, &nfsdarg); } else { error = nfssvc_srvcall(td, uap, td->td_ucred); } out: NFSEXITCODE(error); return (error); } static int nfssvc_srvcall(struct thread *p, struct nfssvc_args *uap, struct ucred *cred) { struct nfsex_args export; struct file *fp = NULL; int stablefd, len; struct nfsd_clid adminrevoke; struct nfsd_dumplist dumplist; struct nfsd_dumpclients *dumpclients; struct nfsd_dumplocklist dumplocklist; struct nfsd_dumplocks *dumplocks; struct nameidata nd; vnode_t vp; int error = EINVAL, igotlock; struct proc *procp; static int suspend_nfsd = 0; if (uap->flag & NFSSVC_PUBLICFH) { NFSBZERO((caddr_t)&nfs_pubfh.nfsrvfh_data, sizeof (fhandle_t)); error = copyin(uap->argp, &nfs_pubfh.nfsrvfh_data, sizeof (fhandle_t)); if (!error) nfs_pubfhset = 1; } else if (uap->flag & NFSSVC_V4ROOTEXPORT) { error = copyin(uap->argp,(caddr_t)&export, sizeof (struct nfsex_args)); if (!error) error = nfsrv_v4rootexport(&export, cred, p); } else if (uap->flag & NFSSVC_NOPUBLICFH) { nfs_pubfhset = 0; error = 0; } else if (uap->flag & NFSSVC_STABLERESTART) { error = copyin(uap->argp, (caddr_t)&stablefd, sizeof (int)); if (!error) error = fp_getfvp(p, stablefd, &fp, &vp); if (!error && (NFSFPFLAG(fp) & (FREAD | FWRITE)) != (FREAD | FWRITE)) error = EBADF; if (!error && newnfs_numnfsd != 0) error = EPERM; if (!error) { nfsrv_stablefirst.nsf_fp = fp; nfsrv_setupstable(p); } } else if (uap->flag & NFSSVC_ADMINREVOKE) { error = copyin(uap->argp, (caddr_t)&adminrevoke, sizeof (struct nfsd_clid)); if (!error) error = nfsrv_adminrevoke(&adminrevoke, p); } else if (uap->flag & NFSSVC_DUMPCLIENTS) { error = copyin(uap->argp, (caddr_t)&dumplist, sizeof (struct nfsd_dumplist)); if (!error && (dumplist.ndl_size < 1 || dumplist.ndl_size > NFSRV_MAXDUMPLIST)) error = EPERM; if (!error) { len = sizeof (struct nfsd_dumpclients) * dumplist.ndl_size; dumpclients = (struct nfsd_dumpclients *)malloc(len, M_TEMP, M_WAITOK); nfsrv_dumpclients(dumpclients, dumplist.ndl_size); error = copyout(dumpclients, CAST_USER_ADDR_T(dumplist.ndl_list), len); free((caddr_t)dumpclients, M_TEMP); } } else if (uap->flag & NFSSVC_DUMPLOCKS) { error = copyin(uap->argp, (caddr_t)&dumplocklist, sizeof (struct nfsd_dumplocklist)); if (!error && (dumplocklist.ndllck_size < 1 || dumplocklist.ndllck_size > NFSRV_MAXDUMPLIST)) error = EPERM; if (!error) error = nfsrv_lookupfilename(&nd, dumplocklist.ndllck_fname, p); if (!error) { len = sizeof (struct nfsd_dumplocks) * dumplocklist.ndllck_size; dumplocks = (struct nfsd_dumplocks *)malloc(len, M_TEMP, M_WAITOK); nfsrv_dumplocks(nd.ni_vp, dumplocks, dumplocklist.ndllck_size, p); vput(nd.ni_vp); error = copyout(dumplocks, CAST_USER_ADDR_T(dumplocklist.ndllck_list), len); free((caddr_t)dumplocks, M_TEMP); } } else if (uap->flag & NFSSVC_BACKUPSTABLE) { procp = p->td_proc; PROC_LOCK(procp); nfsd_master_pid = procp->p_pid; bcopy(procp->p_comm, nfsd_master_comm, MAXCOMLEN + 1); nfsd_master_start = procp->p_stats->p_start; nfsd_master_proc = procp; PROC_UNLOCK(procp); } else if ((uap->flag & NFSSVC_SUSPENDNFSD) != 0) { NFSLOCKV4ROOTMUTEX(); if (suspend_nfsd == 0) { /* Lock out all nfsd threads */ do { igotlock = nfsv4_lock(&nfsd_suspend_lock, 1, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL); } while (igotlock == 0 && suspend_nfsd == 0); suspend_nfsd = 1; } NFSUNLOCKV4ROOTMUTEX(); error = 0; } else if ((uap->flag & NFSSVC_RESUMENFSD) != 0) { NFSLOCKV4ROOTMUTEX(); if (suspend_nfsd != 0) { nfsv4_unlock(&nfsd_suspend_lock, 0); suspend_nfsd = 0; } NFSUNLOCKV4ROOTMUTEX(); error = 0; } NFSEXITCODE(error); return (error); } /* * Check exports. * Returns 0 if ok, 1 otherwise. */ int nfsvno_testexp(struct nfsrv_descript *nd, struct nfsexstuff *exp) { int i; /* * This seems odd, but allow the case where the security flavor * list is empty. This happens when NFSv4 is traversing non-exported * file systems. Exported file systems should always have a non-empty * security flavor list. */ if (exp->nes_numsecflavor == 0) return (0); for (i = 0; i < exp->nes_numsecflavor; i++) { /* * The tests for privacy and integrity must be first, * since ND_GSS is set for everything but AUTH_SYS. */ if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5P && (nd->nd_flag & ND_GSSPRIVACY)) return (0); if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5I && (nd->nd_flag & ND_GSSINTEGRITY)) return (0); if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5 && (nd->nd_flag & ND_GSS)) return (0); if (exp->nes_secflavors[i] == AUTH_SYS && (nd->nd_flag & ND_GSS) == 0) return (0); } return (1); } /* * Calculate a hash value for the fid in a file handle. */ uint32_t nfsrv_hashfh(fhandle_t *fhp) { uint32_t hashval; hashval = hash32_buf(&fhp->fh_fid, sizeof(struct fid), 0); return (hashval); } /* * Calculate a hash value for the sessionid. */ uint32_t nfsrv_hashsessionid(uint8_t *sessionid) { uint32_t hashval; hashval = hash32_buf(sessionid, NFSX_V4SESSIONID, 0); return (hashval); } /* * Signal the userland master nfsd to backup the stable restart file. */ void nfsrv_backupstable(void) { struct proc *procp; if (nfsd_master_proc != NULL) { procp = pfind(nfsd_master_pid); /* Try to make sure it is the correct process. */ if (procp == nfsd_master_proc && procp->p_stats->p_start.tv_sec == nfsd_master_start.tv_sec && procp->p_stats->p_start.tv_usec == nfsd_master_start.tv_usec && strcmp(procp->p_comm, nfsd_master_comm) == 0) kern_psignal(procp, SIGUSR2); else nfsd_master_proc = NULL; if (procp != NULL) PROC_UNLOCK(procp); } } extern int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *); /* * Called once to initialize data structures... */ static int nfsd_modevent(module_t mod, int type, void *data) { int error = 0, i; static int loaded = 0; switch (type) { case MOD_LOAD: if (loaded) goto out; newnfs_portinit(); for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) { mtx_init(&nfsrchash_table[i].mtx, "nfsrtc", NULL, MTX_DEF); mtx_init(&nfsrcahash_table[i].mtx, "nfsrtca", NULL, MTX_DEF); } mtx_init(&nfsrc_udpmtx, "nfsuc", NULL, MTX_DEF); mtx_init(&nfs_v4root_mutex, "nfs4rt", NULL, MTX_DEF); mtx_init(&nfsv4root_mnt.mnt_mtx, "nfs4mnt", NULL, MTX_DEF); for (i = 0; i < NFSSESSIONHASHSIZE; i++) mtx_init(&nfssessionhash[i].mtx, "nfssm", NULL, MTX_DEF); lockinit(&nfsv4root_mnt.mnt_explock, PVFS, "explock", 0, 0); nfsrvd_initcache(); nfsd_init(); NFSD_LOCK(); nfsrvd_init(0); NFSD_UNLOCK(); nfsd_mntinit(); #ifdef VV_DISABLEDELEG vn_deleg_ops.vndeleg_recall = nfsd_recalldelegation; vn_deleg_ops.vndeleg_disable = nfsd_disabledelegation; #endif nfsd_call_servertimer = nfsrv_servertimer; nfsd_call_nfsd = nfssvc_nfsd; loaded = 1; break; case MOD_UNLOAD: if (newnfs_numnfsd != 0) { error = EBUSY; break; } #ifdef VV_DISABLEDELEG vn_deleg_ops.vndeleg_recall = NULL; vn_deleg_ops.vndeleg_disable = NULL; #endif nfsd_call_servertimer = NULL; nfsd_call_nfsd = NULL; /* Clean out all NFSv4 state. */ nfsrv_throwawayallstate(curthread); /* Clean the NFS server reply cache */ nfsrvd_cleancache(); /* Free up the krpc server pool. */ if (nfsrvd_pool != NULL) svcpool_destroy(nfsrvd_pool); /* and get rid of the locks */ for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) { mtx_destroy(&nfsrchash_table[i].mtx); mtx_destroy(&nfsrcahash_table[i].mtx); } mtx_destroy(&nfsrc_udpmtx); mtx_destroy(&nfs_v4root_mutex); mtx_destroy(&nfsv4root_mnt.mnt_mtx); for (i = 0; i < NFSSESSIONHASHSIZE; i++) mtx_destroy(&nfssessionhash[i].mtx); lockdestroy(&nfsv4root_mnt.mnt_explock); loaded = 0; break; default: error = EOPNOTSUPP; break; } out: NFSEXITCODE(error); return (error); } static moduledata_t nfsd_mod = { "nfsd", nfsd_modevent, NULL, }; DECLARE_MODULE(nfsd, nfsd_mod, SI_SUB_VFS, SI_ORDER_ANY); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_VERSION(nfsd, 1); MODULE_DEPEND(nfsd, nfscommon, 1, 1, 1); MODULE_DEPEND(nfsd, nfslock, 1, 1, 1); MODULE_DEPEND(nfsd, nfslockd, 1, 1, 1); MODULE_DEPEND(nfsd, krpc, 1, 1, 1); MODULE_DEPEND(nfsd, nfssvc, 1, 1, 1); Index: stable/10/sys/fs/nullfs/null_vfsops.c =================================================================== --- stable/10/sys/fs/nullfs/null_vfsops.c (revision 282269) +++ stable/10/sys/fs/nullfs/null_vfsops.c (revision 282270) @@ -1,454 +1,456 @@ /*- * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)null_vfsops.c 8.2 (Berkeley) 1/21/94 * * @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92 * $FreeBSD$ */ /* * Null Layer * (See null_vnops.c for a description of what this does.) */ #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NULLFSMNT, "nullfs_mount", "NULLFS mount structure"); static vfs_fhtovp_t nullfs_fhtovp; static vfs_mount_t nullfs_mount; static vfs_quotactl_t nullfs_quotactl; static vfs_root_t nullfs_root; static vfs_sync_t nullfs_sync; static vfs_statfs_t nullfs_statfs; static vfs_unmount_t nullfs_unmount; static vfs_vget_t nullfs_vget; static vfs_extattrctl_t nullfs_extattrctl; /* * Mount null layer */ static int nullfs_mount(struct mount *mp) { int error = 0; struct vnode *lowerrootvp, *vp; struct vnode *nullm_rootvp; struct null_mount *xmp; struct thread *td = curthread; char *target; int isvnunlocked = 0, len; struct nameidata nd, *ndp = &nd; NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp); if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_NULLFS)) return (EPERM); if (mp->mnt_flag & MNT_ROOTFS) return (EOPNOTSUPP); /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) { /* * Only support update mounts for NFS export. */ if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) return (0); else return (EOPNOTSUPP); } /* * Get argument */ error = vfs_getopt(mp->mnt_optnew, "target", (void **)&target, &len); if (error || target[len - 1] != '\0') return (EINVAL); /* * Unlock lower node to avoid possible deadlock. */ if ((mp->mnt_vnodecovered->v_op == &null_vnodeops) && VOP_ISLOCKED(mp->mnt_vnodecovered) == LK_EXCLUSIVE) { VOP_UNLOCK(mp->mnt_vnodecovered, 0); isvnunlocked = 1; } /* * Find lower node */ NDINIT(ndp, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, target, curthread); error = namei(ndp); /* * Re-lock vnode. * XXXKIB This is deadlock-prone as well. */ if (isvnunlocked) vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); /* * Sanity check on lower vnode */ lowerrootvp = ndp->ni_vp; /* * Check multi null mount to avoid `lock against myself' panic. */ if (lowerrootvp == VTONULL(mp->mnt_vnodecovered)->null_lowervp) { NULLFSDEBUG("nullfs_mount: multi null mount?\n"); vput(lowerrootvp); return (EDEADLK); } xmp = (struct null_mount *) malloc(sizeof(struct null_mount), M_NULLFSMNT, M_WAITOK | M_ZERO); /* * Save reference to underlying FS */ xmp->nullm_vfs = lowerrootvp->v_mount; /* * Save reference. Each mount also holds * a reference on the root vnode. */ error = null_nodeget(mp, lowerrootvp, &vp); /* * Make sure the node alias worked */ if (error) { free(xmp, M_NULLFSMNT); return (error); } /* * Keep a held reference to the root vnode. * It is vrele'd in nullfs_unmount. */ nullm_rootvp = vp; nullm_rootvp->v_vflag |= VV_ROOT; xmp->nullm_rootvp = nullm_rootvp; /* * Unlock the node (either the lower or the alias) */ VOP_UNLOCK(vp, 0); if (NULLVPTOLOWERVP(nullm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); } xmp->nullm_flags |= NULLM_CACHE; if (vfs_getopt(mp->mnt_optnew, "nocache", NULL, NULL) == 0) xmp->nullm_flags &= ~NULLM_CACHE; MNT_ILOCK(mp); if ((xmp->nullm_flags & NULLM_CACHE) != 0) { mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag & (MNTK_SHARED_WRITES | MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED); } mp->mnt_kern_flag |= MNTK_LOOKUP_EXCL_DOTDOT; + mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag & + MNTK_USES_BCACHE; MNT_IUNLOCK(mp); mp->mnt_data = xmp; vfs_getnewfsid(mp); if ((xmp->nullm_flags & NULLM_CACHE) != 0) { MNT_ILOCK(xmp->nullm_vfs); TAILQ_INSERT_TAIL(&xmp->nullm_vfs->mnt_uppers, mp, mnt_upper_link); MNT_IUNLOCK(xmp->nullm_vfs); } vfs_mountedfrom(mp, target); NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n", mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); return (0); } /* * Free reference to null layer */ static int nullfs_unmount(mp, mntflags) struct mount *mp; int mntflags; { struct null_mount *mntdata; struct mount *ump; int error, flags; NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp); if (mntflags & MNT_FORCE) flags = FORCECLOSE; else flags = 0; /* There is 1 extra root vnode reference (nullm_rootvp). */ error = vflush(mp, 1, flags, curthread); if (error) return (error); /* * Finally, throw away the null_mount structure */ mntdata = mp->mnt_data; ump = mntdata->nullm_vfs; if ((mntdata->nullm_flags & NULLM_CACHE) != 0) { MNT_ILOCK(ump); while ((ump->mnt_kern_flag & MNTK_VGONE_UPPER) != 0) { ump->mnt_kern_flag |= MNTK_VGONE_WAITER; msleep(&ump->mnt_uppers, &ump->mnt_mtx, 0, "vgnupw", 0); } TAILQ_REMOVE(&ump->mnt_uppers, mp, mnt_upper_link); MNT_IUNLOCK(ump); } mp->mnt_data = NULL; free(mntdata, M_NULLFSMNT); return (0); } static int nullfs_root(mp, flags, vpp) struct mount *mp; int flags; struct vnode **vpp; { struct vnode *vp; NULLFSDEBUG("nullfs_root(mp = %p, vp = %p->%p)\n", (void *)mp, (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp, (void *)NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp)); /* * Return locked reference to root. */ vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; VREF(vp); ASSERT_VOP_UNLOCKED(vp, "root vnode is locked"); vn_lock(vp, flags | LK_RETRY); *vpp = vp; return 0; } static int nullfs_quotactl(mp, cmd, uid, arg) struct mount *mp; int cmd; uid_t uid; void *arg; { return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg); } static int nullfs_statfs(mp, sbp) struct mount *mp; struct statfs *sbp; { int error; struct statfs mstat; NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p->%p)\n", (void *)mp, (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp, (void *)NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp)); bzero(&mstat, sizeof(mstat)); error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat); if (error) return (error); /* now copy across the "interesting" information and fake the rest */ sbp->f_type = mstat.f_type; sbp->f_flags = (sbp->f_flags & (MNT_RDONLY | MNT_NOEXEC | MNT_NOSUID | MNT_UNION | MNT_NOSYMFOLLOW)) | (mstat.f_flags & ~MNT_ROOTFS); sbp->f_bsize = mstat.f_bsize; sbp->f_iosize = mstat.f_iosize; sbp->f_blocks = mstat.f_blocks; sbp->f_bfree = mstat.f_bfree; sbp->f_bavail = mstat.f_bavail; sbp->f_files = mstat.f_files; sbp->f_ffree = mstat.f_ffree; return (0); } static int nullfs_sync(mp, waitfor) struct mount *mp; int waitfor; { /* * XXX - Assumes no data cached at null layer. */ return (0); } static int nullfs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { int error; KASSERT((flags & LK_TYPE_MASK) != 0, ("nullfs_vget: no lock requested")); error = VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, flags, vpp); if (error != 0) return (error); return (null_nodeget(mp, *vpp, vpp)); } static int nullfs_fhtovp(mp, fidp, flags, vpp) struct mount *mp; struct fid *fidp; int flags; struct vnode **vpp; { int error; error = VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, flags, vpp); if (error != 0) return (error); return (null_nodeget(mp, *vpp, vpp)); } static int nullfs_extattrctl(mp, cmd, filename_vp, namespace, attrname) struct mount *mp; int cmd; struct vnode *filename_vp; int namespace; const char *attrname; { return (VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, filename_vp, namespace, attrname)); } static void nullfs_reclaim_lowervp(struct mount *mp, struct vnode *lowervp) { struct vnode *vp; vp = null_hashget(mp, lowervp); if (vp == NULL) return; VTONULL(vp)->null_flags |= NULLV_NOUNLOCK; vgone(vp); vput(vp); } static void nullfs_unlink_lowervp(struct mount *mp, struct vnode *lowervp) { struct vnode *vp; struct null_node *xp; vp = null_hashget(mp, lowervp); if (vp == NULL) return; xp = VTONULL(vp); xp->null_flags |= NULLV_DROP | NULLV_NOUNLOCK; vhold(vp); vunref(vp); if (vp->v_usecount == 0) { /* * If vunref() dropped the last use reference on the * nullfs vnode, it must be reclaimed, and its lock * was split from the lower vnode lock. Need to do * extra unlock before allowing the final vdrop() to * free the vnode. */ KASSERT((vp->v_iflag & VI_DOOMED) != 0, ("not reclaimed nullfs vnode %p", vp)); VOP_UNLOCK(vp, 0); } else { /* * Otherwise, the nullfs vnode still shares the lock * with the lower vnode, and must not be unlocked. * Also clear the NULLV_NOUNLOCK, the flag is not * relevant for future reclamations. */ ASSERT_VOP_ELOCKED(vp, "unlink_lowervp"); KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("reclaimed nullfs vnode %p", vp)); xp->null_flags &= ~NULLV_NOUNLOCK; } vdrop(vp); } static struct vfsops null_vfsops = { .vfs_extattrctl = nullfs_extattrctl, .vfs_fhtovp = nullfs_fhtovp, .vfs_init = nullfs_init, .vfs_mount = nullfs_mount, .vfs_quotactl = nullfs_quotactl, .vfs_root = nullfs_root, .vfs_statfs = nullfs_statfs, .vfs_sync = nullfs_sync, .vfs_uninit = nullfs_uninit, .vfs_unmount = nullfs_unmount, .vfs_vget = nullfs_vget, .vfs_reclaim_lowervp = nullfs_reclaim_lowervp, .vfs_unlink_lowervp = nullfs_unlink_lowervp, }; VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK | VFCF_JAIL); Index: stable/10/sys/kern/vfs_subr.c =================================================================== --- stable/10/sys/kern/vfs_subr.c (revision 282269) +++ stable/10/sys/kern/vfs_subr.c (revision 282270) @@ -1,4880 +1,4881 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_incr_usecount(struct vnode *); static void v_decr_usecount(struct vnode *); static void v_decr_useonly(struct vnode *); static void v_upgrade_usecount(struct vnode *); static void vnlru_free(int); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_locked(void *arg); static void vfs_knl_assert_unlocked(void *arg); static void destroy_vpollinfo(struct vpollinfo *vi); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode. */ static unsigned long numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); static u_long vnodes_created; SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 0, "Number of vnodes created by getnewvnode"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Free vnode target. Free vnodes may simply be files which have been stat'd * but not read. This is somewhat common, and a small cache of such files * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of vnodes in the free list"); static int vlru_allow_cache_src; SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW, &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode"); static u_long recycles_count; SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0, "Number of vnodes recycled to avoid exceding kern.maxvnodes"); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "Number of calls to reassignbuf"); /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. */ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ static int vnsz2log; /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return uma_zalloc(buf_trie_zone, M_NOWAIT); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree(buf_trie_zone, node); } PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of physical pages to vnodes approaches sixteen to one. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16)) #endif static void vntblinit(void *dummy __unused) { u_int i; int physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to physical pages * is one to four until desiredvnodes exceeds 98,304. Thereafter, the * marginal ratio of desiredvnodes to physical pages is one to * sixteen. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * may not exceed one seventh of the kernel's heap size. */ physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4, cnt.v_page_count) / 16; virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) + sizeof(struct vnode))); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); uma_prealloc(buf_trie_zone, nbuf); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); for (i = 1; i <= sizeof(struct vnode); i <<= 1) vnsz2log++; vnsz2log--; } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * VOP_LOOKUP() obtains B while A is held * vfs_busy() obtains a shared lock on F while A and B are held * vput() releases lock on B * vput() releases lock on A * VFS_ROOT() obtains lock on D while shared lock on F is held * vfs_unbusy() releases shared lock on F * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. */ int vfs_busy(struct mount *mp, int flags) { MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); MNT_ILOCK(mp); MNT_REF(mp); /* * If mount point is currenly being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that mount mount it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REL(mp); KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref")); mp->mnt_lockref--; if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || mp->mnt_stat.f_fsid.val[0] != fsid->val[0] || mp->mnt_stat.f_fsid.val[1] != fsid->val[1]) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; /* * If the thread is jailed, but this is not a jail-friendly file * system, deny immediately. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred)) return (EPERM); /* * If the file system was mounted outside the jail of the calling * thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desireable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int vlrureclaim(struct mount *mp) { struct vnode *vp; int done; int trigger; int usevnodes; int count; /* * Calculate the trigger point, don't allow user * screwups to blow us up. This prevents us from * recycling vnodes with lots of resident pages. We * aren't trying to free memory, we are trying to * free vnodes. */ usevnodes = desiredvnodes; if (usevnodes <= 0) usevnodes = 1; trigger = cnt.v_page_count * 2 / usevnodes; done = 0; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize / 10 + 1; while (count != 0) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. */ if (vp->v_usecount || (!vlru_allow_cache_src && !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || (!vlru_allow_cache_src && !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK); vdrop(vp); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); atomic_add_long(&recycles_count, 1); vgonel(vp); VOP_UNLOCK(vp, 0); vdropl(vp); done++; next_iter_mntunlocked: if (!should_yield()) goto relock_mnt; goto yield; next_iter: if (!should_yield()) continue; MNT_IUNLOCK(mp); yield: kern_yield(PRI_USER); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } /* * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) { struct vnode *vp; mtx_assert(&vnode_free_list_mtx, MA_OWNED); for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (!vp) break; VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); KASSERT((vp->v_iflag & VI_FREE) != 0, ("Removing vnode not on freelist")); KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Mangling active vnode")); TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); /* * Don't recycle if we can't get the interlock. */ if (!VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); continue; } VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0, vp, ("vp inconsistent on freelist")); /* * The clear of VI_FREE prevents activation of the * vnode. There is no sense in putting the vnode on * the mount point active list, only to remove it * later during recycling. Inline the relevant part * of vholdl(), to avoid triggering assertions or * activating. */ freevnodes--; vp->v_iflag &= ~VI_FREE; vp->v_holdcnt++; mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vtryrecycle(vp); /* * If the recycled succeeded this vdrop will actually free * the vnode. If not it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; int done; struct proc *p = vnlruproc; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); for (;;) { kproc_suspend_check(p); mtx_lock(&vnode_free_list_mtx); if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } done += vlrureclaim(mp); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); if (done == 0) { #if 0 /* These messages are temporary debugging aids */ if (vnlru_nowhere < 5) printf("vnlru process getting nowhere..\n"); else if (vnlru_nowhere == 5) printf("vnlru process messages stopped.\n"); #endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else kern_yield(PRI_USER); } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) { atomic_add_long(&recycles_count, 1); vgonel(vp); } VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); return (0); } /* * Wait for available vnodes. */ static int getnewvnode_wait(int suspended) { mtx_assert(&vnode_free_list_mtx, MA_OWNED); if (numvnodes > desiredvnodes) { if (suspended) { /* * File system is beeing suspended, we cannot risk a * deadlock here, so allocate new vnode anyway. */ if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); return (0); } if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); } return (numvnodes > desiredvnodes ? ENFILE : 0); } void getnewvnode_reserve(u_int count) { struct thread *td; td = curthread; /* First try to be quick and racy. */ if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { td->td_vp_reserv += count; return; } else atomic_subtract_long(&numvnodes, count); mtx_lock(&vnode_free_list_mtx); while (count > 0) { if (getnewvnode_wait(0) == 0) { count--; td->td_vp_reserv++; atomic_add_long(&numvnodes, 1); } } mtx_unlock(&vnode_free_list_mtx); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; atomic_subtract_long(&numvnodes, td->td_vp_reserv); td->td_vp_reserv = 0; } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct bufobj *bo; struct thread *td; int error; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); vp = NULL; td = curthread; if (td->td_vp_reserv > 0) { td->td_vp_reserv -= 1; goto alloc; } mtx_lock(&vnode_free_list_mtx); /* * Lend our context to reclaim vnodes if they've exceeded the max. */ if (freevnodes > wantfreevnodes) vnlru_free(1); error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ if (error != 0) { mtx_unlock(&vnode_free_list_mtx); return (error); } #endif atomic_add_long(&numvnodes, 1); mtx_unlock(&vnode_free_list_mtx); alloc: atomic_add_long(&vnodes_created, 1); vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems * opt-in. */ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bo = &vp->v_bufobj; bo->__bo_vnode = vp; rw_init(BO_LOCKPTR(bo), "bufobj interlock"); bo->bo_ops = &buf_ops_bio; bo->bo_private = vp; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Finalize various vnode identity bits. */ vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_incr_usecount(vp); vp->v_data = NULL; #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); else if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode()\n"); #endif if (mp != NULL) { bo->bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } rangelock_init(&vp->v_rl); /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; int active; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); VI_LOCK(vp); KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, ("Active vnode list size %d > Vnode list size %d", mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); active = vp->v_iflag & VI_ACTIVE; vp->v_iflag &= ~VI_ACTIVE; if (active) { mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize--; mtx_unlock(&vnode_free_list_mtx); } vp->v_mount = NULL; VI_UNLOCK(vp); VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. */ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Activating already active vnode")); vp->v_iflag |= VI_ACTIVE; mtx_lock(&vnode_free_list_mtx); TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize++; mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); if (bo->bo_object != NULL) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_pip_wait(bo->bo_object, "bovlbx"); VM_OBJECT_WUNLOCK(bo->bo_object); } BO_LOCK(bo); } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { continue; } lblkno = 0; xflags = 0; if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if (bp->b_bufobj != bo) { /* XXX: necessary ? */ BUF_UNLOCK(bp); BO_LOCK(bo); return (EAGAIN); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp != NULL && (nbp->b_bufobj != bo || nbp->b_lblkno != lblkno || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags)) break; /* nbp invalid */ } return (retval); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize) { struct buf *bp, *nbp; int anyfreed; int trunclbn; struct bufobj *bo; CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__, vp, cred, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ trunclbn = (length + blksize - 1) / blksize; ASSERT_VOP_LOCKED(vp, "vtruncbuf"); restart: bo = &vp->v_bufobj; BO_LOCK(bo); anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { BO_UNLOCK(bo); goto restart; } } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { BO_UNLOCK(bo); goto restart; } } } if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { goto restart; } VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. */ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Lookup a buffer using the splay tree. Note that we specifically avoid * shadow buffers used in background bitmap writes. * * This code isn't quite efficient as it could be because we are maintaining * two sorted lists and do not know which list the block resides in. * * During a "make buildworld" the desired buffer is found at one of * the roots more than 60% of the time. Thus, checking both roots * before performing either splay eliminates unnecessary splays on the * first tree splayed. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = (*bo)->__bo_vnode; /* XXX */ if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining..."); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) wdog_kern_pat(WD_LASTVAL); } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } /* * Increment the use and hold counts on the vnode, taking care to reference * the driver's usecount if this is a chardev. The vholdl() will remove * the vnode from the free list if it is presently free. Requires the * vnode interlock and returns with it held. */ static void v_incr_usecount(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vholdl(vp); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Turn a holdcnt into a use+holdcnt such that only one call to * v_decr_usecount is needed. */ static void v_upgrade_usecount(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Decrement the vnode use and hold count along with the driver's usecount * if this is a chardev. The vdropl() below releases the vnode interlock * as it may free the vnode. */ static void v_decr_usecount(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_usecount: negative usecount")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } vdropl(vp); } /* * Decrement only the use count and driver use count. This is intended to * be paired with a follow on vdropl() to release the remaining hold count. * In this way we may vgone() a vnode with a 0 usecount without risk of * having it end up on a free list because the hold count is kept above 0. */ static void v_decr_useonly(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_useonly: negative usecount")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VI_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. */ int vget(struct vnode *vp, int flags, struct thread *td) { int error; error = 0; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vget: invalid lock operation")); CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); vholdl(vp); if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) { vdrop(vp); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); VI_LOCK(vp); /* Upgrade our holdcnt to a usecount. */ v_upgrade_usecount(vp); /* * We don't guarantee that any particular close will * trigger inactive processing so just make a best effort * here at preventing a reference to a removed file. If * we don't succeed no harm is done. */ if (vp->v_iflag & VI_OWEINACT) { if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && (flags & LK_NOWAIT) == 0) vinactive(vp, td); vp->v_iflag &= ~VI_OWEINACT; } VI_UNLOCK(vp); return (0); } /* * Increase the reference count of a vnode. */ void vref(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VI_LOCK(vp); v_incr_usecount(vp); VI_UNLOCK(vp); } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism other * than the VI lock is used to stop other processes from gaining references * to the vnode. This may be the case if the caller holds the only reference. * This is also useful when stale data is acceptable as race conditions may * be accounted for by some other means. */ int vrefcnt(struct vnode *vp) { int usecnt; VI_LOCK(vp); usecnt = vp->v_usecount; VI_UNLOCK(vp); return (usecnt); } #define VPUTX_VRELE 1 #define VPUTX_VPUT 2 #define VPUTX_VUNREF 3 static void vputx(struct vnode *vp, int func) { int error; KASSERT(vp != NULL, ("vputx: null vp")); if (func == VPUTX_VUNREF) ASSERT_VOP_LOCKED(vp, "vunref"); else if (func == VPUTX_VPUT) ASSERT_VOP_LOCKED(vp, "vput"); else KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vputx: missed vn_close")); error = 0; if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { if (func == VPUTX_VPUT) VOP_UNLOCK(vp, 0); v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { vprint("vputx: negative ref count", vp); panic("vputx: negative ref cnt"); } CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. */ vp->v_iflag |= VI_OWEINACT; switch (func) { case VPUTX_VRELE: error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; case VPUTX_VPUT: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VPUTX_VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (error == 0) { if (vp->v_iflag & VI_OWEINACT) vinactive(vp, curthread); if (func != VPUTX_VUNREF) VOP_UNLOCK(vp, 0); } vdropl(vp); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(struct vnode *vp) { vputx(vp, VPUTX_VRELE); } /* * Release an already locked vnode. This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() acquires the lock internally.) */ void vput(struct vnode *vp) { vputx(vp, VPUTX_VPUT); } /* * Release an exclusively locked vnode. Do not unlock the vnode lock. */ void vunref(struct vnode *vp) { vputx(vp, VPUTX_VUNREF); } /* * Somebody doesn't want the vnode recycled. */ void vhold(struct vnode *vp) { VI_LOCK(vp); vholdl(vp); VI_UNLOCK(vp); } /* * Increase the hold count and activate if this is the first reference. */ void vholdl(struct vnode *vp) { struct mount *mp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS /* getnewvnode() calls v_incr_usecount() without holding interlock. */ if (vp->v_type != VNON || vp->v_data != NULL) { ASSERT_VI_LOCKED(vp, "vholdl"); VNASSERT(vp->v_holdcnt > 0 || (vp->v_iflag & VI_FREE) != 0, vp, ("vholdl: free vnode is held")); } #endif vp->v_holdcnt++; if ((vp->v_iflag & VI_FREE) == 0) return; VNASSERT(vp->v_holdcnt == 1, vp, ("vholdl: wrong hold count")); VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed.")); /* * Remove a vnode from the free list, mark it as in use, * and put it on the active list. */ mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); freevnodes--; vp->v_iflag &= ~(VI_FREE|VI_AGE); KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Activating already active vnode")); vp->v_iflag |= VI_ACTIVE; mp = vp->v_mount; TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize++; mtx_unlock(&vnode_free_list_mtx); } /* * Note that there is one less who cares about this vnode. * vdrop() is the opposite of vhold(). */ void vdrop(struct vnode *vp) { VI_LOCK(vp); vdropl(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VI_DOOMED) in which case we will free it. */ void vdropl(struct vnode *vp) { struct bufobj *bo; struct mount *mp; int active; ASSERT_VI_LOCKED(vp, "vdropl"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); vp->v_holdcnt--; if (vp->v_holdcnt > 0) { VI_UNLOCK(vp); return; } if ((vp->v_iflag & VI_DOOMED) == 0) { /* * Mark a vnode as free: remove it from its active list * and put it up for recycling on the freelist. */ VNASSERT(vp->v_op != NULL, vp, ("vdropl: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(vp->v_holdcnt == 0, vp, ("vdropl: freeing when we shouldn't")); active = vp->v_iflag & VI_ACTIVE; vp->v_iflag &= ~VI_ACTIVE; mp = vp->v_mount; mtx_lock(&vnode_free_list_mtx); if (active) { TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize--; } if (vp->v_iflag & VI_AGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); } freevnodes++; vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); return; } /* * The vnode has been marked for destruction, so free it. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); atomic_subtract_long(&numvnodes, 1); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) destroy_vpollinfo(vp->v_pollinfo); #ifdef INVARIANTS /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); rw_destroy(BO_LOCKPTR(bo)); uma_zfree(vnode_zone, vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. */ void vinactive(struct vnode *vp, struct thread *td) { struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are on the vnode's dirty list since these will * no longer be checked once the vnode is on the inactive list. * Because the vnode vm object keeps a hold reference on the vnode * if there is at least one resident non-cached page, the vnode * cannot leave the active list without the page cleanup done. */ obj = vp->v_object; if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC); VM_OBJECT_WUNLOCK(obj); } VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); if (error != 0) { VOP_UNLOCK(vp, 0); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount == 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { VNASSERT(vp->v_usecount == 0 || vp->v_op != &devfs_specops || (vp->v_type != VCHR && vp->v_type != VBLK), vp, ("device VNODE %p is FORCECLOSED", vp)); vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif } VOP_UNLOCK(vp, 0); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp, 0); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, "vrecycle"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; VI_LOCK(vp); if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } VI_UNLOCK(vp); return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } static void notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } /* * Notify upper mounts about reclaimed or unlinked vnode. */ void vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops = { .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); if (TAILQ_EMPTY(&mp->mnt_uppers)) goto unlock; MNT_IUNLOCK(mp); mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); mmp->mnt_op = &vgonel_vfsops; mmp->mnt_kern_flag |= MNTK_MARKER; MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_VGONE_UPPER; for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { ump = TAILQ_NEXT(ump, mnt_upper_link); continue; } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump, vp); break; default: KASSERT(0, ("invalid event %d", event)); break; } MNT_ILOCK(mp); ump = TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); } free(mmp, M_TEMP); mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; wakeup(&mp->mnt_uppers); } unlock: MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; struct mount *mp; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } #ifdef INVARIANTS BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); #endif /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Clear the advisory locks and wake up waiting threads. */ (void)VOP_ADVLOCKPURGE(vp); /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(struct vnode *vp) { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(struct cdev *dev) { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_TEXT) strlcat(buf, "|VV_TEXT", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_AGE) strlcat(buf, "|VI_AGE", sizeof(buf)); if (vp->v_iflag & VI_DOOMED) strlcat(buf, "|VI_DOOMED", sizeof(buf)); if (vp->v_iflag & VI_FREE) strlcat(buf, "|VI_FREE", sizeof(buf)); if (vp->v_iflag & VI_ACTIVE) strlcat(buf, "|VI_ACTIVE", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE | VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_dirty.bv_cnt, vp->v_bufobj.bo_clean.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vprint("", vp); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. */ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_NOINSMNTQ); MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_VGONE_UPPER); MNT_KERN_FLAG(MNTK_VGONE_WAITER); MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); MNT_KERN_FLAG(MNTK_MARKER); + MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); MNT_KERN_FLAG(MNTK_NOKNOTE); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d\n", mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_activevnodelistsize = %d\n", mp->mnt_activevnodelistsize); db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { if (vp->v_type != VMARKER) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; xvfsp.vfc_vfsops = 0; xvfsp.vfc_next = 0; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp; struct thread *td; int error; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); td = curthread; /* * Since this only runs when rebooting, it is not interlocked. */ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, td); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); /* * XXX: Due to the way in which we mount the root * file system off of devfs, devfs will generate a * "busy" warning when we try to unmount it before * the root. Don't print a warning as a result in * order to avoid false positive errors that may * cause needless upset. */ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } else { /* The unmount has removed mp from the mountlist */ } } } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { obj = vp->v_object; if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_WUNLOCK(obj); } vput(vp); } } else VI_UNLOCK(vp); } } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initalize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = uma_zalloc(vnodepoll_zone, M_WAITOK); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp, 0); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } save = curthread_pflags_set(TDP_SYNCIO); vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; error = 0; dev_lock(); if (vp->v_type != VCHR) error = ENOTBLK; else if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to supress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (!IGNORE_LOCK(vp)) { locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER) vfs_badlock("is not exclusive locked by another thread", str, vp); } void assert_vop_slocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED) vfs_badlock("is not locked shared but should be", str, vp); } #endif /* 0 */ #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } void vop_strategy_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. */ if ((bp->b_flags & B_CLUSTER) != 0) return; if (panicstr == NULL && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } #endif } void vop_lock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_lock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_unlock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_unlock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; if (!rc) { VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0); } static void vfs_knl_assert_locked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); #endif } static void vfs_knl_assert_unlocked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_data != 0); VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VOP_MARKATIME. This functionality is used by execve and * mmap, so we want to avoid the I/O implied by directly setting * va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct ucred *cred) { struct mount *mp; mp = vp->v_mount; ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) (void)VOP_MARKATIME(vp); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH. */ MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); vp = TAILQ_NEXT(*mvp, v_nmntvnodes); while (vp != NULL && (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); VI_LOCK(vp); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); MNT_REF(mp); (*mvp)->v_type = VMARKER; vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; return (NULL); } (*mvp)->v_mount = mp; TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); VI_LOCK(vp); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h */ static void mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; } static struct vnode * mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) { struct vnode *vp, *nvp; mtx_assert(&vnode_free_list_mtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_actfreelist); TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_actfreelist); continue; } if (!VI_TRYLOCK(vp)) { if (mp_ncpus == 1 || should_yield()) { TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); mtx_unlock(&vnode_free_list_mtx); pause("vnacti", 1); mtx_lock(&vnode_free_list_mtx); goto restart; } continue; } KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the active list %p %p", vp, mp)); if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) break; nvp = TAILQ_NEXT(vp, v_actfreelist); VI_UNLOCK(vp); vp = nvp; } /* Check if we are done */ if (vp == NULL) { mtx_unlock(&vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); mtx_unlock(&vnode_free_list_mtx); ASSERT_VI_LOCKED(vp, "active iter"); KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); return (vp); } struct vnode * __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&vnode_free_list_mtx); return (mnt_vnode_next_active(mvp, mp)); } struct vnode * __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); (*mvp)->v_type = VMARKER; (*mvp)->v_mount = mp; mtx_lock(&vnode_free_list_mtx); vp = TAILQ_FIRST(&mp->mnt_activevnodelist); if (vp == NULL) { mtx_unlock(&vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); return (mnt_vnode_next_active(mvp, mp)); } void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); mtx_unlock(&vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); } Index: stable/10/sys/sys/mount.h =================================================================== --- stable/10/sys/sys/mount.h (revision 282269) +++ stable/10/sys/sys/mount.h (revision 282270) @@ -1,951 +1,952 @@ /*- * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mount.h 8.21 (Berkeley) 5/20/95 * $FreeBSD$ */ #ifndef _SYS_MOUNT_H_ #define _SYS_MOUNT_H_ #include #include #ifdef _KERNEL #include #include #include #include #endif /* * NOTE: When changing statfs structure, mount structure, MNT_* flags or * MNTK_* flags also update DDB show mount command in vfs_subr.c. */ typedef struct fsid { int32_t val[2]; } fsid_t; /* filesystem id type */ /* * File identifier. * These are unique per filesystem on a single machine. */ #define MAXFIDSZ 16 struct fid { u_short fid_len; /* length of data in bytes */ u_short fid_data0; /* force longword alignment */ char fid_data[MAXFIDSZ]; /* data (variable length) */ }; /* * filesystem statistics */ #define MFSNAMELEN 16 /* length of type name including null */ #define MNAMELEN 88 /* size of on/from name bufs */ #define STATFS_VERSION 0x20030518 /* current version number */ struct statfs { uint32_t f_version; /* structure version number */ uint32_t f_type; /* type of filesystem */ uint64_t f_flags; /* copy of mount exported flags */ uint64_t f_bsize; /* filesystem fragment size */ uint64_t f_iosize; /* optimal transfer block size */ uint64_t f_blocks; /* total data blocks in filesystem */ uint64_t f_bfree; /* free blocks in filesystem */ int64_t f_bavail; /* free blocks avail to non-superuser */ uint64_t f_files; /* total file nodes in filesystem */ int64_t f_ffree; /* free nodes avail to non-superuser */ uint64_t f_syncwrites; /* count of sync writes since mount */ uint64_t f_asyncwrites; /* count of async writes since mount */ uint64_t f_syncreads; /* count of sync reads since mount */ uint64_t f_asyncreads; /* count of async reads since mount */ uint64_t f_spare[10]; /* unused spare */ uint32_t f_namemax; /* maximum filename length */ uid_t f_owner; /* user that mounted the filesystem */ fsid_t f_fsid; /* filesystem id */ char f_charspare[80]; /* spare string space */ char f_fstypename[MFSNAMELEN]; /* filesystem type name */ char f_mntfromname[MNAMELEN]; /* mounted filesystem */ char f_mntonname[MNAMELEN]; /* directory on which mounted */ }; #ifdef _KERNEL #define OMFSNAMELEN 16 /* length of fs type name, including null */ #define OMNAMELEN (88 - 2 * sizeof(long)) /* size of on/from name bufs */ /* XXX getfsstat.2 is out of date with write and read counter changes here. */ /* XXX statfs.2 is out of date with read counter changes here. */ struct ostatfs { long f_spare2; /* placeholder */ long f_bsize; /* fundamental filesystem block size */ long f_iosize; /* optimal transfer block size */ long f_blocks; /* total data blocks in filesystem */ long f_bfree; /* free blocks in fs */ long f_bavail; /* free blocks avail to non-superuser */ long f_files; /* total file nodes in filesystem */ long f_ffree; /* free file nodes in fs */ fsid_t f_fsid; /* filesystem id */ uid_t f_owner; /* user that mounted the filesystem */ int f_type; /* type of filesystem */ int f_flags; /* copy of mount exported flags */ long f_syncwrites; /* count of sync writes since mount */ long f_asyncwrites; /* count of async writes since mount */ char f_fstypename[OMFSNAMELEN]; /* fs type name */ char f_mntonname[OMNAMELEN]; /* directory on which mounted */ long f_syncreads; /* count of sync reads since mount */ long f_asyncreads; /* count of async reads since mount */ short f_spares1; /* unused spare */ char f_mntfromname[OMNAMELEN];/* mounted filesystem */ short f_spares2; /* unused spare */ /* * XXX on machines where longs are aligned to 8-byte boundaries, there * is an unnamed int32_t here. This spare was after the apparent end * of the struct until we bit off the read counters from f_mntonname. */ long f_spare[2]; /* unused spare */ }; TAILQ_HEAD(vnodelst, vnode); /* Mount options list */ TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; int pos; int seen; }; /* * Structure per mounted filesystem. Each mounted filesystem has an * array of operations and an instance record. The filesystems are * put on a doubly linked list. * * Lock reference: * m - mountlist_mtx * i - interlock * v - vnode freelist mutex * * Unmarked fields are considered stable as long as a ref is held. * */ struct mount { struct mtx mnt_mtx; /* mount structure interlock */ int mnt_gen; /* struct mount generation */ #define mnt_startzero mnt_list TAILQ_ENTRY(mount) mnt_list; /* (m) mount list */ struct vfsops *mnt_op; /* operations on fs */ struct vfsconf *mnt_vfc; /* configuration info */ struct vnode *mnt_vnodecovered; /* vnode we mounted on */ struct vnode *mnt_syncer; /* syncer vnode */ int mnt_ref; /* (i) Reference count */ struct vnodelst mnt_nvnodelist; /* (i) list of vnodes */ int mnt_nvnodelistsize; /* (i) # of vnodes */ struct vnodelst mnt_activevnodelist; /* (v) list of active vnodes */ int mnt_activevnodelistsize;/* (v) # of active vnodes */ int mnt_writeopcount; /* (i) write syscalls pending */ int mnt_kern_flag; /* (i) kernel only flags */ uint64_t mnt_flag; /* (i) flags shared with user */ struct vfsoptlist *mnt_opt; /* current mount options */ struct vfsoptlist *mnt_optnew; /* new options passed to fs */ int mnt_maxsymlinklen; /* max size of short symlink */ struct statfs mnt_stat; /* cache of filesystem stats */ struct ucred *mnt_cred; /* credentials of mounter */ void * mnt_data; /* private data */ time_t mnt_time; /* last time written*/ int mnt_iosize_max; /* max size for clusters, etc */ struct netexport *mnt_export; /* export list */ struct label *mnt_label; /* MAC label for the fs */ u_int mnt_hashseed; /* Random seed for vfs_hash */ int mnt_lockref; /* (i) Lock reference count */ int mnt_secondary_writes; /* (i) # of secondary writes */ int mnt_secondary_accwrites;/* (i) secondary wr. starts */ struct thread *mnt_susp_owner; /* (i) thread owning suspension */ #define mnt_endzero mnt_gjprovider char *mnt_gjprovider; /* gjournal provider name */ struct lock mnt_explock; /* vfs_export walkers lock */ TAILQ_ENTRY(mount) mnt_upper_link; /* (m) we in the all uppers */ TAILQ_HEAD(, mount) mnt_uppers; /* (m) upper mounts over us*/ }; /* * Definitions for MNT_VNODE_FOREACH_ALL. */ struct vnode *__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp); struct vnode *__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp); void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp); #define MNT_VNODE_FOREACH_ALL(vp, mp, mvp) \ for (vp = __mnt_vnode_first_all(&(mvp), (mp)); \ (vp) != NULL; vp = __mnt_vnode_next_all(&(mvp), (mp))) #define MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp) \ do { \ MNT_ILOCK(mp); \ __mnt_vnode_markerfree_all(&(mvp), (mp)); \ /* MNT_IUNLOCK(mp); -- done in above function */ \ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); \ } while (0) /* * Definitions for MNT_VNODE_FOREACH_ACTIVE. */ struct vnode *__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp); struct vnode *__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp); void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); #define MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) \ for (vp = __mnt_vnode_first_active(&(mvp), (mp)); \ (vp) != NULL; vp = __mnt_vnode_next_active(&(mvp), (mp))) #define MNT_VNODE_FOREACH_ACTIVE_ABORT(mp, mvp) \ __mnt_vnode_markerfree_active(&(mvp), (mp)) #define MNT_ILOCK(mp) mtx_lock(&(mp)->mnt_mtx) #define MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx) #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) #define MNT_MTX(mp) (&(mp)->mnt_mtx) #define MNT_REF(mp) (mp)->mnt_ref++ #define MNT_REL(mp) do { \ KASSERT((mp)->mnt_ref > 0, ("negative mnt_ref")); \ (mp)->mnt_ref--; \ if ((mp)->mnt_ref == 0) \ wakeup((mp)); \ } while (0) #endif /* _KERNEL */ /* * User specifiable flags, stored in mnt_flag. */ #define MNT_RDONLY 0x0000000000000001ULL /* read only filesystem */ #define MNT_SYNCHRONOUS 0x0000000000000002ULL /* fs written synchronously */ #define MNT_NOEXEC 0x0000000000000004ULL /* can't exec from filesystem */ #define MNT_NOSUID 0x0000000000000008ULL /* don't honor setuid fs bits */ #define MNT_NFS4ACLS 0x0000000000000010ULL /* enable NFS version 4 ACLs */ #define MNT_UNION 0x0000000000000020ULL /* union with underlying fs */ #define MNT_ASYNC 0x0000000000000040ULL /* fs written asynchronously */ #define MNT_SUIDDIR 0x0000000000100000ULL /* special SUID dir handling */ #define MNT_SOFTDEP 0x0000000000200000ULL /* using soft updates */ #define MNT_NOSYMFOLLOW 0x0000000000400000ULL /* do not follow symlinks */ #define MNT_GJOURNAL 0x0000000002000000ULL /* GEOM journal support enabled */ #define MNT_MULTILABEL 0x0000000004000000ULL /* MAC support for objects */ #define MNT_ACLS 0x0000000008000000ULL /* ACL support enabled */ #define MNT_NOATIME 0x0000000010000000ULL /* dont update file access time */ #define MNT_NOCLUSTERR 0x0000000040000000ULL /* disable cluster read */ #define MNT_NOCLUSTERW 0x0000000080000000ULL /* disable cluster write */ #define MNT_SUJ 0x0000000100000000ULL /* using journaled soft updates */ #define MNT_AUTOMOUNTED 0x0000000200000000ULL /* mounted by automountd(8) */ /* * NFS export related mount flags. */ #define MNT_EXRDONLY 0x0000000000000080ULL /* exported read only */ #define MNT_EXPORTED 0x0000000000000100ULL /* filesystem is exported */ #define MNT_DEFEXPORTED 0x0000000000000200ULL /* exported to the world */ #define MNT_EXPORTANON 0x0000000000000400ULL /* anon uid mapping for all */ #define MNT_EXKERB 0x0000000000000800ULL /* exported with Kerberos */ #define MNT_EXPUBLIC 0x0000000020000000ULL /* public export (WebNFS) */ /* * Flags set by internal operations, * but visible to the user. * XXX some of these are not quite right.. (I've never seen the root flag set) */ #define MNT_LOCAL 0x0000000000001000ULL /* filesystem is stored locally */ #define MNT_QUOTA 0x0000000000002000ULL /* quotas are enabled on fs */ #define MNT_ROOTFS 0x0000000000004000ULL /* identifies the root fs */ #define MNT_USER 0x0000000000008000ULL /* mounted by a user */ #define MNT_IGNORE 0x0000000000800000ULL /* do not show entry in df */ /* * Mask of flags that are visible to statfs(). * XXX I think that this could now become (~(MNT_CMDFLAGS)) * but the 'mount' program may need changing to handle this. */ #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ MNT_NOSUID | MNT_UNION | MNT_SUJ | \ MNT_ASYNC | MNT_EXRDONLY | MNT_EXPORTED | \ MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ MNT_LOCAL | MNT_USER | MNT_QUOTA | \ MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | \ MNT_NFS4ACLS | MNT_AUTOMOUNTED) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | \ MNT_NOATIME | \ MNT_NOSYMFOLLOW | MNT_IGNORE | \ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR | \ MNT_ACLS | MNT_USER | MNT_NFS4ACLS | \ MNT_AUTOMOUNTED) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. * XXX: These are not STATES and really should be somewhere else. * XXX: MNT_BYFSID collides with MNT_ACLS, but because MNT_ACLS is only used for * mount(2) and MNT_BYFSID is only used for unmount(2) it's harmless. */ #define MNT_UPDATE 0x0000000000010000ULL /* not real mount, just update */ #define MNT_DELEXPORT 0x0000000000020000ULL /* delete export host lists */ #define MNT_RELOAD 0x0000000000040000ULL /* reload filesystem data */ #define MNT_FORCE 0x0000000000080000ULL /* force unmount or readonly */ #define MNT_SNAPSHOT 0x0000000001000000ULL /* snapshot the filesystem */ #define MNT_BYFSID 0x0000000008000000ULL /* specify filesystem by ID. */ #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_BYFSID) /* * Internal filesystem control flags stored in mnt_kern_flag. * * MNTK_UNMOUNT locks the mount entry so that name lookup cannot proceed * past the mount point. This keeps the subtree stable during mounts * and unmounts. * * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while * dounmount() is still waiting to lock the mountpoint. This allows * the filesystem to cancel operations that might otherwise deadlock * with the unmount attempt (used by NFS). * * MNTK_NOINSMNTQ is strict subset of MNTK_UNMOUNT. They are separated * to allow for failed unmount attempt to restore the syncer vnode for * the mount. */ #define MNTK_UNMOUNTF 0x00000001 /* forced unmount in progress */ #define MNTK_ASYNC 0x00000002 /* filtered async flag */ #define MNTK_SOFTDEP 0x00000004 /* async disabled by softdep */ #define MNTK_NOINSMNTQ 0x00000008 /* insmntque is not allowed */ #define MNTK_DRAINING 0x00000010 /* lock draining is happening */ #define MNTK_REFEXPIRE 0x00000020 /* refcount expiring is happening */ #define MNTK_EXTENDED_SHARED 0x00000040 /* Allow shared locking for more ops */ #define MNTK_SHARED_WRITES 0x00000080 /* Allow shared locking for writes */ #define MNTK_NO_IOPF 0x00000100 /* Disallow page faults during reads and writes. Filesystem shall properly handle i/o state on EFAULT. */ #define MNTK_VGONE_UPPER 0x00000200 #define MNTK_VGONE_WAITER 0x00000400 #define MNTK_LOOKUP_EXCL_DOTDOT 0x00000800 #define MNTK_MARKER 0x00001000 #define MNTK_UNMAPPED_BUFS 0x00002000 +#define MNTK_USES_BCACHE 0x00004000 /* FS uses the buffer cache. */ #define MNTK_NOASYNC 0x00800000 /* disable async */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ #define MNTK_SUSPEND2 0x04000000 /* block secondary writes */ #define MNTK_SUSPENDED 0x10000000 /* write operations are suspended */ #define MNTK_UNUSED25 0x20000000 /* --available-- */ #define MNTK_LOOKUP_SHARED 0x40000000 /* FS supports shared lock lookups */ #define MNTK_NOKNOTE 0x80000000 /* Don't send KNOTEs from VOP hooks */ #ifdef _KERNEL static inline int MNT_SHARED_WRITES(struct mount *mp) { return (mp != NULL && (mp->mnt_kern_flag & MNTK_SHARED_WRITES) != 0); } static inline int MNT_EXTENDED_SHARED(struct mount *mp) { return (mp != NULL && (mp->mnt_kern_flag & MNTK_EXTENDED_SHARED) != 0); } #endif /* * Sysctl CTL_VFS definitions. * * Second level identifier specifies which filesystem. Second level * identifier VFS_VFSCONF returns information about all filesystems. * Second level identifier VFS_GENERIC is non-terminal. */ #define VFS_VFSCONF 0 /* get configured filesystems */ #define VFS_GENERIC 0 /* generic filesystem information */ /* * Third level identifiers for VFS_GENERIC are given below; third * level identifiers for specific filesystems are given in their * mount specific header files. */ #define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */ #define VFS_CONF 2 /* struct: vfsconf for filesystem given as next argument */ /* * Flags for various system call interfaces. * * waitfor flags to vfs_sync() and getfsstat() */ #define MNT_WAIT 1 /* synchronously wait for I/O to complete */ #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ #define MNT_LAZY 3 /* push data not written by filesystem syncer */ #define MNT_SUSPEND 4 /* Suspend file system after sync */ /* * Generic file handle */ struct fhandle { fsid_t fh_fsid; /* Filesystem id of mount point */ struct fid fh_fid; /* Filesys specific id */ }; typedef struct fhandle fhandle_t; /* * Old export arguments without security flavor list */ struct oexport_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ }; /* * Export arguments for local filesystem mount calls. */ #define MAXSECFLAVORS 5 struct export_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ int ex_numsecflavors; /* security flavor count */ int ex_secflavors[MAXSECFLAVORS]; /* list of security flavors */ }; /* * Structure holding information for a publicly exported filesystem * (WebNFS). Currently the specs allow just for one such filesystem. */ struct nfs_public { int np_valid; /* Do we hold valid information */ fhandle_t np_handle; /* Filehandle for pub fs (internal) */ struct mount *np_mount; /* Mountpoint of exported fs */ char *np_index; /* Index file */ }; /* * Filesystem configuration information. One of these exists for each * type of filesystem supported by the kernel. These are searched at * mount time to identify the requested filesystem. * * XXX: Never change the first two arguments! */ struct vfsconf { u_int vfc_version; /* ABI version number */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ struct vfsops *vfc_vfsops; /* filesystem operations vector */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsoptdecl *vfc_opts; /* mount options */ TAILQ_ENTRY(vfsconf) vfc_list; /* list of vfscons */ }; /* Userland version of the struct vfsconf. */ struct xvfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsconf *vfc_next; /* next in list */ }; #ifndef BURN_BRIDGES struct ovfsconf { void *vfc_vfsops; char vfc_name[32]; int vfc_index; int vfc_refcount; int vfc_flags; }; #endif /* * NB: these flags refer to IMPLEMENTATION properties, not properties of * any actual mounts; i.e., it does not make sense to change the flags. */ #define VFCF_STATIC 0x00010000 /* statically compiled into kernel */ #define VFCF_NETWORK 0x00020000 /* may get data over the network */ #define VFCF_READONLY 0x00040000 /* writes are not implemented */ #define VFCF_SYNTHETIC 0x00080000 /* data does not represent real files */ #define VFCF_LOOPBACK 0x00100000 /* aliases some other mounted FS */ #define VFCF_UNICODE 0x00200000 /* stores file names as Unicode */ #define VFCF_JAIL 0x00400000 /* can be mounted from within a jail */ #define VFCF_DELEGADMIN 0x00800000 /* supports delegated administration */ #define VFCF_SBDRY 0x01000000 /* defer stop requests */ typedef uint32_t fsctlop_t; struct vfsidctl { int vc_vers; /* should be VFSIDCTL_VERS1 (below) */ fsid_t vc_fsid; /* fsid to operate on */ char vc_fstypename[MFSNAMELEN]; /* type of fs 'nfs' or '*' */ fsctlop_t vc_op; /* operation VFS_CTL_* (below) */ void *vc_ptr; /* pointer to data structure */ size_t vc_len; /* sizeof said structure */ u_int32_t vc_spare[12]; /* spare (must be zero) */ }; /* vfsidctl API version. */ #define VFS_CTL_VERS1 0x01 /* * New style VFS sysctls, do not reuse/conflict with the namespace for * private sysctls. * All "global" sysctl ops have the 33rd bit set: * 0x...1.... * Private sysctl ops should have the 33rd bit unset. */ #define VFS_CTL_QUERY 0x00010001 /* anything wrong? (vfsquery) */ #define VFS_CTL_TIMEO 0x00010002 /* set timeout for vfs notification */ #define VFS_CTL_NOLOCKS 0x00010003 /* disable file locking */ struct vfsquery { u_int32_t vq_flags; u_int32_t vq_spare[31]; }; /* vfsquery flags */ #define VQ_NOTRESP 0x0001 /* server down */ #define VQ_NEEDAUTH 0x0002 /* server bad auth */ #define VQ_LOWDISK 0x0004 /* we're low on space */ #define VQ_MOUNT 0x0008 /* new filesystem arrived */ #define VQ_UNMOUNT 0x0010 /* filesystem has left */ #define VQ_DEAD 0x0020 /* filesystem is dead, needs force unmount */ #define VQ_ASSIST 0x0040 /* filesystem needs assistance from external program */ #define VQ_NOTRESPLOCK 0x0080 /* server lockd down */ #define VQ_FLAG0100 0x0100 /* placeholder */ #define VQ_FLAG0200 0x0200 /* placeholder */ #define VQ_FLAG0400 0x0400 /* placeholder */ #define VQ_FLAG0800 0x0800 /* placeholder */ #define VQ_FLAG1000 0x1000 /* placeholder */ #define VQ_FLAG2000 0x2000 /* placeholder */ #define VQ_FLAG4000 0x4000 /* placeholder */ #define VQ_FLAG8000 0x8000 /* placeholder */ #ifdef _KERNEL /* Point a sysctl request at a vfsidctl's data. */ #define VCTLTOREQ(vc, req) \ do { \ (req)->newptr = (vc)->vc_ptr; \ (req)->newlen = (vc)->vc_len; \ (req)->newidx = 0; \ } while (0) #endif struct iovec; struct uio; #ifdef _KERNEL /* * vfs_busy specific flags and mask. */ #define MBF_NOWAIT 0x01 #define MBF_MNTLSTLOCK 0x02 #define MBF_MASK (MBF_NOWAIT | MBF_MNTLSTLOCK) #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_MOUNT); #endif extern int maxvfsconf; /* highest defined filesystem type */ extern int nfs_mount_type; /* vfc_typenum for nfs, or -1 */ TAILQ_HEAD(vfsconfhead, vfsconf); extern struct vfsconfhead vfsconf; /* * Operations supported on mounted filesystem. */ struct mount_args; struct nameidata; struct sysctl_req; struct mntarg; typedef int vfs_cmount_t(struct mntarg *ma, void *data, uint64_t flags); typedef int vfs_unmount_t(struct mount *mp, int mntflags); typedef int vfs_root_t(struct mount *mp, int flags, struct vnode **vpp); typedef int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid, void *arg); typedef int vfs_statfs_t(struct mount *mp, struct statfs *sbp); typedef int vfs_sync_t(struct mount *mp, int waitfor); typedef int vfs_vget_t(struct mount *mp, ino_t ino, int flags, struct vnode **vpp); typedef int vfs_fhtovp_t(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp); typedef int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); typedef int vfs_init_t(struct vfsconf *); typedef int vfs_uninit_t(struct vfsconf *); typedef int vfs_extattrctl_t(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname); typedef int vfs_mount_t(struct mount *mp); typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op, struct sysctl_req *req); typedef void vfs_susp_clean_t(struct mount *mp); typedef void vfs_notify_lowervp_t(struct mount *mp, struct vnode *lowervp); typedef void vfs_purge_t(struct mount *mp); struct vfsops { vfs_mount_t *vfs_mount; vfs_cmount_t *vfs_cmount; vfs_unmount_t *vfs_unmount; vfs_root_t *vfs_root; vfs_quotactl_t *vfs_quotactl; vfs_statfs_t *vfs_statfs; vfs_sync_t *vfs_sync; vfs_vget_t *vfs_vget; vfs_fhtovp_t *vfs_fhtovp; vfs_checkexp_t *vfs_checkexp; vfs_init_t *vfs_init; vfs_uninit_t *vfs_uninit; vfs_extattrctl_t *vfs_extattrctl; vfs_sysctl_t *vfs_sysctl; vfs_susp_clean_t *vfs_susp_clean; vfs_notify_lowervp_t *vfs_reclaim_lowervp; vfs_notify_lowervp_t *vfs_unlink_lowervp; vfs_purge_t *vfs_purge; vfs_mount_t *vfs_spare[6]; /* spares for ABI compat */ }; vfs_statfs_t __vfs_statfs; #define VFS_PROLOGUE(MP) do { \ struct mount *mp__; \ int _enable_stops; \ \ mp__ = (MP); \ _enable_stops = (mp__ != NULL && \ (mp__->mnt_vfc->vfc_flags & VFCF_SBDRY) && sigdeferstop()) #define VFS_EPILOGUE(MP) \ if (_enable_stops) \ sigallowstop(); \ } while (0) #define VFS_MOUNT(MP) ({ \ int _rc; \ \ VFS_PROLOGUE(MP); \ _rc = (*(MP)->mnt_op->vfs_mount)(MP); \ VFS_EPILOGUE(MP); \ _rc; }) #define VFS_UNMOUNT(MP, FORCE) ({ \ int _rc; \ \ VFS_PROLOGUE(MP); \ _rc = (*(MP)->mnt_op->vfs_unmount)(MP, FORCE); \ VFS_EPILOGUE(MP); \ _rc; }) #define VFS_ROOT(MP, FLAGS, VPP) ({ \ int _rc; \ \ VFS_PROLOGUE(MP); \ _rc = (*(MP)->mnt_op->vfs_root)(MP, FLAGS, VPP); \ VFS_EPILOGUE(MP); \ _rc; }) #define VFS_QUOTACTL(MP, C, U, A) ({ \ int _rc; \ \ VFS_PROLOGUE(MP); \ _rc = (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A); \ VFS_EPILOGUE(MP); \ _rc; }) #define VFS_STATFS(MP, SBP) ({ \ int _rc; \ \ VFS_PROLOGUE(MP); \ _rc = __vfs_statfs((MP), (SBP)); \ VFS_EPILOGUE(MP); \ _rc; }) #define VFS_SYNC(MP, WAIT) ({ \ int _rc; \ \ VFS_PROLOGUE(MP); \ _rc = (*(MP)->mnt_op->vfs_sync)(MP, WAIT); \ VFS_EPILOGUE(MP); \ _rc; }) #define VFS_VGET(MP, INO, FLAGS, VPP) ({ \ int _rc; \ \ VFS_PROLOGUE(MP); \ _rc = (*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP); \ VFS_EPILOGUE(MP); \ _rc; }) #define VFS_FHTOVP(MP, FIDP, FLAGS, VPP) ({ \ int _rc; \ \ VFS_PROLOGUE(MP); \ _rc = (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, FLAGS, VPP); \ VFS_EPILOGUE(MP); \ _rc; }) #define VFS_CHECKEXP(MP, NAM, EXFLG, CRED, NUMSEC, SEC) ({ \ int _rc; \ \ VFS_PROLOGUE(MP); \ _rc = (*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED, NUMSEC,\ SEC); \ VFS_EPILOGUE(MP); \ _rc; }) #define VFS_EXTATTRCTL(MP, C, FN, NS, N) ({ \ int _rc; \ \ VFS_PROLOGUE(MP); \ _rc = (*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N); \ VFS_EPILOGUE(MP); \ _rc; }) #define VFS_SYSCTL(MP, OP, REQ) ({ \ int _rc; \ \ VFS_PROLOGUE(MP); \ _rc = (*(MP)->mnt_op->vfs_sysctl)(MP, OP, REQ); \ VFS_EPILOGUE(MP); \ _rc; }) #define VFS_SUSP_CLEAN(MP) do { \ if (*(MP)->mnt_op->vfs_susp_clean != NULL) { \ VFS_PROLOGUE(MP); \ (*(MP)->mnt_op->vfs_susp_clean)(MP); \ VFS_EPILOGUE(MP); \ } \ } while (0) #define VFS_RECLAIM_LOWERVP(MP, VP) do { \ if (*(MP)->mnt_op->vfs_reclaim_lowervp != NULL) { \ VFS_PROLOGUE(MP); \ (*(MP)->mnt_op->vfs_reclaim_lowervp)((MP), (VP)); \ VFS_EPILOGUE(MP); \ } \ } while (0) #define VFS_UNLINK_LOWERVP(MP, VP) do { \ if (*(MP)->mnt_op->vfs_unlink_lowervp != NULL) { \ VFS_PROLOGUE(MP); \ (*(MP)->mnt_op->vfs_unlink_lowervp)((MP), (VP)); \ VFS_EPILOGUE(MP); \ } \ } while (0) #define VFS_PURGE(MP) do { \ if (*(MP)->mnt_op->vfs_purge != NULL) { \ VFS_PROLOGUE(MP); \ (*(MP)->mnt_op->vfs_purge)(MP); \ VFS_EPILOGUE(MP); \ } \ } while (0) #define VFS_KNOTE_LOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), KNF_LISTLOCKED); \ } while (0) #define VFS_KNOTE_UNLOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 0); \ } while (0) #define VFS_NOTIFY_UPPER_RECLAIM 1 #define VFS_NOTIFY_UPPER_UNLINK 2 #include /* * Version numbers. */ #define VFS_VERSION_00 0x19660120 #define VFS_VERSION_01 0x20121030 #define VFS_VERSION VFS_VERSION_01 #define VFS_SET(vfsops, fsname, flags) \ static struct vfsconf fsname ## _vfsconf = { \ .vfc_version = VFS_VERSION, \ .vfc_name = #fsname, \ .vfc_vfsops = &vfsops, \ .vfc_typenum = -1, \ .vfc_flags = flags, \ }; \ static moduledata_t fsname ## _mod = { \ #fsname, \ vfs_modevent, \ & fsname ## _vfsconf \ }; \ DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE) extern char *mountrootfsname; /* * exported vnode operations */ int dounmount(struct mount *, int, struct thread *); int kernel_mount(struct mntarg *ma, uint64_t flags); int kernel_vmount(int flags, ...); struct mntarg *mount_arg(struct mntarg *ma, const char *name, const void *val, int len); struct mntarg *mount_argb(struct mntarg *ma, int flag, const char *name); struct mntarg *mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...); struct mntarg *mount_argsu(struct mntarg *ma, const char *name, const void *val, int len); void statfs_scale_blocks(struct statfs *sf, long max_size); struct vfsconf *vfs_byname(const char *); struct vfsconf *vfs_byname_kld(const char *, struct thread *td, int *); void vfs_mount_destroy(struct mount *); void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); void vfs_freeopts(struct vfsoptlist *opts); void vfs_deleteopt(struct vfsoptlist *opts, const char *name); int vfs_buildopts(struct uio *auio, struct vfsoptlist **options); int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val); int vfs_getopt(struct vfsoptlist *, const char *, void **, int *); int vfs_getopt_pos(struct vfsoptlist *opts, const char *name); int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value); char *vfs_getopts(struct vfsoptlist *, const char *, int *error); int vfs_copyopt(struct vfsoptlist *, const char *, void *, int); int vfs_filteropt(struct vfsoptlist *, const char **legal); void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...); int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...); int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len); int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len); int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value); int vfs_setpublicfs /* set publicly exported fs */ (struct mount *, struct netexport *, struct export_args *); void vfs_msync(struct mount *, int); int vfs_busy(struct mount *, int); int vfs_export /* process mount export info */ (struct mount *, struct export_args *); void vfs_allocate_syncvnode(struct mount *); void vfs_deallocate_syncvnode(struct mount *); int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions); void vfs_getnewfsid(struct mount *); struct cdev *vfs_getrootfsid(struct mount *); struct mount *vfs_getvfs(fsid_t *); /* return vfs given fsid */ struct mount *vfs_busyfs(fsid_t *); int vfs_modevent(module_t, int, void *); void vfs_mount_error(struct mount *, const char *, ...); void vfs_mountroot(void); /* mount our root filesystem */ void vfs_mountedfrom(struct mount *, const char *from); void vfs_notify_upper(struct vnode *, int); void vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp); void vfs_ref(struct mount *); void vfs_rel(struct mount *); struct mount *vfs_mount_alloc(struct vnode *, struct vfsconf *, const char *, struct ucred *); int vfs_suser(struct mount *, struct thread *); void vfs_unbusy(struct mount *); void vfs_unmountall(void); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx mountlist_mtx; extern struct nfs_public nfs_pub; extern struct sx vfsconf_sx; #define vfsconf_lock() sx_xlock(&vfsconf_sx) #define vfsconf_unlock() sx_xunlock(&vfsconf_sx) #define vfsconf_slock() sx_slock(&vfsconf_sx) #define vfsconf_sunlock() sx_sunlock(&vfsconf_sx) /* * Declarations for these vfs default operations are located in * kern/vfs_default.c. They will be automatically used to replace * null entries in VFS ops tables when registering a new filesystem * type in the global table. */ vfs_root_t vfs_stdroot; vfs_quotactl_t vfs_stdquotactl; vfs_statfs_t vfs_stdstatfs; vfs_sync_t vfs_stdsync; vfs_sync_t vfs_stdnosync; vfs_vget_t vfs_stdvget; vfs_fhtovp_t vfs_stdfhtovp; vfs_checkexp_t vfs_stdcheckexp; vfs_init_t vfs_stdinit; vfs_uninit_t vfs_stduninit; vfs_extattrctl_t vfs_stdextattrctl; vfs_sysctl_t vfs_stdsysctl; void syncer_suspend(void); void syncer_resume(void); #else /* !_KERNEL */ #include struct stat; __BEGIN_DECLS int fhopen(const struct fhandle *, int); int fhstat(const struct fhandle *, struct stat *); int fhstatfs(const struct fhandle *, struct statfs *); int fstatfs(int, struct statfs *); int getfh(const char *, fhandle_t *); int getfsstat(struct statfs *, long, int); int getmntinfo(struct statfs **, int); int lgetfh(const char *, fhandle_t *); int mount(const char *, const char *, int, void *); int nmount(struct iovec *, unsigned int, int); int statfs(const char *, struct statfs *); int unmount(const char *, int); /* C library stuff */ int getvfsbyname(const char *, struct xvfsconf *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_MOUNT_H_ */ Index: stable/10/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- stable/10/sys/ufs/ffs/ffs_vfsops.c (revision 282269) +++ stable/10/sys/ufs/ffs/ffs_vfsops.c (revision 282270) @@ -1,2216 +1,2216 @@ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static int ffs_sync_lazy(struct mount *mp); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_susp_clean = process_deferred_inactive, }; VFS_SET(ufs_vfsops, ufs, 0); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; /* * Note that userquota and groupquota options are not currently used * by UFS/FFS code and generally mount(8) does not pass those options * from userland, but they can be passed by loader(8) via * vfs.root.mountfrom.options. */ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "groupquota", "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir", "nosymfollow", "sync", "union", "userquota", NULL }; static int ffs_mount(struct mount *mp) { struct vnode *devvp; struct thread *td; struct ufsmount *ump = NULL; struct fs *fs; pid_t fsckpid = 0; int error, flags; uint64_t mntorflags; accmode_t accmode; struct nameidata ndp; char *fspec; td = curthread; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } vfs_deleteopt(mp->mnt_optnew, "groupquota"); vfs_deleteopt(mp->mnt_optnew, "userquota"); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) { mntorflags |= MNT_SNAPSHOT; /* * Once we have set the MNT_SNAPSHOT flag, do not * persist "snapshot" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "snapshot"); vfs_deleteopt(mp->mnt_opt, "snapshot"); } if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 && vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) { /* * Once we have set the restricted PID, do not * persist "fsckpid" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "fsckpid"); vfs_deleteopt(mp->mnt_opt, "fsckpid"); if (mp->mnt_flag & MNT_UPDATE) { if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } } else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } /* Set to -1 if we are done */ if (fsckpid == 0) fsckpid = -1; } if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { if (mntorflags & MNT_ACLS) { vfs_mount_error(mp, "\"acls\" and \"nfsv4acls\" options " "are mutually exclusive"); return (EINVAL); } mntorflags |= MNT_NFS4ACLS; } MNT_ILOCK(mp); mp->mnt_flag |= mntorflags; MNT_IUNLOCK(mp); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; if (fsckpid == -1 && ump->um_fsckpid > 0) { if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) return (error); DROP_GIANT(); g_topology_lock(); /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); ump->um_fsckpid = 0; } if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * Flush any dirty data and suspend filesystem. */ if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (MOUNTEDSOFTDEP(mp)) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vfs_write_resume(mp, 0); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s Update error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vfs_write_resume(mp, 0); return (error); } if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); DROP_GIANT(); g_topology_lock(); /* * Drop our write and exclusive access. */ g_access(ump->um_cp, 0, -1, -1); g_topology_unlock(); PICKUP_GIANT(); fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); /* * Allow the writers to note that filesystem * is ro now. */ vfs_write_resume(mp, 0); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td, 0)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If we are running a checker, do not allow upgrade. */ if (ump->um_fsckpid > 0) { vfs_mount_error(mp, "Active checker, cannot upgrade to write"); return (EINVAL); } /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0); return (error); } VOP_UNLOCK(devvp, 0); fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly " "dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s.%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate" " journal contents"); return (EPERM); } } DROP_GIANT(); g_topology_lock(); /* * Request exclusive write access. */ error = g_access(ump->um_cp, 0, 1, 1); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); fs->fs_mtime = time_second; /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ vn_finished_write(mp); return (error); } fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { vn_finished_write(mp); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vn_finished_write(mp); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (MOUNTEDSOFTDEP(mp)) { /* XXX: Reset too late ? */ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_ASYNC; MNT_IUNLOCK(mp); } /* * Keep MNT_ACLS flag if it is stored in superblock. */ if ((fs->fs_flags & FS_ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } /* * If this is a request from fsck to clean up the filesystem, * then allow the specified pid to proceed. */ if (fsckpid > 0) { if (ump->um_fsckpid != 0) { vfs_mount_error(mp, "Active checker already running on %s", fs->fs_fsmnt); return (EINVAL); } KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); DROP_GIANT(); g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) { vfs_mount_error(mp, "Checker activation failed on %s", fs->fs_fsmnt); return (error); } ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, fspec)); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(&ndp)) != 0) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } if (fsckpid > 0) { KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; DROP_GIANT(); g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) { printf("WARNING: %s: Checker activation " "failed\n", fs->fs_fsmnt); } else { ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } } } vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct ufs_args args; struct export_args exp; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); vfs_oexport_conv(&args.export, &exp); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &exp, sizeof(exp)); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). If the 'force' flag * is 0, the filesystem must be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ int ffs_reload(struct mount *mp, struct thread *td, int force) { struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, size, error; int32_t *lp; ump = VFSTOUFS(mp); MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && force == 0) { MNT_IUNLOCK(mp); return (EINVAL); } MNT_IUNLOCK(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp, 0); /* * Step 2: re-read superblock from disk. */ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ newfs->fs_csp = fs->fs_csp; newfs->fs_maxcluster = fs->fs_maxcluster; newfs->fs_contigdirs = fs->fs_contigdirs; newfs->fs_active = fs->fs_active; newfs->fs_ronly = fs->fs_ronly; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: reload pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); free(fs->fs_csp, M_UFSMNT); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Skip syncer vnode. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0); vrele(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } ffs_load_inode(bp, ip, fs, ip->i_number); ip->i_effnlink = ip->i_nlink; brelse(bp); VOP_UNLOCK(vp, 0); vrele(vp); } return (0); } /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; /* * Common code for mount and mountroot */ static int ffs_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct buf *bp; struct fs *fs; struct cdev *dev; void *space; ufs2_daddr_t sblockloc; int error, i, blks, size, ronly; int32_t *lp; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; bp = NULL; ump = NULL; cred = td ? td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; dev = devvp->v_rdev; dev_ref(dev); DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0); if (error) goto out; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; devvp->v_bufobj.bo_ops = &ffs_ops; fs = NULL; sblockloc = 0; /* * Try reading the superblock in each of its possible locations. */ for (i = 0; sblock_try[i] != -1; i++) { if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE, cred, &bp)) != 0) goto out; fs = (struct fs *)bp->b_data; sblockloc = sblock_try[i]; if ((fs->fs_magic == FS_UFS1_MAGIC || (fs->fs_magic == FS_UFS2_MAGIC && (fs->fs_sblockloc == sblockloc || (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) && fs->fs_bsize <= MAXBSIZE && fs->fs_bsize >= sizeof(struct fs)) break; brelse(bp); bp = NULL; } if (sblock_try[i] == -1) { error = EINVAL; /* XXX needs translation */ goto out; } fs->fs_fmod = 0; fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indicies */ fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck.", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate journal contents"); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("WARNING: %s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: mount pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. */ size = 1024; mp->mnt_gjprovider = malloc(size, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &size, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, size, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { printf("WARNING: %s: GJOURNAL flag on fs " "but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf("WARNING: %s: GJOURNAL flag on fs but no " "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; ump->um_rdonly = ffs_rdonly; ump->um_snapgone = ffs_snapgone; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBLOCKSIZE) bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); bp = NULL; fs = ump->um_fs; ffs_oldfscompat_read(fs, ump, sblockloc); fs->fs_ronly = ronly; size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, cred, &bp)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); fs->fs_active = NULL; mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf("WARNING: %s: multilabel flag on fs but " "no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_NFS4ACLS) printf("WARNING: %s: ACLs flag on fs conflicts with " "\"nfsv4acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_NFS4ACLS; mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_ACLS) printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts " "with \"acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_ACLS; mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: NFSv4 ACLs flag on fs but no " "ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_TRIM) != 0) { size = sizeof(int); if (g_io_getattr("GEOM::candelete", cp, &size, &ump->um_candelete) == 0) { if (!ump->um_candelete) printf("WARNING: %s: TRIM flag on fs but disk " "does not support TRIM\n", mp->mnt_stat.f_mntonname); } else { printf("WARNING: %s: TRIM flag on fs but disk does " "not confirm that it supports TRIM\n", mp->mnt_stat.f_mntonname); ump->um_candelete = 0; } } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); mp->mnt_stat.f_iosize = fs->fs_bsize; if (mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); ffs_flushfiles(mp, FORCECLOSE, td); goto out; } if (devvp->v_type == VCHR && devvp->v_rdev != NULL) devvp->v_rdev->si_mountpt = mp; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem stat information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | - MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS; + MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ return (0); out: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); PICKUP_GIANT(); } if (ump) { mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; } dev_rel(dev); return (error); } #include static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags) struct mount *mp; int mntflags; { struct thread *td; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags, susp; #ifdef UFS_EXTATTR int e_restart; #endif flags = 0; td = curthread; fs = ump->um_fs; susp = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; susp = fs->fs_ronly == 0; } #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("WARNING: unmount %s: ufs_extattr_stop " "returned errno %d\n", mp->mnt_stat.f_mntonname, error); e_restart = 0; } else { ufs_extattr_uepm_destroy(&ump->um_extattr); e_restart = 1; } #endif if (susp) { error = vfs_write_suspend_umnt(mp); if (error != 0) goto fail1; } if (MOUNTEDSOFTDEP(mp)) error = softdep_flushfiles(mp, flags, td); else error = ffs_flushfiles(mp, flags, td); if (error != 0 && error != ENXIO) goto fail; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: unmount %s: pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (error && error != ENXIO) { fs->fs_clean = 0; goto fail; } } if (susp) vfs_write_resume(mp, VR_START_WRITE); DROP_GIANT(); g_topology_lock(); if (ump->um_fsckpid > 0) { /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); ump->um_fsckpid = 0; } g_vfs_close(ump->um_cp); g_topology_unlock(); PICKUP_GIANT(); if (ump->um_devvp->v_type == VCHR && ump->um_devvp->v_rdev != NULL) ump->um_devvp->v_rdev->si_mountpt = NULL; vrele(ump->um_devvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); fail: if (susp) vfs_write_resume(mp, VR_START_WRITE); fail1: #ifdef UFS_EXTATTR if (e_restart) { ufs_extattr_uepm_init(&ump->um_extattr); #ifdef UFS_EXTATTR_AUTOSTART (void) ufs_extattr_autostart(mp, td); #endif } #endif return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int qerror, error; ump = VFSTOUFS(mp); qerror = 0; #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { error = quotaoff(td, mp, i); if (error != 0) { if ((flags & EARLYFLUSH) == 0) return (error); else qerror = error; } } /* * Here we fall through to vflush again to ensure that * we have gotten rid of all the system vnodes, unless * quotas must not be closed. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Do not close system files if quotas were not closed, to be * able to sync the remaining dquots. The freeblks softupdate * workitems might hold a reference on a dquot, preventing * quotaoff() from completing. Next round of * softdep_flushworklist() iteration should process the * blockers, allowing the next run of quotaoff() to finally * flush held dquots. * * Otherwise, flush all the files. */ if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp, 0); return (error); } /* * Get filesystem statistics. */ static int ffs_statfs(mp, sbp) struct mount *mp; struct statfs *sbp; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = NAME_MAX; return (0); } /* * For a lazy sync, we only care about access times, quotas and the * superblock. Other filesystem changes are already converted to * cylinder group blocks or inode blocks updates and are written to * disk by syncer. */ static int ffs_sync_lazy(mp) struct mount *mp; { struct vnode *mvp, *vp; struct inode *ip; struct thread *td; int allerror, error; allerror = 0; td = curthread; if ((mp->mnt_flag & MNT_NOATIME) != 0) goto qupdate; MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); /* * The IN_ACCESS flag is converted to IN_MODIFIED by * ufs_close() and ufs_getattr() by the calls to * ufs_itimes_locked(), without subsequent UFS_UPDATE(). * Test also all the other timestamp flags too, to pick up * any other cases that could be missed. */ if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td)) != 0) continue; error = ffs_update(vp, 0); if (error != 0) allerror = error; vput(vp); } qupdate: #ifdef QUOTA qsync(mp); #endif if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 && (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0) allerror = error; return (allerror); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked busy using * vfs_busy(). */ static int ffs_sync(mp, waitfor) struct mount *mp; int waitfor; { struct vnode *mvp, *vp, *devvp; struct thread *td; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, wait, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; wait = 0; suspend = 0; suspended = 0; td = curthread; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0) panic("%s: ffs_sync: modification on read-only filesystem", fs->fs_fsmnt); if (waitfor == MNT_LAZY) { if (!rebooting) return (ffs_sync_lazy(mp)); waitfor = MNT_NOWAIT; } /* * Write back each (modified) inode. */ lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) { wait = 1; lockreq = LK_EXCLUSIVE; } lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; loop: /* Grab snapshot of secondary write counts */ MNT_ILOCK(mp); secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; MNT_IUNLOCK(mp); /* Grab snapshot of softdep dependency counts */ softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Depend on the vnode interlock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, lockreq, td)) != 0) { if (error == ENOENT || error == ENOLCK) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0) allerror = error; vput(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT || rebooting) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) goto loop; } #ifdef QUOTA qsync(mp); #endif devvp = ump->um_devvp; bo = &devvp->v_bufobj; BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, waitfor, td); VOP_UNLOCK(devvp, 0); if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN)) error = ffs_sbupdate(ump, waitfor, 0); if (error != 0) allerror = error; if (allerror == 0 && waitfor == MNT_WAIT) goto loop; } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) { MNT_IUNLOCK(mp); goto loop; /* More work needed */ } mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else BO_UNLOCK(bo); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (ffs_vgetf(mp, ino, flags, vpp, 0)); } int ffs_vgetf(mp, ino, flags, vpp, ffs_flags) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; int ffs_flags; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; struct cdev *dev; int error; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); dev = ump->um_dev; fs = ump->um_fs; ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if (fs->fs_magic == FS_UFS1_MAGIC) error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp); else error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree(uma_inode, ip); return (error); } /* * FFS supports recursive locking. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); VN_LOCK_AREC(vp); vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_fs = fs; ip->i_dev = dev; ip->i_number = ino; ip->i_ea_refs = 0; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif if (ffs_flags & FFSV_FORCEINSMQ) vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) { uma_zfree(uma_inode, ip); *vpp = NULL; return (error); } vp->v_vflag &= ~VV_FORCEINSMQ; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } if (ip->i_ump->um_fstype == UFS1) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); ffs_load_inode(bp, ip, fs, ino); if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if (ip->i_ump->um_fstype == UFS1) error = ufs_vinit(mp, &ffs_fifoops1, &vp); else error = ufs_vinit(mp, &ffs_fifoops2, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ if (vp->v_type != VFIFO) { /* FFS supports shared locking for all files except fifos. */ VN_LOCK_ASHARE(vp); } /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = arc4random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { ip->i_flag |= IN_MODIFIED; DIP_SET(ip, i_gen, ip->i_gen); } } #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_vnode_associate_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { struct ufid *ufhp; struct fs *fs; ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); return (ufs_fhtovp(mp, ufhp, flags, vpp)); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { ffs_susp_initialize(); softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); ffs_susp_uninitialize(); return (ret); } /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(ump, waitfor, suspended) struct ufsmount *ump; int waitfor; int suspended; { struct fs *fs = ump->um_fs; struct buf *sbbp; struct buf *bp; int blks; void *space; int i, size, error, allerror = 0; if (fs->fs_ronly == 1 && (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * First write back the summary information. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(ump->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0, 0); bcopy(space, bp->b_data, (u_int)size); space = (char *)space + size; if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; } /* * Now write back the superblock itself. If any errors occurred * up to this point, then fail so that the superblock avoids * being written out as clean. */ if (allerror) { brelse(sbbp); return (allerror); } bp = sbbp; if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } fs->fs_fmod = 0; fs->fs_time = time_second; if (MOUNTEDSOFTDEP(ump->um_mountp)) softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, ump); if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; return (allerror); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ pbrelvp(bp); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { struct buf *newbp; int oldflags; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (!BUF_ISLOCKED(bp)) panic("bufwrite: buffer is not busy???"); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } BO_UNLOCK(bp->b_bufobj); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); if (newbp == NULL) goto normal_write; KASSERT((bp->b_flags & B_UNMAPPED) == 0, ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; BO_UNLOCK(bp->b_bufobj); newbp->b_xflags |= BX_BKGRDMARKER; newbp->b_lblkno = bp->b_lblkno; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; pbgetvp(bp->b_vp, newbp); #ifdef SOFTUPDATES /* * Move over the dependencies. If there are rollbacks, * leave the parent buffer dirtied as it will need to * be written again. */ if (LIST_EMPTY(&bp->b_dep) || softdep_move_dependencies(bp, newbp) == 0) bundirty(bp); #else bundirty(bp); #endif /* * Initiate write on the copy, release the original. The * BKGRDINPROG flag prevents it from going away until * the background write completes. */ bqrelse(bp); bp = newbp; } else /* Mark the buffer clean */ bundirty(bp); /* Let the normal bufwrite do the rest for us */ normal_write: return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; int error; struct buf *tbp; int nocopy; vp = bo->__bo_vnode; if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); nocopy = bp->b_flags & B_NOCOPY; bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY); if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif } g_vfs_strategy(bo, bp); } int ffs_own_mount(const struct mount *mp) { if (mp->mnt_op == &ufs_vfsops) return (1); return (0); } #ifdef DDB #ifdef SOFTUPDATES /* defined in ffs_softdep.c */ extern void db_print_ffs(struct ufsmount *ump); DB_SHOW_COMMAND(ffs, db_show_ffs) { struct mount *mp; struct ufsmount *ump; if (have_addr) { ump = VFSTOUFS((struct mount *)addr); db_print_ffs(ump); return; } TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name)) db_print_ffs(VFSTOUFS(mp)); } } #endif /* SOFTUPDATES */ #endif /* DDB */ Index: stable/10 =================================================================== --- stable/10 (revision 282269) +++ stable/10 (revision 282270) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r281562