Index: head/sys/fs/cd9660/cd9660_vnops.c =================================================================== --- head/sys/fs/cd9660/cd9660_vnops.c (revision 154151) +++ head/sys/fs/cd9660/cd9660_vnops.c (revision 154152) @@ -1,832 +1,833 @@ /*- * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cd9660_vnops.c 8.19 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static vop_setattr_t cd9660_setattr; static vop_open_t cd9660_open; static vop_access_t cd9660_access; static vop_getattr_t cd9660_getattr; static vop_ioctl_t cd9660_ioctl; static vop_pathconf_t cd9660_pathconf; static vop_read_t cd9660_read; struct isoreaddir; static int iso_uiodir(struct isoreaddir *idp, struct dirent *dp, off_t off); static int iso_shipdir(struct isoreaddir *idp); static vop_readdir_t cd9660_readdir; static vop_readlink_t cd9660_readlink; static vop_strategy_t cd9660_strategy; /* * Setattr call. Only allowed for block and character special devices. 
*/ static int cd9660_setattr(ap) struct vop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; if (vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) return (EROFS); if (vap->va_size != (u_quad_t)VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: return (EROFS); case VCHR: case VBLK: case VSOCK: case VFIFO: case VNON: case VBAD: + case VMARKER: return (0); } } return (0); } /* * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC. * The mode is shifted to select the owner/group/other fields. The * super user is granted all permissions. */ /* ARGSUSED */ static int cd9660_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); mode_t mode = ap->a_mode; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); /* * Disallow write attempts unless the file is a socket, * fifo, or a block or character device resident on the * filesystem. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: return (EROFS); /* NOT REACHED */ default: break; } } return (vaccess(vp->v_type, ip->inode.iso_mode, ip->inode.iso_uid, ip->inode.iso_gid, ap->a_mode, ap->a_cred, NULL)); } static int cd9660_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; } */ *ap; { struct iso_node *ip = VTOI(ap->a_vp); vnode_create_vobject(ap->a_vp, ip->i_size, ap->a_td); return 0; } static int cd9660_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct iso_node *ip = VTOI(vp); vap->va_fsid = dev2udev(ip->i_mnt->im_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->inode.iso_mode; vap->va_nlink = ip->inode.iso_links; vap->va_uid = ip->inode.iso_uid; vap->va_gid = ip->inode.iso_gid; vap->va_atime = ip->inode.iso_atime; vap->va_mtime = ip->inode.iso_mtime; vap->va_ctime = ip->inode.iso_ctime; vap->va_rdev = ip->inode.iso_rdev; vap->va_size = (u_quad_t) ip->i_size; if (ip->i_size == 0 && (vap->va_mode & S_IFMT) == S_IFLNK) { struct vop_readlink_args rdlnk; struct iovec aiov; struct uio auio; char *cp; MALLOC(cp, char *, MAXPATHLEN, M_TEMP, M_WAITOK); aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = ap->a_td; auio.uio_resid = MAXPATHLEN; rdlnk.a_uio = &auio; rdlnk.a_vp = ap->a_vp; rdlnk.a_cred = ap->a_cred; if (cd9660_readlink(&rdlnk) == 0) vap->va_size = MAXPATHLEN - auio.uio_resid; FREE(cp, M_TEMP); } vap->va_flags = 0; vap->va_gen = 1; vap->va_blocksize = ip->i_mnt->logical_block_size; vap->va_bytes = (u_quad_t) ip->i_size; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * Vnode op for ioctl. 
*/ static int cd9660_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); switch (ap->a_command) { case FIOGETLBA: *(int *)(ap->a_data) = ip->iso_start; return 0; default: return (ENOTTY); } } /* * Vnode op for reading. */ static int cd9660_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct iso_node *ip = VTOI(vp); struct iso_mnt *imp; struct buf *bp; daddr_t lbn, rablock; off_t diff; int rasize, error = 0; int seqcount; long size, n, on; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); seqcount = ap->a_ioflag >> IO_SEQSHIFT; if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); ip->i_flag |= IN_ACCESS; imp = ip->i_mnt; do { lbn = lblkno(imp, uio->uio_offset); on = blkoff(imp, uio->uio_offset); n = min((u_int)(imp->logical_block_size - on), uio->uio_resid); diff = (off_t)ip->i_size - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; size = blksize(imp, ip, lbn); rablock = lbn + 1; if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { if (lblktosize(imp, rablock) < ip->i_size) error = cluster_read(vp, (off_t)ip->i_size, lbn, size, NOCRED, uio->uio_resid, (ap->a_ioflag >> 16), &bp); else error = bread(vp, lbn, size, NOCRED, &bp); } else { if (seqcount > 1 && lblktosize(imp, rablock) < ip->i_size) { rasize = blksize(imp, ip, rablock); error = breadn(vp, lbn, size, &rablock, &rasize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); } n = min(n, size - bp->b_resid); if (error) { brelse(bp); return (error); } error = uiomove(bp->b_data + on, (int)n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); } /* * Structure for reading directories */ struct isoreaddir { struct dirent saveent; struct dirent assocent; struct dirent current; off_t saveoff; off_t assocoff; off_t curroff; struct uio *uio; off_t uio_off; int eofflag; u_long *cookies; int ncookies; }; static int iso_uiodir(idp,dp,off) struct isoreaddir *idp; struct dirent *dp; off_t off; { int error; dp->d_name[dp->d_namlen] = 0; dp->d_reclen = GENERIC_DIRSIZ(dp); if (idp->uio->uio_resid < dp->d_reclen) { idp->eofflag = 0; return (-1); } if (idp->cookies) { if (idp->ncookies <= 0) { idp->eofflag = 0; return (-1); } *idp->cookies++ = off; --idp->ncookies; } if ((error = uiomove(dp, dp->d_reclen, idp->uio)) != 0) return (error); idp->uio_off = off; return (0); } static int iso_shipdir(idp) struct isoreaddir *idp; { struct dirent *dp; int cl, sl, assoc; int error; char *cname, *sname; cl = idp->current.d_namlen; cname = idp->current.d_name; assoc = (cl > 1) && (*cname == ASSOCCHAR); if (assoc) { cl--; cname++; } dp = &idp->saveent; sname = dp->d_name; if (!(sl = dp->d_namlen)) { dp = &idp->assocent; sname = dp->d_name + 1; sl = dp->d_namlen - 1; } if (sl > 0) { if (sl != cl || bcmp(sname,cname,sl)) { if (idp->assocent.d_namlen) { if ((error = iso_uiodir(idp,&idp->assocent,idp->assocoff)) != 0) return (error); idp->assocent.d_namlen = 0; } if (idp->saveent.d_namlen) { if ((error = iso_uiodir(idp,&idp->saveent,idp->saveoff)) != 0) return (error); idp->saveent.d_namlen = 0; } } } idp->current.d_reclen = GENERIC_DIRSIZ(&idp->current); if (assoc) { idp->assocoff = 
idp->curroff; bcopy(&idp->current,&idp->assocent,idp->current.d_reclen); } else { idp->saveoff = idp->curroff; bcopy(&idp->current,&idp->saveent,idp->current.d_reclen); } return (0); } /* * Vnode op for readdir */ static int cd9660_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { struct uio *uio = ap->a_uio; struct isoreaddir *idp; struct vnode *vdp = ap->a_vp; struct iso_node *dp; struct iso_mnt *imp; struct buf *bp = NULL; struct iso_directory_record *ep; int entryoffsetinblock; doff_t endsearch; u_long bmask; int error = 0; int reclen; u_short namelen; int ncookies = 0; u_long *cookies = NULL; dp = VTOI(vdp); imp = dp->i_mnt; bmask = imp->im_bmask; MALLOC(idp, struct isoreaddir *, sizeof(*idp), M_TEMP, M_WAITOK); idp->saveent.d_namlen = idp->assocent.d_namlen = 0; /* * XXX * Is it worth trying to figure out the type? */ idp->saveent.d_type = idp->assocent.d_type = idp->current.d_type = DT_UNKNOWN; idp->uio = uio; if (ap->a_ncookies == NULL) { idp->cookies = NULL; } else { /* * Guess the number of cookies needed. */ ncookies = uio->uio_resid / 16; MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); idp->cookies = cookies; idp->ncookies = ncookies; } idp->eofflag = 1; idp->curroff = uio->uio_offset; if ((entryoffsetinblock = idp->curroff & bmask) && (error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp))) { FREE(idp, M_TEMP); return (error); } endsearch = dp->i_size; while (idp->curroff < endsearch) { /* * If offset is on a block boundary, * read the next directory block. * Release previous if it exists. */ if ((idp->curroff & bmask) == 0) { if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp)) != 0) break; entryoffsetinblock = 0; } /* * Get pointer to next entry. 
*/ ep = (struct iso_directory_record *) ((char *)bp->b_data + entryoffsetinblock); reclen = isonum_711(ep->length); if (reclen == 0) { /* skip to next block, if any */ idp->curroff = (idp->curroff & ~bmask) + imp->logical_block_size; continue; } if (reclen < ISO_DIRECTORY_RECORD_SIZE) { error = EINVAL; /* illegal entry, stop */ break; } if (entryoffsetinblock + reclen > imp->logical_block_size) { error = EINVAL; /* illegal directory, so stop looking */ break; } idp->current.d_namlen = isonum_711(ep->name_len); if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) { error = EINVAL; /* illegal entry, stop */ break; } if (isonum_711(ep->flags)&2) idp->current.d_fileno = isodirino(ep, imp); else idp->current.d_fileno = dbtob(bp->b_blkno) + entryoffsetinblock; idp->curroff += reclen; switch (imp->iso_ftype) { case ISO_FTYPE_RRIP: cd9660_rrip_getname(ep,idp->current.d_name, &namelen, &idp->current.d_fileno,imp); idp->current.d_namlen = (u_char)namelen; if (idp->current.d_namlen) error = iso_uiodir(idp,&idp->current,idp->curroff); break; default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 || ISO_FTYPE_HIGH_SIERRA*/ strcpy(idp->current.d_name,".."); if (idp->current.d_namlen == 1 && ep->name[0] == 0) { idp->current.d_namlen = 1; error = iso_uiodir(idp,&idp->current,idp->curroff); } else if (idp->current.d_namlen == 1 && ep->name[0] == 1) { idp->current.d_namlen = 2; error = iso_uiodir(idp,&idp->current,idp->curroff); } else { isofntrans(ep->name,idp->current.d_namlen, idp->current.d_name, &namelen, imp->iso_ftype == ISO_FTYPE_9660, isonum_711(ep->flags)&4, imp->joliet_level, imp->im_flags, imp->im_d2l); idp->current.d_namlen = (u_char)namelen; if (imp->iso_ftype == ISO_FTYPE_DEFAULT) error = iso_shipdir(idp); else error = iso_uiodir(idp,&idp->current,idp->curroff); } } if (error) break; entryoffsetinblock += reclen; } if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) { idp->current.d_namlen = 0; error = iso_shipdir(idp); } if (error < 0) error = 0; if (ap->a_ncookies != NULL) { if (error) free(cookies, M_TEMP); else { /* * Work out the number of cookies actually used. */ *ap->a_ncookies = ncookies - idp->ncookies; *ap->a_cookies = cookies; } } if (bp) brelse (bp); uio->uio_offset = idp->uio_off; *ap->a_eofflag = idp->eofflag; FREE(idp, M_TEMP); return (error); } /* * Return target name of a symbolic link * Shouldn't we get the parent vnode and read the data from there? * This could eventually result in deadlocks in cd9660_lookup. * But otherwise the block read here is in the block buffer two times. */ typedef struct iso_directory_record ISODIR; typedef struct iso_node ISONODE; typedef struct iso_mnt ISOMNT; static int cd9660_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { ISONODE *ip; ISODIR *dirp; ISOMNT *imp; struct buf *bp; struct uio *uio; u_short symlen; int error; char *symname; ip = VTOI(ap->a_vp); imp = ip->i_mnt; uio = ap->a_uio; if (imp->iso_ftype != ISO_FTYPE_RRIP) return (EINVAL); /* * Get parents directory record block that this inode included. */ error = bread(imp->im_devvp, (ip->i_number >> imp->im_bshift) << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { brelse(bp); return (EINVAL); } /* * Setup the directory pointer for this inode */ dirp = (ISODIR *)(bp->b_data + (ip->i_number & imp->im_bmask)); /* * Just make sure, we have a right one.... 
* 1: Check not cross boundary on block */ if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length) > (unsigned)imp->logical_block_size) { brelse(bp); return (EINVAL); } /* * Now get a buffer * Abuse a namei buffer for now. */ if (uio->uio_segflg == UIO_SYSSPACE) symname = uio->uio_iov->iov_base; else symname = uma_zalloc(namei_zone, M_WAITOK); /* * Ok, we just gathering a symbolic name in SL record. */ if (cd9660_rrip_getsymname(dirp, symname, &symlen, imp) == 0) { if (uio->uio_segflg != UIO_SYSSPACE) uma_zfree(namei_zone, symname); brelse(bp); return (EINVAL); } /* * Don't forget before you leave from home ;-) */ brelse(bp); /* * return with the symbolic name to caller's. */ if (uio->uio_segflg != UIO_SYSSPACE) { error = uiomove(symname, symlen, uio); uma_zfree(namei_zone, symname); return (error); } uio->uio_resid -= symlen; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + symlen; uio->uio_iov->iov_len -= symlen; return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ static int cd9660_strategy(ap) struct vop_strategy_args /* { struct buf *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct iso_node *ip; struct bufobj *bo; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("cd9660_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { bp->b_blkno = (ip->iso_start + bp->b_lblkno) << (ip->i_mnt->im_bshift - DEV_BSHIFT); if ((long)bp->b_blkno == -1) /* XXX: cut&paste junk ? */ clrbuf(bp); } if ((long)bp->b_blkno == -1) { /* XXX: cut&paste junk ? */ bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); bo = ip->i_mnt->im_bo; BO_STRATEGY(bo, bp); return (0); } /* * Return POSIX pathconf information applicable to cd9660 filesystems. 
*/ static int cd9660_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP) *ap->a_retval = NAME_MAX; else *ap->a_retval = 37; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Global vfs data structures for cd9660 */ struct vop_vector cd9660_vnodeops = { .vop_default = &default_vnodeops, .vop_open = cd9660_open, .vop_access = cd9660_access, .vop_bmap = cd9660_bmap, .vop_cachedlookup = cd9660_lookup, .vop_getattr = cd9660_getattr, .vop_inactive = cd9660_inactive, .vop_ioctl = cd9660_ioctl, .vop_lookup = vfs_cache_lookup, .vop_pathconf = cd9660_pathconf, .vop_read = cd9660_read, .vop_readdir = cd9660_readdir, .vop_readlink = cd9660_readlink, .vop_reclaim = cd9660_reclaim, .vop_setattr = cd9660_setattr, .vop_strategy = cd9660_strategy, }; /* * Special device vnode ops */ struct vop_vector cd9660_fifoops = { .vop_default = &fifo_specops, .vop_access = cd9660_access, .vop_getattr = cd9660_getattr, .vop_inactive = cd9660_inactive, .vop_reclaim = cd9660_reclaim, .vop_setattr = cd9660_setattr, }; Index: head/sys/gnu/fs/ext2fs/ext2_vfsops.c =================================================================== --- head/sys/gnu/fs/ext2fs/ext2_vfsops.c (revision 154151) +++ head/sys/gnu/fs/ext2fs/ext2_vfsops.c (revision 154152) @@ -1,1157 +1,1161 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 * $FreeBSD$ */ /*- * COPYRIGHT.INFO says this has some GPL'd code from ext2_super.c in it * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td); static int ext2_mountfs(struct vnode *, struct mount *, struct thread *); static int ext2_reload(struct mount *mp, struct thread *td); static int ext2_sbupdate(struct ext2mount *, int); static vfs_unmount_t ext2_unmount; static vfs_root_t ext2_root; static vfs_statfs_t ext2_statfs; static vfs_sync_t ext2_sync; static vfs_vget_t ext2_vget; static vfs_fhtovp_t ext2_fhtovp; static vfs_vptofh_t ext2_vptofh; static vfs_mount_t ext2_mount; MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part"); static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure"); static struct vfsops ext2fs_vfsops = { .vfs_fhtovp = ext2_fhtovp, .vfs_mount = ext2_mount, .vfs_root = ext2_root, /* root inode via vget */ .vfs_statfs = ext2_statfs, .vfs_sync = ext2_sync, .vfs_unmount = ext2_unmount, .vfs_vget = ext2_vget, .vfs_vptofh = ext2_vptofh, }; VFS_SET(ext2fs_vfsops, ext2fs, 0); #define bsd_malloc malloc #define bsd_free free static int ext2_check_sb_compat(struct ext2_super_block *es, struct cdev *dev, int ronly); static int compute_sb_data(struct vnode * devvp, struct ext2_super_block * es, struct ext2_sb_info * fs); static const char *ext2_opts[] = { "from", "export" }; /* * VFS Operations. * * mount system call */ static int ext2_mount(mp, td) struct mount *mp; struct thread *td; { struct export_args *export; struct vfsoptlist *opts; struct vnode *devvp; struct ext2mount *ump = 0; struct ext2_sb_info *fs; char *path, *fspec; int error, flags, len; mode_t accessmode; struct nameidata nd, *ndp = &nd; opts = mp->mnt_optnew; if (vfs_filteropt(opts, ext2_opts)) return (EINVAL); vfs_getopt(opts, "fspath", (void **)&path, NULL); /* Double-check the length of path.. */ if (strlen(path) >= MAXMNTLEN - 1) return (ENAMETOOLONG); fspec = NULL; error = vfs_getopt(opts, "from", (void **)&fspec, &len); if (!error && fspec[len - 1] != '\0') return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. 
*/ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOEXT2(mp); fs = ump->um_e2fs; error = 0; if (fs->s_rd_only == 0 && vfs_flagopt(opts, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT, td); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (vfs_busy(mp, LK_NOWAIT, 0, td)) return (EBUSY); error = ext2_flushfiles(mp, flags, td); vfs_unbusy(mp, td); if (!error && fs->s_wasvalid) { fs->s_es->s_state |= EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); } fs->s_rd_only = 1; vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY); DROP_GIANT(); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); } if (!error && (mp->mnt_flag & MNT_RELOAD)) error = ext2_reload(mp, td); if (error) return (error); devvp = ump->um_devvp; if (fs->s_rd_only && !vfs_flagopt(opts, "ro", NULL, 0)) { if (ext2_check_sb_compat(fs->s_es, devvp->v_rdev, 0)) return (EPERM); /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ if (suser(td)) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if ((error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td)) != 0) { VOP_UNLOCK(devvp, 0, td); return (error); } VOP_UNLOCK(devvp, 0, td); } DROP_GIANT(); g_topology_lock(); error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); if ((fs->s_es->s_state & EXT2_VALID_FS) == 0 || (fs->s_es->s_state & EXT2_ERROR_FS)) { if (mp->mnt_flag & MNT_FORCE) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); return (EPERM); } } fs->s_es->s_state &= ~EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); fs->s_rd_only = 0; mp->mnt_flag &= ~MNT_RDONLY; } if (fspec == NULL) { error = vfs_getopt(opts, "export", (void **)&export, &len); if (error || len != sizeof(struct export_args)) return (EINVAL); /* Process export requests. */ return (vfs_export(mp, export)); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (fspec == NULL) return (EINVAL); NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(ndp)) != 0) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ if (suser(td)) { accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; if ((error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td)) != 0) { vput(devvp); return (error); } } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = ext2_mountfs(devvp, mp, td); } else { if (devvp != ump->um_devvp) { vput(devvp); return (EINVAL); /* needs translation */ } else vput(devvp); } if (error) { vrele(devvp); return (error); } ump = VFSTOEXT2(mp); fs = ump->um_e2fs; /* * Note that this strncpy() is ok because of a check at the start * of ext2_mount(). 
*/ strncpy(fs->fs_fsmnt, path, MAXMNTLEN); fs->fs_fsmnt[MAXMNTLEN - 1] = '\0'; vfs_mountedfrom(mp, fspec); return (0); } /* * checks that the data in the descriptor blocks make sense * this is taken from ext2/super.c */ static int ext2_check_descriptors (struct ext2_sb_info * sb) { int i; int desc_block = 0; unsigned long block = sb->s_es->s_first_data_block; struct ext2_group_desc * gdp = NULL; /* ext2_debug ("Checking group descriptors"); */ for (i = 0; i < sb->s_groups_count; i++) { /* examine next descriptor block */ if ((i % EXT2_DESC_PER_BLOCK(sb)) == 0) gdp = (struct ext2_group_desc *) sb->s_group_desc[desc_block++]->b_data; if (gdp->bg_block_bitmap < block || gdp->bg_block_bitmap >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Block bitmap for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_block_bitmap); return 0; } if (gdp->bg_inode_bitmap < block || gdp->bg_inode_bitmap >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Inode bitmap for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_inode_bitmap); return 0; } if (gdp->bg_inode_table < block || gdp->bg_inode_table + sb->s_itb_per_group >= block + EXT2_BLOCKS_PER_GROUP(sb)) { printf ("ext2_check_descriptors: " "Inode table for group %d" " not in group (block %lu)!\n", i, (unsigned long) gdp->bg_inode_table); return 0; } block += EXT2_BLOCKS_PER_GROUP(sb); gdp++; } return 1; } static int ext2_check_sb_compat(es, dev, ronly) struct ext2_super_block *es; struct cdev *dev; int ronly; { if (es->s_magic != EXT2_SUPER_MAGIC) { printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n", devtoname(dev), es->s_magic, EXT2_SUPER_MAGIC); return (1); } if (es->s_rev_level > EXT2_GOOD_OLD_REV) { if (es->s_feature_incompat & ~EXT2_FEATURE_INCOMPAT_SUPP) { printf( "WARNING: mount of %s denied due to unsupported optional features\n", devtoname(dev)); return (1); } if (!ronly && (es->s_feature_ro_compat & ~EXT2_FEATURE_RO_COMPAT_SUPP)) { printf( "WARNING: R/W mount of %s denied due to unsupported optional features\n", devtoname(dev)); return (1); } } return (0); } /* * this computes the fields of the ext2_sb_info structure from the * data in the ext2_super_block structure read in */ static int compute_sb_data(devvp, es, fs) struct vnode * devvp; struct ext2_super_block * es; struct ext2_sb_info * fs; { int db_count, error; int i, j; int logic_sb_block = 1; /* XXX for now */ #if 1 #define V(v) #else #define V(v) printf(#v"= %d\n", fs->v); #endif fs->s_blocksize = EXT2_MIN_BLOCK_SIZE << es->s_log_block_size; V(s_blocksize) fs->s_bshift = EXT2_MIN_BLOCK_LOG_SIZE + es->s_log_block_size; V(s_bshift) fs->s_fsbtodb = es->s_log_block_size + 1; V(s_fsbtodb) fs->s_qbmask = fs->s_blocksize - 1; V(s_bmask) fs->s_blocksize_bits = EXT2_BLOCK_SIZE_BITS(es); V(s_blocksize_bits) fs->s_frag_size = EXT2_MIN_FRAG_SIZE << es->s_log_frag_size; V(s_frag_size) if (fs->s_frag_size) fs->s_frags_per_block = fs->s_blocksize / fs->s_frag_size; V(s_frags_per_block) fs->s_blocks_per_group = es->s_blocks_per_group; V(s_blocks_per_group) fs->s_frags_per_group = es->s_frags_per_group; V(s_frags_per_group) fs->s_inodes_per_group = es->s_inodes_per_group; V(s_inodes_per_group) fs->s_inodes_per_block = fs->s_blocksize / EXT2_INODE_SIZE; V(s_inodes_per_block) fs->s_itb_per_group = fs->s_inodes_per_group /fs->s_inodes_per_block; V(s_itb_per_group) fs->s_desc_per_block = fs->s_blocksize / sizeof (struct ext2_group_desc); V(s_desc_per_block) /* s_resuid / s_resgid ? 
*/ fs->s_groups_count = (es->s_blocks_count - es->s_first_data_block + EXT2_BLOCKS_PER_GROUP(fs) - 1) / EXT2_BLOCKS_PER_GROUP(fs); V(s_groups_count) db_count = (fs->s_groups_count + EXT2_DESC_PER_BLOCK(fs) - 1) / EXT2_DESC_PER_BLOCK(fs); fs->s_db_per_group = db_count; V(s_db_per_group) fs->s_group_desc = bsd_malloc(db_count * sizeof (struct buf *), M_EXT2MNT, M_WAITOK); /* adjust logic_sb_block */ if(fs->s_blocksize > SBSIZE) /* Godmar thinks: if the blocksize is greater than 1024, then the superblock is logically part of block zero. */ logic_sb_block = 0; for (i = 0; i < db_count; i++) { error = bread(devvp , fsbtodb(fs, logic_sb_block + i + 1), fs->s_blocksize, NOCRED, &fs->s_group_desc[i]); if(error) { for (j = 0; j < i; j++) brelse(fs->s_group_desc[j]); bsd_free(fs->s_group_desc, M_EXT2MNT); printf("EXT2-fs: unable to read group descriptors (%d)\n", error); return EIO; } LCK_BUF(fs->s_group_desc[i]) } if(!ext2_check_descriptors(fs)) { for (j = 0; j < db_count; j++) ULCK_BUF(fs->s_group_desc[j]) bsd_free(fs->s_group_desc, M_EXT2MNT); printf("EXT2-fs: (ext2_check_descriptors failure) " "unable to read group descriptors\n"); return EIO; } for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) { fs->s_inode_bitmap_number[i] = 0; fs->s_inode_bitmap[i] = NULL; fs->s_block_bitmap_number[i] = 0; fs->s_block_bitmap[i] = NULL; } fs->s_loaded_inode_bitmaps = 0; fs->s_loaded_block_bitmaps = 0; if (es->s_rev_level == EXT2_GOOD_OLD_REV || (es->s_feature_ro_compat & EXT2_FEATURE_RO_COMPAT_LARGE_FILE) == 0) fs->fs_maxfilesize = 0x7fffffff; else fs->fs_maxfilesize = 0x7fffffffffffffff; return 0; } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ext2_reload(struct mount *mp, struct thread *td) { - struct vnode *vp, *nvp, *devvp; + struct vnode *vp, *mvp, *devvp; struct inode *ip; struct buf *bp; struct ext2_super_block * es; struct ext2_sb_info *fs; int error; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOEXT2(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if (vinvalbuf(devvp, 0, td, 0, 0) != 0) panic("ext2_reload: dirty1"); VOP_UNLOCK(devvp, 0, td); /* * Step 2: re-read superblock from disk. * constants have been adjusted for ext2 */ if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) return (error); es = (struct ext2_super_block *)bp->b_data; if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) { brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOEXT2(mp)->um_e2fs; bcopy(bp->b_data, fs->s_es, sizeof(struct ext2_super_block)); if((error = compute_sb_data(devvp, es, fs)) != 0) { brelse(bp); return error; } #ifdef UNKLAR if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; #endif brelse(bp); loop: MNT_ILOCK(mp); - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); /* * Step 4: invalidate all cached file data. 
*/ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + MNT_VNODE_FOREACH_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, td, 0, 0)) panic("ext2_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->s_blocksize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0, td); vrele(vp); + MNT_VNODE_FOREACH_ABORT(mp, mvp); return (error); } ext2_ei2i((struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ip->i_number)), ip); brelse(bp); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Common code for mount and mountroot */ static int ext2_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ext2mount *ump; struct buf *bp; struct ext2_sb_info *fs; struct ext2_super_block * es; struct cdev *dev = devvp->v_rdev; struct g_consumer *cp; struct bufobj *bo; int error; int ronly; ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0); /* XXX: use VOP_ACESS to check FS perms */ DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0, td); if (error) return (error); /* XXX: should we check for some sectorsize or 512 instead? */ if (((SBSIZE % cp->provider->sectorsize) != 0) || (SBSIZE < cp->provider->sectorsize)) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); return (EINVAL); } bo = &devvp->v_bufobj; bo->bo_private = cp; bo->bo_ops = g_vfs_bufops; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; bp = NULL; ump = NULL; if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) goto out; es = (struct ext2_super_block *)bp->b_data; if (ext2_check_sb_compat(es, dev, ronly) != 0) { error = EINVAL; /* XXX needs translation */ goto out; } if ((es->s_state & EXT2_VALID_FS) == 0 || (es->s_state & EXT2_ERROR_FS)) { if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf( "WARNING: Filesystem was not properly dismounted\n"); } else { printf( "WARNING: R/W mount denied. Filesystem is not clean - run fsck\n"); error = EPERM; goto out; } } ump = bsd_malloc(sizeof *ump, M_EXT2MNT, M_WAITOK); bzero((caddr_t)ump, sizeof *ump); /* I don't know whether this is the right strategy. Note that we dynamically allocate both an ext2_sb_info and an ext2_super_block while Linux keeps the super block in a locked buffer */ ump->um_e2fs = bsd_malloc(sizeof(struct ext2_sb_info), M_EXT2MNT, M_WAITOK); ump->um_e2fs->s_es = bsd_malloc(sizeof(struct ext2_super_block), M_EXT2MNT, M_WAITOK); bcopy(es, ump->um_e2fs->s_es, (u_int)sizeof(struct ext2_super_block)); if ((error = compute_sb_data(devvp, ump->um_e2fs->s_es, ump->um_e2fs))) goto out; /* * We don't free the group descriptors allocated by compute_sb_data() * until ext2_unmount(). This is OK since the mount will succeed. */ brelse(bp); bp = NULL; fs = ump->um_e2fs; fs->s_rd_only = ronly; /* ronly is set according to mnt_flags */ /* if the fs is not mounted read-only, make sure the super block is always written back on a sync() */ fs->s_wasvalid = fs->s_es->s_state & EXT2_VALID_FS ? 
1 : 0; if (ronly == 0) { fs->s_dirt = 1; /* mark it modified */ fs->s_es->s_state &= ~EXT2_VALID_FS; /* set fs invalid */ } mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN; mp->mnt_flag |= MNT_LOCAL; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_bo = &devvp->v_bufobj; ump->um_cp = cp; /* setting those two parameters allowed us to use ufs_bmap w/o changse ! */ ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs); ump->um_bptrtodb = fs->s_es->s_log_block_size + 1; ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs); if (ronly == 0) ext2_sbupdate(ump, MNT_WAIT); return (0); out: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); } if (ump) { bsd_free(ump->um_e2fs->s_es, M_EXT2MNT); bsd_free(ump->um_e2fs, M_EXT2MNT); bsd_free(ump, M_EXT2MNT); mp->mnt_data = (qaddr_t)0; } return (error); } /* * unmount system call */ static int ext2_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct ext2mount *ump; struct ext2_sb_info *fs; int error, flags, ronly, i; flags = 0; if (mntflags & MNT_FORCE) { if (mp->mnt_flag & MNT_ROOTFS) return (EINVAL); flags |= FORCECLOSE; } if ((error = ext2_flushfiles(mp, flags, td)) != 0) return (error); ump = VFSTOEXT2(mp); fs = ump->um_e2fs; ronly = fs->s_rd_only; if (ronly == 0) { if (fs->s_wasvalid) fs->s_es->s_state |= EXT2_VALID_FS; ext2_sbupdate(ump, MNT_WAIT); } /* release buffers containing group descriptors */ for(i = 0; i < fs->s_db_per_group; i++) ULCK_BUF(fs->s_group_desc[i]) bsd_free(fs->s_group_desc, M_EXT2MNT); /* release cached inode/block bitmaps */ for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) if (fs->s_inode_bitmap[i]) ULCK_BUF(fs->s_inode_bitmap[i]) for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) if (fs->s_block_bitmap[i]) ULCK_BUF(fs->s_block_bitmap[i]) DROP_GIANT(); g_topology_lock(); g_vfs_close(ump->um_cp, td); g_topology_unlock(); PICKUP_GIANT(); vrele(ump->um_devvp); bsd_free(fs->s_es, M_EXT2MNT); bsd_free(fs, M_EXT2MNT); bsd_free(ump, M_EXT2MNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Flush out all the files in a filesystem. */ static int ext2_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { int error; error = vflush(mp, 0, flags, td); return (error); } /* * Get file system statistics. 
 * taken from ext2/super.c ext2_statfs
 */
static int
ext2_statfs(mp, sbp, td)
	struct mount *mp;
	struct statfs *sbp;
	struct thread *td;
{
	unsigned long overhead;
	struct ext2mount *ump;
	struct ext2_sb_info *fs;
	struct ext2_super_block *es;
	int i, nsb;

	ump = VFSTOEXT2(mp);
	fs = ump->um_e2fs;
	es = fs->s_es;
	if (es->s_magic != EXT2_SUPER_MAGIC)
		panic("ext2_statfs - magic number spoiled");

	/*
	 * Compute the overhead (FS structures)
	 */
	if (es->s_feature_ro_compat & EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER) {
		nsb = 0;
		for (i = 0 ; i < fs->s_groups_count; i++)
			if (ext2_group_sparse(i))
				nsb++;
	} else
		nsb = fs->s_groups_count;
	overhead = es->s_first_data_block +
	    /* Superblocks and block group descriptors: */
	    nsb * (1 + fs->s_db_per_group) +
	    /* Inode bitmap, block bitmap, and inode table: */
	    fs->s_groups_count * (1 + 1 + fs->s_itb_per_group);
	sbp->f_bsize = EXT2_FRAG_SIZE(fs);
	sbp->f_iosize = EXT2_BLOCK_SIZE(fs);
	sbp->f_blocks = es->s_blocks_count - overhead;
	sbp->f_bfree = es->s_free_blocks_count;
	sbp->f_bavail = sbp->f_bfree - es->s_r_blocks_count;
	sbp->f_files = es->s_inodes_count;
	sbp->f_ffree = es->s_free_inodes_count;
	return (0);
}

/*
 * Go through the disk queues to initiate sandbagged IO;
 * go through the inodes to write those that have been modified;
 * initiate the writing of the super block if it has been modified.
 *
 * Note: we are always called with the filesystem marked `MPBUSY'.
 */
static int
ext2_sync(mp, waitfor, td)
	struct mount *mp;
	int waitfor;
	struct thread *td;
{
-	struct vnode *nvp, *vp;
+	struct vnode *mvp, *vp;
	struct inode *ip;
	struct ext2mount *ump = VFSTOEXT2(mp);
	struct ext2_sb_info *fs;
	int error, allerror = 0;

	fs = ump->um_e2fs;
	if (fs->s_dirt != 0 && fs->s_rd_only != 0) {		/* XXX */
		printf("fs = %s\n", fs->fs_fsmnt);
		panic("ext2_sync: rofs mod");
	}
	/*
	 * Write back each (modified) inode.
	 */
	MNT_ILOCK(mp);
loop:
-	MNT_VNODE_FOREACH(vp, mp, nvp) {
+	MNT_VNODE_FOREACH(vp, mp, mvp) {
		VI_LOCK(vp);
		if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED)) {
			VI_UNLOCK(vp);
			continue;
		}
		MNT_IUNLOCK(mp);
		ip = VTOI(vp);
		if ((ip->i_flag &
		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
		    (vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
		    waitfor == MNT_LAZY)) {
			VI_UNLOCK(vp);
			MNT_ILOCK(mp);
			continue;
		}
		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td);
		if (error) {
			MNT_ILOCK(mp);
-			if (error == ENOENT)
+			if (error == ENOENT) {
+				MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
				goto loop;
+			}
			continue;
		}
		if ((error = VOP_FSYNC(vp, waitfor, td)) != 0)
			allerror = error;
		VOP_UNLOCK(vp, 0, td);
		vrele(vp);
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	/*
	 * Force stale file system control information to be flushed.
	 */
	if (waitfor != MNT_LAZY) {
		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td);
		if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0)
			allerror = error;
		VOP_UNLOCK(ump->um_devvp, 0, td);
	}
	/*
	 * Write back modified superblock.
	 */
	if (fs->s_dirt != 0) {
		fs->s_dirt = 0;
		fs->s_es->s_wtime = time_second;
		if ((error = ext2_sbupdate(ump, waitfor)) != 0)
			allerror = error;
	}
	return (allerror);
}

/*
 * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it
 * in from disk.  If it is in core, wait for the lock bit to clear, then
 * return the inode locked.  Detection and handling of mount points must be
 * done by the calling routine.
*/ static int ext2_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { struct ext2_sb_info *fs; struct inode *ip; struct ext2mount *ump; struct buf *bp; struct vnode *vp; struct cdev *dev; int i, error; int used_blocks; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); ump = VFSTOEXT2(mp); dev = ump->um_dev; /* * If this MALLOC() is performed after the getnewvnode() * it might block, leaving a vnode with a NULL v_data to be * found by ext2_sync() if a sync happens to fire right then, * which will cause a panic because ext2_sync() blindly * dereferences vp->v_data (as well it should). */ ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) { *vpp = NULL; free(ip, M_EXT2NODE); return (error); } vp->v_data = ip; ip->i_vnode = vp; ip->i_e2fs = fs = ump->um_e2fs; ip->i_number = ino; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ #if 0 printf("ext2_vget(%d) dbn= %d ", ino, fsbtodb(fs, ino_to_fsba(fs, ino))); #endif if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->s_blocksize, NOCRED, &bp)) != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ vput(vp); brelse(bp); *vpp = NULL; return (error); } /* convert ext2 inode to dinode */ ext2_ei2i((struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ino)), ip); ip->i_block_group = ino_to_cg(fs, ino); ip->i_next_alloc_block = 0; ip->i_next_alloc_goal = 0; ip->i_prealloc_count = 0; ip->i_prealloc_block = 0; /* now we want to make sure that block pointers for unused blocks are zeroed out - ext2_balloc depends on this although for regular files and directories only */ if(S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode)) { used_blocks = (ip->i_size+fs->s_blocksize-1) / fs->s_blocksize; for(i = used_blocks; i < EXT2_NDIR_BLOCKS; i++) ip->i_db[i] = 0; } /* ext2_print_inode(ip); */ brelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization now that aliasing has been resolved. */ ip->i_devvp = ump->um_devvp; /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) ip->i_flag |= IN_MODIFIED; } *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ext2_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. 
exflagsp and credanonp */ static int ext2_fhtovp(mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { struct inode *ip; struct ufid *ufhp; struct vnode *nvp; struct ext2_sb_info *fs; int error; ufhp = (struct ufid *)fhp; fs = VFSTOEXT2(mp)->um_e2fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino > fs->s_groups_count * fs->s_es->s_inodes_per_group) return (ESTALE); error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp); if (error) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen || ip->i_nlink <= 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, 0, curthread); return (0); } /* * Vnode pointer to File handle */ /* ARGSUSED */ static int ext2_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { struct inode *ip; struct ufid *ufhp; ip = VTOI(vp); ufhp = (struct ufid *)fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Write a superblock and associated information back to disk. */ static int ext2_sbupdate(mp, waitfor) struct ext2mount *mp; int waitfor; { struct ext2_sb_info *fs = mp->um_e2fs; struct ext2_super_block *es = fs->s_es; struct buf *bp; int error = 0; /* printf("\nupdating superblock, waitfor=%s\n", waitfor == MNT_WAIT ? "yes":"no"); */ bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0, 0); bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2_super_block)); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); /* * The buffers for group descriptors, inode bitmaps and block bitmaps * are not busy at this point and are (hopefully) written by the * usual sync mechanism. No need to write them here */ return (error); } /* * Return the root of a filesystem. */ static int ext2_root(mp, flags, vpp, td) struct mount *mp; int flags; struct vnode **vpp; struct thread *td; { struct vnode *nvp; int error; error = VFS_VGET(mp, (ino_t)ROOTINO, LK_EXCLUSIVE, &nvp); if (error) return (error); *vpp = nvp; return (0); } Index: head/sys/isofs/cd9660/cd9660_vnops.c =================================================================== --- head/sys/isofs/cd9660/cd9660_vnops.c (revision 154151) +++ head/sys/isofs/cd9660/cd9660_vnops.c (revision 154152) @@ -1,832 +1,833 @@ /*- * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cd9660_vnops.c 8.19 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static vop_setattr_t cd9660_setattr; static vop_open_t cd9660_open; static vop_access_t cd9660_access; static vop_getattr_t cd9660_getattr; static vop_ioctl_t cd9660_ioctl; static vop_pathconf_t cd9660_pathconf; static vop_read_t cd9660_read; struct isoreaddir; static int iso_uiodir(struct isoreaddir *idp, struct dirent *dp, off_t off); static int iso_shipdir(struct isoreaddir *idp); static vop_readdir_t cd9660_readdir; static vop_readlink_t cd9660_readlink; static vop_strategy_t cd9660_strategy; /* * Setattr call. Only allowed for block and character special devices. */ static int cd9660_setattr(ap) struct vop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; if (vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) return (EROFS); if (vap->va_size != (u_quad_t)VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: return (EROFS); case VCHR: case VBLK: case VSOCK: case VFIFO: case VNON: case VBAD: + case VMARKER: return (0); } } return (0); } /* * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC. * The mode is shifted to select the owner/group/other fields. The * super user is granted all permissions. */ /* ARGSUSED */ static int cd9660_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); mode_t mode = ap->a_mode; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); /* * Disallow write attempts unless the file is a socket, * fifo, or a block or character device resident on the * filesystem. 
*/ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: return (EROFS); /* NOT REACHED */ default: break; } } return (vaccess(vp->v_type, ip->inode.iso_mode, ip->inode.iso_uid, ip->inode.iso_gid, ap->a_mode, ap->a_cred, NULL)); } static int cd9660_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; } */ *ap; { struct iso_node *ip = VTOI(ap->a_vp); vnode_create_vobject(ap->a_vp, ip->i_size, ap->a_td); return 0; } static int cd9660_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct iso_node *ip = VTOI(vp); vap->va_fsid = dev2udev(ip->i_mnt->im_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->inode.iso_mode; vap->va_nlink = ip->inode.iso_links; vap->va_uid = ip->inode.iso_uid; vap->va_gid = ip->inode.iso_gid; vap->va_atime = ip->inode.iso_atime; vap->va_mtime = ip->inode.iso_mtime; vap->va_ctime = ip->inode.iso_ctime; vap->va_rdev = ip->inode.iso_rdev; vap->va_size = (u_quad_t) ip->i_size; if (ip->i_size == 0 && (vap->va_mode & S_IFMT) == S_IFLNK) { struct vop_readlink_args rdlnk; struct iovec aiov; struct uio auio; char *cp; MALLOC(cp, char *, MAXPATHLEN, M_TEMP, M_WAITOK); aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = ap->a_td; auio.uio_resid = MAXPATHLEN; rdlnk.a_uio = &auio; rdlnk.a_vp = ap->a_vp; rdlnk.a_cred = ap->a_cred; if (cd9660_readlink(&rdlnk) == 0) vap->va_size = MAXPATHLEN - auio.uio_resid; FREE(cp, M_TEMP); } vap->va_flags = 0; vap->va_gen = 1; vap->va_blocksize = ip->i_mnt->logical_block_size; vap->va_bytes = (u_quad_t) ip->i_size; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * Vnode op for ioctl. */ static int cd9660_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); switch (ap->a_command) { case FIOGETLBA: *(int *)(ap->a_data) = ip->iso_start; return 0; default: return (ENOTTY); } } /* * Vnode op for reading. 
*/ static int cd9660_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct iso_node *ip = VTOI(vp); struct iso_mnt *imp; struct buf *bp; daddr_t lbn, rablock; off_t diff; int rasize, error = 0; int seqcount; long size, n, on; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); seqcount = ap->a_ioflag >> IO_SEQSHIFT; if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); ip->i_flag |= IN_ACCESS; imp = ip->i_mnt; do { lbn = lblkno(imp, uio->uio_offset); on = blkoff(imp, uio->uio_offset); n = min((u_int)(imp->logical_block_size - on), uio->uio_resid); diff = (off_t)ip->i_size - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; size = blksize(imp, ip, lbn); rablock = lbn + 1; if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { if (lblktosize(imp, rablock) < ip->i_size) error = cluster_read(vp, (off_t)ip->i_size, lbn, size, NOCRED, uio->uio_resid, (ap->a_ioflag >> 16), &bp); else error = bread(vp, lbn, size, NOCRED, &bp); } else { if (seqcount > 1 && lblktosize(imp, rablock) < ip->i_size) { rasize = blksize(imp, ip, rablock); error = breadn(vp, lbn, size, &rablock, &rasize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); } n = min(n, size - bp->b_resid); if (error) { brelse(bp); return (error); } error = uiomove(bp->b_data + on, (int)n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); } /* * Structure for reading directories */ struct isoreaddir { struct dirent saveent; struct dirent assocent; struct dirent current; off_t saveoff; off_t assocoff; off_t curroff; struct uio *uio; off_t uio_off; int eofflag; u_long *cookies; int ncookies; }; static int iso_uiodir(idp,dp,off) struct isoreaddir *idp; struct dirent *dp; off_t off; { int error; dp->d_name[dp->d_namlen] = 0; dp->d_reclen = GENERIC_DIRSIZ(dp); if (idp->uio->uio_resid < dp->d_reclen) { idp->eofflag = 0; return (-1); } if (idp->cookies) { if (idp->ncookies <= 0) { idp->eofflag = 0; return (-1); } *idp->cookies++ = off; --idp->ncookies; } if ((error = uiomove(dp, dp->d_reclen, idp->uio)) != 0) return (error); idp->uio_off = off; return (0); } static int iso_shipdir(idp) struct isoreaddir *idp; { struct dirent *dp; int cl, sl, assoc; int error; char *cname, *sname; cl = idp->current.d_namlen; cname = idp->current.d_name; assoc = (cl > 1) && (*cname == ASSOCCHAR); if (assoc) { cl--; cname++; } dp = &idp->saveent; sname = dp->d_name; if (!(sl = dp->d_namlen)) { dp = &idp->assocent; sname = dp->d_name + 1; sl = dp->d_namlen - 1; } if (sl > 0) { if (sl != cl || bcmp(sname,cname,sl)) { if (idp->assocent.d_namlen) { if ((error = iso_uiodir(idp,&idp->assocent,idp->assocoff)) != 0) return (error); idp->assocent.d_namlen = 0; } if (idp->saveent.d_namlen) { if ((error = iso_uiodir(idp,&idp->saveent,idp->saveoff)) != 0) return (error); idp->saveent.d_namlen = 0; } } } idp->current.d_reclen = GENERIC_DIRSIZ(&idp->current); if (assoc) { idp->assocoff = idp->curroff; bcopy(&idp->current,&idp->assocent,idp->current.d_reclen); } else { idp->saveoff = idp->curroff; bcopy(&idp->current,&idp->saveent,idp->current.d_reclen); } return (0); } /* * Vnode op for readdir */ static int cd9660_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { struct uio *uio = ap->a_uio; struct isoreaddir *idp; struct vnode 
*vdp = ap->a_vp; struct iso_node *dp; struct iso_mnt *imp; struct buf *bp = NULL; struct iso_directory_record *ep; int entryoffsetinblock; doff_t endsearch; u_long bmask; int error = 0; int reclen; u_short namelen; int ncookies = 0; u_long *cookies = NULL; dp = VTOI(vdp); imp = dp->i_mnt; bmask = imp->im_bmask; MALLOC(idp, struct isoreaddir *, sizeof(*idp), M_TEMP, M_WAITOK); idp->saveent.d_namlen = idp->assocent.d_namlen = 0; /* * XXX * Is it worth trying to figure out the type? */ idp->saveent.d_type = idp->assocent.d_type = idp->current.d_type = DT_UNKNOWN; idp->uio = uio; if (ap->a_ncookies == NULL) { idp->cookies = NULL; } else { /* * Guess the number of cookies needed. */ ncookies = uio->uio_resid / 16; MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); idp->cookies = cookies; idp->ncookies = ncookies; } idp->eofflag = 1; idp->curroff = uio->uio_offset; if ((entryoffsetinblock = idp->curroff & bmask) && (error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp))) { FREE(idp, M_TEMP); return (error); } endsearch = dp->i_size; while (idp->curroff < endsearch) { /* * If offset is on a block boundary, * read the next directory block. * Release previous if it exists. */ if ((idp->curroff & bmask) == 0) { if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp)) != 0) break; entryoffsetinblock = 0; } /* * Get pointer to next entry. */ ep = (struct iso_directory_record *) ((char *)bp->b_data + entryoffsetinblock); reclen = isonum_711(ep->length); if (reclen == 0) { /* skip to next block, if any */ idp->curroff = (idp->curroff & ~bmask) + imp->logical_block_size; continue; } if (reclen < ISO_DIRECTORY_RECORD_SIZE) { error = EINVAL; /* illegal entry, stop */ break; } if (entryoffsetinblock + reclen > imp->logical_block_size) { error = EINVAL; /* illegal directory, so stop looking */ break; } idp->current.d_namlen = isonum_711(ep->name_len); if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) { error = EINVAL; /* illegal entry, stop */ break; } if (isonum_711(ep->flags)&2) idp->current.d_fileno = isodirino(ep, imp); else idp->current.d_fileno = dbtob(bp->b_blkno) + entryoffsetinblock; idp->curroff += reclen; switch (imp->iso_ftype) { case ISO_FTYPE_RRIP: cd9660_rrip_getname(ep,idp->current.d_name, &namelen, &idp->current.d_fileno,imp); idp->current.d_namlen = (u_char)namelen; if (idp->current.d_namlen) error = iso_uiodir(idp,&idp->current,idp->curroff); break; default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 || ISO_FTYPE_HIGH_SIERRA*/ strcpy(idp->current.d_name,".."); if (idp->current.d_namlen == 1 && ep->name[0] == 0) { idp->current.d_namlen = 1; error = iso_uiodir(idp,&idp->current,idp->curroff); } else if (idp->current.d_namlen == 1 && ep->name[0] == 1) { idp->current.d_namlen = 2; error = iso_uiodir(idp,&idp->current,idp->curroff); } else { isofntrans(ep->name,idp->current.d_namlen, idp->current.d_name, &namelen, imp->iso_ftype == ISO_FTYPE_9660, isonum_711(ep->flags)&4, imp->joliet_level, imp->im_flags, imp->im_d2l); idp->current.d_namlen = (u_char)namelen; if (imp->iso_ftype == ISO_FTYPE_DEFAULT) error = iso_shipdir(idp); else error = iso_uiodir(idp,&idp->current,idp->curroff); } } if (error) break; entryoffsetinblock += reclen; } if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) { idp->current.d_namlen = 0; error = iso_shipdir(idp); } if (error < 0) error = 0; if (ap->a_ncookies != NULL) { if (error) free(cookies, M_TEMP); else { /* * Work out the number of cookies actually used. 
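/*
 * A minimal user-space sketch (not the kernel code itself) of the cookie
 * bookkeeping in cd9660_readdir around this point: the cookie count is
 * guessed as uio_resid / 16, one cookie is handed out per directory entry
 * shipped by iso_uiodir(), and the caller is finally told how many were
 * actually used (guessed minus remaining).  Buffer and entry sizes below
 * are made-up example values.
 */
#include <stdio.h>

int
main(void)
{
	int uio_resid = 512;		/* hypothetical space left in the caller's buffer */
	int guessed = uio_resid / 16;	/* initial cookie estimate, as above */
	int remaining = guessed;
	int entry_sizes[4] = { 24, 32, 28, 40 };	/* hypothetical d_reclen values */
	int i;

	for (i = 0; i < 4 && remaining > 0; i++) {
		if (uio_resid < entry_sizes[i])
			break;		/* iso_uiodir() stops when the entry no longer fits */
		remaining--;		/* one offset cookie stored per entry copied out */
		uio_resid -= entry_sizes[i];
	}
	/* equivalent of *ap->a_ncookies = ncookies - idp->ncookies */
	printf("cookies used: %d of %d guessed\n", guessed - remaining, guessed);
	return (0);
}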
*/ *ap->a_ncookies = ncookies - idp->ncookies; *ap->a_cookies = cookies; } } if (bp) brelse (bp); uio->uio_offset = idp->uio_off; *ap->a_eofflag = idp->eofflag; FREE(idp, M_TEMP); return (error); } /* * Return target name of a symbolic link * Shouldn't we get the parent vnode and read the data from there? * This could eventually result in deadlocks in cd9660_lookup. * But otherwise the block read here is in the block buffer two times. */ typedef struct iso_directory_record ISODIR; typedef struct iso_node ISONODE; typedef struct iso_mnt ISOMNT; static int cd9660_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { ISONODE *ip; ISODIR *dirp; ISOMNT *imp; struct buf *bp; struct uio *uio; u_short symlen; int error; char *symname; ip = VTOI(ap->a_vp); imp = ip->i_mnt; uio = ap->a_uio; if (imp->iso_ftype != ISO_FTYPE_RRIP) return (EINVAL); /* * Get parents directory record block that this inode included. */ error = bread(imp->im_devvp, (ip->i_number >> imp->im_bshift) << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { brelse(bp); return (EINVAL); } /* * Setup the directory pointer for this inode */ dirp = (ISODIR *)(bp->b_data + (ip->i_number & imp->im_bmask)); /* * Just make sure, we have a right one.... * 1: Check not cross boundary on block */ if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length) > (unsigned)imp->logical_block_size) { brelse(bp); return (EINVAL); } /* * Now get a buffer * Abuse a namei buffer for now. */ if (uio->uio_segflg == UIO_SYSSPACE) symname = uio->uio_iov->iov_base; else symname = uma_zalloc(namei_zone, M_WAITOK); /* * Ok, we just gathering a symbolic name in SL record. */ if (cd9660_rrip_getsymname(dirp, symname, &symlen, imp) == 0) { if (uio->uio_segflg != UIO_SYSSPACE) uma_zfree(namei_zone, symname); brelse(bp); return (EINVAL); } /* * Don't forget before you leave from home ;-) */ brelse(bp); /* * return with the symbolic name to caller's. */ if (uio->uio_segflg != UIO_SYSSPACE) { error = uiomove(symname, symlen, uio); uma_zfree(namei_zone, symname); return (error); } uio->uio_resid -= symlen; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + symlen; uio->uio_iov->iov_len -= symlen; return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ static int cd9660_strategy(ap) struct vop_strategy_args /* { struct buf *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct iso_node *ip; struct bufobj *bo; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("cd9660_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { bp->b_blkno = (ip->iso_start + bp->b_lblkno) << (ip->i_mnt->im_bshift - DEV_BSHIFT); if ((long)bp->b_blkno == -1) /* XXX: cut&paste junk ? */ clrbuf(bp); } if ((long)bp->b_blkno == -1) { /* XXX: cut&paste junk ? */ bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); bo = ip->i_mnt->im_bo; BO_STRATEGY(bo, bp); return (0); } /* * Return POSIX pathconf information applicable to cd9660 filesystems. 
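/*
 * A small user-space sketch of the logical-to-physical translation done in
 * cd9660_strategy above, assuming the usual 2048-byte ISO 9660 logical
 * block and 512-byte device blocks (DEV_BSHIFT == 9).  The extent start
 * and logical block number are made-up example values.
 */
#include <sys/types.h>
#include <stdio.h>

#define DEV_BSHIFT	9			/* 512-byte device blocks */
#define dbtob(db)	((off_t)(db) << DEV_BSHIFT)

int
main(void)
{
	int im_bshift = 11;	/* log2(2048): ISO 9660 logical block size */
	long iso_start = 100;	/* hypothetical extent start, in logical blocks */
	long lblkno = 3;	/* hypothetical logical block within the file */
	long blkno;

	/* b_blkno = (iso_start + b_lblkno) << (im_bshift - DEV_BSHIFT) */
	blkno = (iso_start + lblkno) << (im_bshift - DEV_BSHIFT);
	printf("device block %ld, byte offset %lld\n",
	    blkno, (long long)dbtob(blkno));
	return (0);
}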
*/ static int cd9660_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP) *ap->a_retval = NAME_MAX; else *ap->a_retval = 37; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Global vfs data structures for cd9660 */ struct vop_vector cd9660_vnodeops = { .vop_default = &default_vnodeops, .vop_open = cd9660_open, .vop_access = cd9660_access, .vop_bmap = cd9660_bmap, .vop_cachedlookup = cd9660_lookup, .vop_getattr = cd9660_getattr, .vop_inactive = cd9660_inactive, .vop_ioctl = cd9660_ioctl, .vop_lookup = vfs_cache_lookup, .vop_pathconf = cd9660_pathconf, .vop_read = cd9660_read, .vop_readdir = cd9660_readdir, .vop_readlink = cd9660_readlink, .vop_reclaim = cd9660_reclaim, .vop_setattr = cd9660_setattr, .vop_strategy = cd9660_strategy, }; /* * Special device vnode ops */ struct vop_vector cd9660_fifoops = { .vop_default = &fifo_specops, .vop_access = cd9660_access, .vop_getattr = cd9660_getattr, .vop_inactive = cd9660_inactive, .vop_reclaim = cd9660_reclaim, .vop_setattr = cd9660_setattr, }; Index: head/sys/kern/vfs_default.c =================================================================== --- head/sys/kern/vfs_default.c (revision 154151) +++ head/sys/kern/vfs_default.c (revision 154152) @@ -1,658 +1,660 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed * to Berkeley by John Heidemann of the UCLA Ficus project. * * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vop_nolookup(struct vop_lookup_args *); static int vop_nostrategy(struct vop_strategy_args *); /* * This vnode table stores what we want to do if the filesystem doesn't * implement a particular VOP. * * If there is no specific entry here, we will return EOPNOTSUPP. * */ struct vop_vector default_vnodeops = { .vop_default = NULL, .vop_bypass = VOP_EOPNOTSUPP, .vop_advlock = VOP_EINVAL, .vop_bmap = vop_stdbmap, .vop_close = VOP_NULL, .vop_fsync = VOP_NULL, .vop_getpages = vop_stdgetpages, .vop_getwritemount = vop_stdgetwritemount, .vop_inactive = VOP_NULL, .vop_ioctl = VOP_ENOTTY, .vop_kqfilter = vop_stdkqfilter, .vop_islocked = vop_stdislocked, .vop_lease = VOP_NULL, .vop_lock = vop_stdlock, .vop_lookup = vop_nolookup, .vop_open = VOP_NULL, .vop_pathconf = VOP_EINVAL, .vop_poll = vop_nopoll, .vop_putpages = vop_stdputpages, .vop_readlink = VOP_EINVAL, .vop_revoke = VOP_PANIC, .vop_strategy = vop_nostrategy, .vop_unlock = vop_stdunlock, }; /* * Series of placeholder functions for various error returns for * VOPs. */ int vop_eopnotsupp(struct vop_generic_args *ap) { /* printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); */ return (EOPNOTSUPP); } int vop_ebadf(struct vop_generic_args *ap) { return (EBADF); } int vop_enotty(struct vop_generic_args *ap) { return (ENOTTY); } int vop_einval(struct vop_generic_args *ap) { return (EINVAL); } int vop_null(struct vop_generic_args *ap) { return (0); } /* * Helper function to panic on some bad VOPs in some filesystems. */ int vop_panic(struct vop_generic_args *ap) { panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name); } /* * vop_std and vop_no are default functions for use by * filesystems that need the "default reasonable" implementation for a * particular operation. * * The documentation for the operations they implement exists (if it exists) * in the VOP_(9) manpage (all uppercase). */ /* * Default vop for filesystems that do not support name lookup */ static int vop_nolookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { *ap->a_vpp = NULL; return (ENOTDIR); } /* * vop_nostrategy: * * Strategy routine for VFS devices that have none. * * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy * routine. Typically this is done for a BIO_READ strategy call. * Typically B_INVAL is assumed to already be clear prior to a write * and should not be cleared manually unless you just made the buffer * invalid. BIO_ERROR should be cleared either way. */ static int vop_nostrategy (struct vop_strategy_args *ap) { printf("No strategy for buffer at %p\n", ap->a_bp); vprint("vnode", ap->a_vp); ap->a_bp->b_ioflags |= BIO_ERROR; ap->a_bp->b_error = EOPNOTSUPP; bufdone(ap->a_bp); return (EOPNOTSUPP); } /* * vop_stdpathconf: * * Standard implementation of POSIX pathconf, to get information about limits * for a filesystem. * Override per filesystem for the case where the filesystem has smaller * limits. 
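/*
 * A condensed user-space model of the vop_vector fallback described above:
 * an operation missing from a filesystem's table is looked up in the vector
 * named by vop_default, and one missing there as well ends up returning
 * EOPNOTSUPP.  The two-level lookup and the operation names here are
 * simplified stand-ins, not the real VOP dispatch code.
 */
#include <errno.h>
#include <stdio.h>
#include <stddef.h>

struct my_vop_vector {
	const struct my_vop_vector *vop_default;
	int (*vop_read)(void);
	int (*vop_ioctl)(void);
};

static int std_ioctl(void) { return (ENOTTY); }
static int fs_read(void)   { return (0); }

static const struct my_vop_vector my_default_vnodeops = {
	.vop_default = NULL,
	.vop_ioctl = std_ioctl,	/* like .vop_ioctl = VOP_ENOTTY above */
};

static const struct my_vop_vector my_fs_vnodeops = {
	.vop_default = &my_default_vnodeops,
	.vop_read = fs_read,	/* the filesystem implements read only */
};

/* Walk the vop_default chain until some vector supplies the operation. */
static int
call_ioctl(const struct my_vop_vector *v)
{
	for (; v != NULL; v = v->vop_default)
		if (v->vop_ioctl != NULL)
			return (v->vop_ioctl());
	return (EOPNOTSUPP);	/* nothing anywhere: the bypass case */
}

int
main(void)
{
	printf("ioctl via fallback -> %d\n", call_ioctl(&my_fs_vnodeops));
	return (0);
}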
*/ int vop_stdpathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_MAX_CANON: *ap->a_retval = MAX_CANON; return (0); case _PC_MAX_INPUT: *ap->a_retval = MAX_INPUT; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_VDISABLE: *ap->a_retval = _POSIX_VDISABLE; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Standard lock, unlock and islocked functions. */ int vop_stdlock(ap) struct vop_lock_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; return (lockmgr(vp->v_vnlock, ap->a_flags, VI_MTX(vp), ap->a_td)); } /* See above. */ int vop_stdunlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; return (lockmgr(vp->v_vnlock, ap->a_flags | LK_RELEASE, VI_MTX(vp), ap->a_td)); } /* See above. */ int vop_stdislocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { return (lockstatus(ap->a_vp->v_vnlock, ap->a_td)); } /* * Return true for select/poll. */ int vop_nopoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { /* * Return true for read/write. If the user asked for something * special, return POLLNVAL, so that clients have a way of * determining reliably whether or not the extended * functionality is present without hard-coding knowledge * of specific filesystem implementations. * Stay in sync with kern_conf.c::no_poll(). */ if (ap->a_events & ~POLLSTANDARD) return (POLLNVAL); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Implement poll for local filesystems that support it. */ int vop_stdpoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { if (ap->a_events & ~POLLSTANDARD) return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events)); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Return our mount point, as we will take charge of the writes. */ int vop_stdgetwritemount(ap) struct vop_getwritemount_args /* { struct vnode *a_vp; struct mount **a_mpp; } */ *ap; { *(ap->a_mpp) = ap->a_vp->v_mount; return (0); } /* XXX Needs good comment and VOP_BMAP(9) manpage */ int vop_stdbmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } int vop_stdfsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct buf *bp; struct bufobj *bo; struct buf *nbp; int error = 0; int maxretry = 1000; /* large, arbitrarily chosen */ VI_LOCK(vp); loop1: /* * MARK/SCAN initialization to avoid infinite loops. */ TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { bp->b_vflags &= ~BV_SCANNED; bp->b_error = 0; } /* * Flush all dirty buffers associated with a vnode. 
*/ loop2: TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; VI_UNLOCK(vp); KASSERT(bp->b_bufobj == &vp->v_bufobj, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, &vp->v_bufobj)); if ((bp->b_flags & B_DELWRI) == 0) panic("fsync: not dirty"); if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { vfs_bio_awrite(bp); } else { bremfree(bp); bawrite(bp); } VI_LOCK(vp); goto loop2; } /* * If synchronous the caller expects us to completely resolve all * dirty buffers in the system. Wait for in-progress I/O to * complete (which could include background bitmap writes), then * retry if dirty blocks still exist. */ if (ap->a_waitfor == MNT_WAIT) { bo = &vp->v_bufobj; bufobj_wwait(bo, 0, 0); if (bo->bo_dirty.bv_cnt > 0) { /* * If we are unable to write any of these buffers * then we fail now rather than trying endlessly * to write them out. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) if ((error = bp->b_error) == 0) continue; if (error == 0 && --maxretry >= 0) goto loop1; error = EAGAIN; } } VI_UNLOCK(vp); if (error == EAGAIN) vprint("fsync: giving up on dirty", vp); return (error); } /* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */ int vop_stdgetpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; vm_ooffset_t a_offset; } */ *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } int vop_stdkqfilter(struct vop_kqfilter_args *ap) { return vfs_kqfilter(ap); } /* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */ int vop_stdputpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; vm_ooffset_t a_offset; } */ *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } /* * vfs default ops * used to fill the vfs function table to get reasonable default return values. */ int vfs_stdroot (mp, flags, vpp, td) struct mount *mp; int flags; struct vnode **vpp; struct thread *td; { return (EOPNOTSUPP); } int vfs_stdstatfs (mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { return (EOPNOTSUPP); } int vfs_stdvptofh (vp, fhp) struct vnode *vp; struct fid *fhp; { return (EOPNOTSUPP); } int vfs_stdquotactl (mp, cmds, uid, arg, td) struct mount *mp; int cmds; uid_t uid; void *arg; struct thread *td; { return (EOPNOTSUPP); } int vfs_stdsync(mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { - struct vnode *vp, *nvp; + struct vnode *vp, *mvp; int error, lockreq, allerror = 0; lockreq = LK_EXCLUSIVE | LK_INTERLOCK; if (waitfor != MNT_WAIT) lockreq |= LK_NOWAIT; /* * Force stale buffer cache information to be flushed. 
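/*
 * A user-space sketch of the MARK/SCAN idea used by vop_stdfsync above,
 * with a plain array standing in for the dirty buffer list.  Each pass
 * clears a "scanned" mark, and the scan loop skips anything already
 * marked, so a buffer that keeps reappearing dirty cannot make a single
 * pass spin forever.
 */
#include <stdio.h>
#include <stdbool.h>

struct fakebuf {
	bool dirty;
	bool scanned;	/* stands in for BV_SCANNED */
};

int
main(void)
{
	struct fakebuf bufs[4] = {
		{ true, false }, { false, false }, { true, false }, { true, false }
	};
	int i, flushed = 0;

	/* loop1: MARK/SCAN initialization */
	for (i = 0; i < 4; i++)
		bufs[i].scanned = false;

	/* loop2: flush each dirty, not-yet-scanned buffer exactly once */
	for (i = 0; i < 4; i++) {
		if (bufs[i].scanned || !bufs[i].dirty)
			continue;
		bufs[i].scanned = true;
		bufs[i].dirty = false;	/* "write it out" */
		flushed++;
	}
	printf("flushed %d buffers this pass\n", flushed);
	return (0);
}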
*/ MNT_ILOCK(mp); loop: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); if ((error = vget(vp, lockreq, td)) != 0) { MNT_ILOCK(mp); - if (error == ENOENT) + if (error == ENOENT) { + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; + } continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (allerror); } int vfs_stdnosync (mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { return (0); } int vfs_stdvget (mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdfhtovp (mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdinit (vfsp) struct vfsconf *vfsp; { return (0); } int vfs_stduninit (vfsp) struct vfsconf *vfsp; { return(0); } int vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td) struct mount *mp; int cmd; struct vnode *filename_vp; int attrnamespace; const char *attrname; struct thread *td; { if (filename_vp != NULL) VOP_UNLOCK(filename_vp, 0, td); return (EOPNOTSUPP); } int vfs_stdsysctl(mp, op, req) struct mount *mp; fsctlop_t op; struct sysctl_req *req; { return (EOPNOTSUPP); } /* end of vfs default ops */ Index: head/sys/kern/vfs_mount.c =================================================================== --- head/sys/kern/vfs_mount.c (revision 154151) +++ head/sys/kern/vfs_mount.c (revision 154152) @@ -1,1904 +1,2002 @@ /*- * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_rootdevname.h" #include "opt_ddb.h" #include "opt_mac.h" #ifdef DDB #include #endif #define ROOTNAME "root_device" #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, int fsflags, void *fsdata); static struct mount *vfs_mount_alloc(struct vnode *dvp, struct vfsconf *vfsp, const char *fspath, struct thread *td); static int vfs_mountroot_ask(void); static int vfs_mountroot_try(const char *mountfrom); static int vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions); static void free_mntarg(struct mntarg *ma); static void vfs_mount_destroy(struct mount *, struct thread *); static int vfs_getopt_pos(struct vfsoptlist *opts, const char *name); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); +MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; }; /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store */ /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "rdonly", "ro", "rw", "suid", "exec", NULL }; /* * The root specifiers we will try if RB_CDROM is specified. */ static char *cdrom_rootdevnames[] = { "cd9660:cd0", "cd9660:acd0", NULL }; /* legacy find-root code */ char *rootdevnames[2] = {NULL, NULL}; #ifndef ROOTDEVNAME # define ROOTDEVNAME NULL #endif static const char *ctrootdevname = ROOTDEVNAME; /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); #ifdef INVARIANTS else if (opt->len != 0) panic("%s: mount option with NULL value but length != 0", __func__); #endif free(opt, M_MOUNT); } /* Release all resources related to the mount options. */ static void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. 
"noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ static int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused; unsigned int i, iovcnt; int error, namelen, optlen; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. */ if (opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *new; TAILQ_FOREACH(opt, opts, link) { /* * Check that this option hasn't been redefined * nor cancelled with a "no" mount option. */ opt2 = TAILQ_FIRST(toopts); while (opt2 != NULL) { if (strcmp(opt2->name, opt->name) == 0) goto next; if (strncmp(opt2->name, "no", 2) == 0 && strcmp(opt2->name + 2, opt->name) == 0) { vfs_freeopt(toopts, opt2); goto next; } opt2 = TAILQ_NEXT(opt2, link); } /* We want this option, duplicate it. 
*/ new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = malloc(strlen(opt->name) + 1, M_MOUNT, M_WAITOK); strcpy(new->name, opt->name); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else { new->value = NULL; } new->len = opt->len; TAILQ_INSERT_TAIL(toopts, new, link); next: continue; } } /* * --------------------------------------------------------------------- * Mount a filesystem */ int nmount(td, uap) struct thread *td; struct nmount_args /* { struct iovec *iovp; unsigned int iovcnt; int flags; } */ *uap; { struct uio *auio; struct iovec *iov; unsigned int i; int error; u_int iovcnt; /* Kick out MNT_ROOTFS early as it is legal internally */ if (uap->flags & MNT_ROOTFS) return (EINVAL); iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) return (EINVAL); error = copyinuio(uap->iovp, iovcnt, &auio); if (error) return (error); iov = auio->uio_iov; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > MMAXOPTIONLEN) { free(auio, M_IOV); return (EINVAL); } iov++; } error = vfs_donmount(td, uap->flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ /* * Allocate and initialize the mount point struct. */ static struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct thread *td) { struct mount *mp; mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); vfs_busy(mp, LK_NOWAIT, 0, td); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(td->td_ucred); mp->mnt_stat.f_owner = td->td_ucred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_init_mount(mp); mac_create_mount(td->td_ucred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). 
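/*
 * A sketch of what the iovec list consumed by nmount() above looks like
 * from userland, assuming the FreeBSD nmount(2) interface: name/value
 * pairs, an even iovec count of at least four, with "fstype" and "fspath"
 * mandatory and NUL-terminated.  The mount point and device below are
 * made-up example values, and the actual nmount() call is left commented
 * out.
 */
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static void
build_iov(struct iovec *iov, int i, const char *s)
{
	iov[i].iov_base = (void *)(uintptr_t)s;
	iov[i].iov_len = strlen(s) + 1;	/* include the terminating NUL */
}

int
main(void)
{
	struct iovec iov[6];

	build_iov(iov, 0, "fstype");
	build_iov(iov, 1, "cd9660");
	build_iov(iov, 2, "fspath");
	build_iov(iov, 3, "/mnt");
	build_iov(iov, 4, "from");
	build_iov(iov, 5, "/dev/cd0");

	/* would be: error = nmount(iov, 6, MNT_RDONLY); */
	printf("%d iovecs, flags %#x\n", 6, MNT_RDONLY);
	return (0);
}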
*/ static void vfs_mount_destroy(struct mount *mp, struct thread *td) { + MNT_ILOCK(mp); + if (mp->mnt_holdcnt != 0) { + printf("Waiting for mount point to be unheld\n"); + while (mp->mnt_holdcnt != 0) { + mp->mnt_holdcntwaiters++; + msleep(&mp->mnt_holdcnt, MNT_MTX(mp), + PZERO, "mntdestroy", 0); + mp->mnt_holdcntwaiters--; + } + printf("mount point unheld\n"); + } + if (mp->mnt_writeopcount > 0) { + printf("Waiting for mount point write ops\n"); + while (mp->mnt_writeopcount > 0) { + mp->mnt_kern_flag |= MNTK_SUSPEND; + msleep(&mp->mnt_writeopcount, + MNT_MTX(mp), + PZERO, "mntdestroy2", 0); + } + printf("mount point write ops completed\n"); + } + MNT_IUNLOCK(mp); mp->mnt_vfc->vfc_refcount--; if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) panic("unmount: dangling vnode"); vfs_unbusy(mp,td); lockdestroy(&mp->mnt_lock); MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); + if (mp->mnt_writeopcount != 0) + panic("vfs_mount_destroy: nonzero writeopcount"); + if (mp->mnt_nvnodelistsize != 0) + panic("vfs_mount_destroy: nonzero nvnodelistsize"); + mp->mnt_writeopcount = -1000; + mp->mnt_nvnodelistsize = -1000; MNT_IUNLOCK(mp); mtx_destroy(&mp->mnt_mtx); #ifdef MAC mac_destroy_mount(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); free(mp, M_MOUNT); } static int vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; errmsg_len = 0; errmsg_pos = -1; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); else errmsg_len = 0; /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. 
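/*
 * A user-space pthread sketch of the hold-count handshake added to
 * vfs_mount_destroy() above: the destroyer sleeps while the hold count is
 * nonzero, and whoever drops the count to zero wakes it up (the kernel
 * uses msleep()/wakeup() on &mp->mnt_holdcnt for this).  The single
 * "iterator" thread below is a made-up stand-in for a vnode-list walker
 * still holding the mount.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t holdcnt_cv = PTHREAD_COND_INITIALIZER;
static int holdcnt = 1;		/* one hypothetical iterator still active */
static int holdcntwaiters = 0;

static void *
release_hold(void *arg)
{
	pthread_mutex_lock(&lock);
	holdcnt--;			/* like the drop in __mnt_vnode_markerfree() */
	if (holdcnt == 0 && holdcntwaiters != 0)
		pthread_cond_broadcast(&holdcnt_cv);
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, release_hold, NULL);
	pthread_mutex_lock(&lock);
	while (holdcnt != 0) {		/* like the wait loop in vfs_mount_destroy() */
		holdcntwaiters++;
		pthread_cond_wait(&holdcnt_cv, &lock);
		holdcntwaiters--;
	}
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	printf("mount point unheld\n");
	return (0);
}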
*/ if (vfs_getopt(optlist, "update", NULL, NULL) == 0) fsflags |= MNT_UPDATE; if (vfs_getopt(optlist, "async", NULL, NULL) == 0) fsflags |= MNT_ASYNC; if (vfs_getopt(optlist, "force", NULL, NULL) == 0) fsflags |= MNT_FORCE; if (vfs_getopt(optlist, "multilabel", NULL, NULL) == 0) fsflags |= MNT_MULTILABEL; if (vfs_getopt(optlist, "noasync", NULL, NULL) == 0) fsflags &= ~MNT_ASYNC; if (vfs_getopt(optlist, "noatime", NULL, NULL) == 0) fsflags |= MNT_NOATIME; if (vfs_getopt(optlist, "noclusterr", NULL, NULL) == 0) fsflags |= MNT_NOCLUSTERR; if (vfs_getopt(optlist, "noclusterw", NULL, NULL) == 0) fsflags |= MNT_NOCLUSTERW; if (vfs_getopt(optlist, "noexec", NULL, NULL) == 0) fsflags |= MNT_NOEXEC; if (vfs_getopt(optlist, "nosuid", NULL, NULL) == 0) fsflags |= MNT_NOSUID; if (vfs_getopt(optlist, "nosymfollow", NULL, NULL) == 0) fsflags |= MNT_NOSYMFOLLOW; if (vfs_getopt(optlist, "noro", NULL, NULL) == 0) fsflags &= ~MNT_RDONLY; if (vfs_getopt(optlist, "ro", NULL, NULL) == 0) fsflags |= MNT_RDONLY; if (vfs_getopt(optlist, "rdonly", NULL, NULL) == 0) fsflags |= MNT_RDONLY; if (vfs_getopt(optlist, "rw", NULL, NULL) == 0) fsflags &= ~MNT_RDONLY; if (vfs_getopt(optlist, "snapshot", NULL, NULL) == 0) fsflags |= MNT_SNAPSHOT; if (vfs_getopt(optlist, "suiddir", NULL, NULL) == 0) fsflags |= MNT_SUIDDIR; if (vfs_getopt(optlist, "sync", NULL, NULL) == 0) fsflags |= MNT_SYNCHRONOUS; if (vfs_getopt(optlist, "union", NULL, NULL) == 0) fsflags |= MNT_UNION; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) { error = ENAMETOOLONG; goto bail; } mtx_lock(&Giant); error = vfs_domount(td, fstype, fspath, fsflags, optlist); mtx_unlock(&Giant); bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { strncpy(fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copystr(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len, NULL); } } if (error != 0) vfs_freeopts(optlist); return (error); } /* * --------------------------------------------------------------------- * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int mount(td, uap) struct thread *td; struct mount_args /* { char *type; char *path; int flags; caddr_t data; } */ *uap; { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; int error; /* Kick out MNT_ROOTFS early as it is legal internally */ uap->flags &= ~MNT_ROOTFS; if (uap->data == NULL) return (EINVAL); fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (!error) { mtx_lock(&Giant); /* XXX ? 
*/ vfsp = vfs_byname_kld(fstype, td, &error); mtx_unlock(&Giant); } free(fstype, M_TEMP); if (error) return (error); if (vfsp == NULL) return (ENOENT); if (vfsp->vfc_vfsops->vfs_cmount == NULL) return (EOPNOTSUPP); ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec"); error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td); return (error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Flags common to all filesystems. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ int fsflags, /* Flags common to all filesystems. */ void *fsdata /* Options local to the filesystem. */ ) { struct vnode *vp; struct mount *mp; struct vfsconf *vfsp; int error, flag = 0, kern_flag = 0; struct vattr va; struct nameidata nd; mtx_assert(&Giant, MA_OWNED); /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred)) return (EPERM); if (usermount == 0) { if ((error = suser(td)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & (MNT_EXPORTED | MNT_SUIDDIR)) { if ((error = suser(td)) != 0) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for * unprivileged users. */ if (suser(td) != 0) fsflags |= MNT_NOSUID | MNT_USER; /* * Get vnode to be covered */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (fsflags & MNT_UPDATE) { if ((vp->v_vflag & VV_ROOT) == 0) { vput(vp); return (EINVAL); } mp = vp->v_mount; flag = mp->mnt_flag; kern_flag = mp->mnt_kern_flag; /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ if ((fsflags & MNT_RELOAD) && ((mp->mnt_flag & MNT_RDONLY) == 0)) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error) { vput(vp); return (error); } if (vfs_busy(mp, LK_NOWAIT, 0, td)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp, td); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS); VOP_UNLOCK(vp, 0, td); mp->mnt_optnew = fsdata; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); } else { /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. 
*/ error = VOP_GETATTR(vp, &va, td->td_ucred, td); if (error) { vput(vp); return (error); } if (va.va_uid != td->td_ucred->cr_uid) { if ((error = suser(td)) != 0) { vput(vp); return (error); } } error = vinvalbuf(vp, V_SAVE, td, 0, 0); if (error != 0) { vput(vp); return (error); } if (vp->v_type != VDIR) { vput(vp); return (ENOTDIR); } vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) { vput(vp); return (error); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); /* * Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td); VOP_UNLOCK(vp, 0, td); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = fsdata; } /* * Set the mount level flags. */ if (fsflags & MNT_RDONLY) mp->mnt_flag |= MNT_RDONLY; mp->mnt_flag &=~ MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp, td); if (!error) { if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; VFS_STATFS(mp, &mp->mnt_stat, td); } /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if (mp->mnt_flag & MNT_UPDATE) { mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); if (error) { mp->mnt_flag = flag; mp->mnt_kern_flag = kern_flag; } if ((mp->mnt_flag & MNT_RDONLY) == 0) { if (mp->mnt_syncer == NULL) error = vfs_allocate_syncvnode(mp); } else { if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); mp->mnt_syncer = NULL; } vfs_unbusy(mp, td); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* * Put the new filesystem on the mount list after root. */ cache_purge(vp); if (!error) { struct vnode *newdp; VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp, td)) panic("mount: lost mount"); mountcheckdirs(vp, newdp); vput(newdp); VOP_UNLOCK(vp, 0, td); if ((mp->mnt_flag & MNT_RDONLY) == 0) error = vfs_allocate_syncvnode(mp); vfs_unbusy(mp, td); if (error) vrele(vp); } else { VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vfs_mount_destroy(mp, td); vput(vp); } return (error); } /* * --------------------------------------------------------------------- * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, * not special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int unmount(td, uap) struct thread *td; register struct unmount_args /* { char *path; int flags; } */ *uap; { struct mount *mp; char *pathbuf; int error, id0, id1; if (jailed(td->td_ucred)) return (EPERM); if (usermount == 0) { if ((error = suser(td)) != 0) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } if (uap->flags & MNT_BYFSID) { /* Decode the filesystem ID. 
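/*
 * A tiny user-space sketch of the MNT_BYFSID path decoded just below: a
 * caller may pass "FSID:<val0>:<val1>" instead of a mount path, and the
 * two integers are recovered with sscanf.  The example string is made up.
 */
#include <stdio.h>

int
main(void)
{
	const char *pathbuf = "FSID:1118327203:95";	/* hypothetical fsid */
	int id0, id1;

	if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
		printf("not a filesystem ID\n");
		return (1);
	}
	printf("looking for fsid val[0]=%d val[1]=%d\n", id0, id1);
	return (0);
}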
*/ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) break; } mtx_unlock(&mountlist_mtx); } else { mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) break; } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error) return (error); /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) return (EINVAL); mtx_lock(&Giant); error = dounmount(mp, uap->flags, td); mtx_unlock(&Giant); return (error); } /* * Do the actual filesystem unmount. */ int dounmount(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct vnode *coveredvp, *fsrootvp; int error; int async_flag; mtx_assert(&Giant, MA_OWNED); MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_UNMOUNT) { MNT_IUNLOCK(mp); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT; /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) mp->mnt_kern_flag |= MNTK_UNMOUNTF; error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp), td); if (error) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); MNT_IUNLOCK(mp); return (error); } vn_start_write(NULL, &mp, V_WAIT); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_msync(mp, MNT_WAIT); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; cache_purgevfs(mp); /* remove cache entries for this file sys */ if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); /* * For forced unmounts, move process cdir/rdir refs on the fs root * vnode to the covered vnode. For non-forced unmounts we want * such references to cause an EBUSY error. */ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) { if (mp->mnt_vnodecovered != NULL) mountcheckdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT, td)) == 0) || (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags, td); } vn_finished_write(mp); if (error) { /* Undo cdir/rdir and rootvnode changes made above. 
*/ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) { if (mp->mnt_vnodecovered != NULL) mountcheckdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) (void) vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); MNT_IUNLOCK(mp); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); if ((coveredvp = mp->mnt_vnodecovered) != NULL) coveredvp->v_mountedhere = NULL; mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_UNMOUNT, 0); vfs_mount_destroy(mp, td); if (coveredvp != NULL) vrele(coveredvp); return (0); } /* * --------------------------------------------------------------------- * Mounting of root filesystem * */ struct root_hold_token { const char *who; LIST_ENTRY(root_hold_token) list; }; static LIST_HEAD(, root_hold_token) root_holds = LIST_HEAD_INITIALIZER(&root_holds); struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->who = identifier; mtx_lock(&mountlist_mtx); LIST_INSERT_HEAD(&root_holds, h, list); mtx_unlock(&mountlist_mtx); return (h); } void root_mount_rel(struct root_hold_token *h) { mtx_lock(&mountlist_mtx); LIST_REMOVE(h, list); wakeup(&root_holds); mtx_unlock(&mountlist_mtx); free(h, M_DEVBUF); } static void root_mount_wait(void) { struct root_hold_token *h; for (;;) { DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); mtx_lock(&mountlist_mtx); if (LIST_EMPTY(&root_holds)) { mtx_unlock(&mountlist_mtx); break; } printf("Root mount waiting for:"); LIST_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold", hz); } } static void set_rootvnode(struct thread *td) { struct proc *p; if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode, td)) panic("Cannot find root vnode"); p = td->td_proc; FILEDESC_LOCK(p->p_fd); if (p->p_fd->fd_cdir != NULL) vrele(p->p_fd->fd_cdir); p->p_fd->fd_cdir = rootvnode; VREF(rootvnode); if (p->p_fd->fd_rdir != NULL) vrele(p->p_fd->fd_rdir); p->p_fd->fd_rdir = rootvnode; VREF(rootvnode); FILEDESC_UNLOCK(p->p_fd); VOP_UNLOCK(rootvnode, 0, td); } /* * Mount /devfs as our root filesystem, but do not put it on the mountlist * yet. Create a /dev -> / symlink so that absolute pathnames will lookup. */ static void devfs_first(void) { struct thread *td = curthread; struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp = NULL; int error; vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return; mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td); error = VFS_MOUNT(mp, td); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); set_rootvnode(td); error = kern_symlink(td, "/", "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); } /* * Surgically move our devfs to be mounted on /dev. 
*/ static void devfs_fixup(struct thread *td) { struct nameidata nd; int error; struct vnode *vp, *dvp; struct mount *mp; /* Remove our devfs mount from the mountlist and purge the cache */ mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); cache_purgevfs(mp); VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td); VI_LOCK(dvp); dvp->v_iflag &= ~VI_MOUNT; dvp->v_mountedhere = NULL; VI_UNLOCK(dvp); /* Set up the real rootvnode, and purge the cache */ TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL; set_rootvnode(td); cache_purgevfs(rootvnode->v_mount); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td); error = namei(&nd); if (error) { printf("Lookup of /dev for devfs, error: %d\n", error); return; } NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type != VDIR) { vput(vp); } error = vinvalbuf(vp, V_SAVE, td, 0, 0); if (error) { vput(vp); } cache_purge(vp); mp->mnt_vnodecovered = vp; vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); VOP_UNLOCK(vp, 0, td); vfs_unbusy(mp, td); vput(dvp); /* Unlink the no longer needed /dev/dev -> / symlink */ kern_unlink(td, "/dev/dev", UIO_SYSSPACE); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * Find and mount the root filesystem */ void vfs_mountroot(void) { char *cp; int error, i, asked = 0; root_mount_wait(); devfs_first(); /* * We are booted with instructions to prompt for the root filesystem. */ if (boothowto & RB_ASKNAME) { if (!vfs_mountroot_ask()) return; asked = 1; } /* * The root filesystem information is compiled in, and we are * booted with instructions to use it. */ if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) { if (!vfs_mountroot_try(ctrootdevname)) return; ctrootdevname = NULL; } /* * We've been given the generic "use CDROM as root" flag. This is * necessary because one media may be used in many different * devices, so we need to search for them. */ if (boothowto & RB_CDROM) { for (i = 0; cdrom_rootdevnames[i] != NULL; i++) { if (!vfs_mountroot_try(cdrom_rootdevnames[i])) return; } } /* * Try to use the value read by the loader from /etc/fstab, or * supplied via some other means. This is the preferred * mechanism. */ cp = getenv("vfs.root.mountfrom"); if (cp != NULL) { error = vfs_mountroot_try(cp); freeenv(cp); if (!error) return; } /* * Try values that may have been computed by code during boot */ if (!vfs_mountroot_try(rootdevnames[0])) return; if (!vfs_mountroot_try(rootdevnames[1])) return; /* * If we (still) have a compiled-in default, try it. */ if (ctrootdevname != NULL) if (!vfs_mountroot_try(ctrootdevname)) return; /* * Everything so far has failed, prompt on the console if we haven't * already tried that. */ if (!asked) if (!vfs_mountroot_ask()) return; panic("Root mount failed, startup aborted."); } /* * Mount (mountfrom) as the root filesystem. 
*/ static int vfs_mountroot_try(const char *mountfrom) { struct mount *mp; char *vfsname, *path; time_t timebase; int error; char patt[32]; vfsname = NULL; path = NULL; mp = NULL; error = EINVAL; if (mountfrom == NULL) return (error); /* don't complain */ printf("Trying to mount root from %s\n", mountfrom); /* parse vfs name and path */ vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK); path = malloc(MNAMELEN, M_MOUNT, M_WAITOK); vfsname[0] = path[0] = 0; sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN); if (sscanf(mountfrom, patt, vfsname, path) < 1) goto out; if (path[0] == '\0') strcpy(path, ROOTNAME); error = kernel_vmount( MNT_RDONLY | MNT_ROOTFS, "fstype", vfsname, "fspath", "/", "from", path, NULL); if (error == 0) { /* * We mount devfs prior to mounting the / FS, so the first * entry will typically be devfs. */ mp = TAILQ_FIRST(&mountlist); KASSERT(mp != NULL, ("%s: mountlist is empty", __func__)); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Typically devfs has no time stamp and the only other FS * is the actual / FS. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; do { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } while (mp != NULL); inittodr(timebase); devfs_fixup(curthread); } out: free(path, M_MOUNT); free(vfsname, M_MOUNT); return (error); } /* * --------------------------------------------------------------------- * Interactive root filesystem selection code. */ static int vfs_mountroot_ask(void) { char name[128]; for(;;) { printf("\nManual root filesystem specification:\n"); printf(" : Mount using filesystem \n"); #if defined(__i386__) || defined(__ia64__) printf(" eg. ufs:da0s1a\n"); #else printf(" eg. ufs:/dev/da0a\n"); #endif printf(" ? List valid disk boot devices\n"); printf(" Abort manual input\n"); printf("\nmountroot> "); gets(name, sizeof(name), 1); if (name[0] == '\0') return (1); if (name[0] == '?') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (!vfs_mountroot_try(name)) return (0); } } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. */ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; const char **t, *p; TAILQ_FOREACH(opt, opts, link) { p = opt->name; if (p[0] == 'n' && p[1] == 'o') p += 2; for(t = global_opts; *t != NULL; t++) if (!strcmp(*t, p)) break; if (*t != NULL) continue; for(t = legal; *t != NULL; t++) if (!strcmp(*t, p)) break; if (*t != NULL) continue; printf("mount option <%s> is unknown\n", p); return (EINVAL); } return (0); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. 
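/*
 * A user-space sketch of the "vfsname:path" parsing in vfs_mountroot_try()
 * above: a scanf pattern is built so the filesystem name and the device
 * path each fit their buffers, and a missing path falls back to a default
 * name.  Buffer sizes and the example root specification are illustrative
 * only.
 */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *mountfrom = "ufs:/dev/da0s1a";	/* example root spec */
	char vfsname[16], path[64], patt[32];

	vfsname[0] = path[0] = '\0';
	snprintf(patt, sizeof(patt), "%%%d[a-z0-9]:%%%ds",
	    (int)sizeof(vfsname) - 1, (int)sizeof(path) - 1);
	if (sscanf(mountfrom, patt, vfsname, path) < 1)
		return (1);
	if (path[0] == '\0')
		strcpy(path, "root_device");	/* the ROOTNAME fallback */
	printf("fstype=%s from=%s\n", vfsname, path);
	return (0);
}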
*/ int vfs_getopt(opts, name, buf, len) struct vfsoptlist *opts; const char *name; void **buf; int *len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } static int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; int i; if (opts == NULL) return (-1); i = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) return (i); ++i; } return (-1); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; if (((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) { va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(opts, name, dest, len) struct vfsoptlist *opts; const char *name; void *dest; int len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } /* * This is a helper function for filesystems to traverse their * vnodes. 
See MNT_VNODE_FOREACH() in sys/mount.h */ struct vnode * -__mnt_vnode_next(struct vnode **nvp, struct mount *mp) +__mnt_vnode_next(struct vnode **mvp, struct mount *mp) { struct vnode *vp; - mtx_assert(&mp->mnt_mtx, MA_OWNED); + mtx_assert(MNT_MTX(mp), MA_OWNED); - vp = *nvp; + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + vp = TAILQ_NEXT(*mvp, v_nmntvnodes); + while (vp != NULL && vp->v_type == VMARKER) + vp = TAILQ_NEXT(vp, v_nmntvnodes); + /* Check if we are done */ - if (vp == NULL) + if (vp == NULL) { + __mnt_vnode_markerfree(mvp, mp); return (NULL); - /* If our next vnode is no longer ours, start over */ - if (vp->v_mount != mp) - vp = TAILQ_FIRST(&mp->mnt_nvnodelist); - /* Save pointer to next vnode in list */ - if (vp != NULL) - *nvp = TAILQ_NEXT(vp, v_nmntvnodes); - else - *nvp = NULL; + } + TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); + TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); return (vp); } + +struct vnode * +__mnt_vnode_first(struct vnode **mvp, struct mount *mp) +{ + struct vnode *vp; + + mtx_assert(MNT_MTX(mp), MA_OWNED); + + vp = TAILQ_FIRST(&mp->mnt_nvnodelist); + while (vp != NULL && vp->v_type == VMARKER) + vp = TAILQ_NEXT(vp, v_nmntvnodes); + + /* Check if we are done */ + if (vp == NULL) { + *mvp = NULL; + return (NULL); + } + mp->mnt_holdcnt++; + MNT_IUNLOCK(mp); + *mvp = (struct vnode *) malloc(sizeof(struct vnode), + M_VNODE_MARKER, + M_WAITOK | M_ZERO); + MNT_ILOCK(mp); + (*mvp)->v_type = VMARKER; + + vp = TAILQ_FIRST(&mp->mnt_nvnodelist); + while (vp != NULL && vp->v_type == VMARKER) + vp = TAILQ_NEXT(vp, v_nmntvnodes); + + /* Check if we are done */ + if (vp == NULL) { + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + MNT_ILOCK(mp); + *mvp = NULL; + mp->mnt_holdcnt--; + if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0) + wakeup(&mp->mnt_holdcnt); + return (NULL); + } + mp->mnt_markercnt++; + (*mvp)->v_mount = mp; + TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); + return (vp); +} + + +void +__mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp) +{ + + if (*mvp == NULL) + return; + + mtx_assert(MNT_MTX(mp), MA_OWNED); + + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + MNT_ILOCK(mp); + *mvp = NULL; + + mp->mnt_markercnt--; + mp->mnt_holdcnt--; + if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0) + wakeup(&mp->mnt_holdcnt); +} + int __vfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { int error; error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat, td); if (sbp != &mp->mnt_stat) *sbp = mp->mnt_stat; return (error); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. 
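 * (Illustrative: with a name such as "noatime", a nonzero flag strips the
 * leading "no" and records "atime", while a zero flag records "noatime".)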
* name must start with "no". */ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, use printf. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, int flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) 
{ struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, -1); } va_end(ap); error = kernel_mount(ma, flags); return (error); } Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c (revision 154151) +++ head/sys/kern/vfs_subr.c (revision 154152) @@ -1,3924 +1,3919 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure"); static void delmntque(struct vnode *vp); static void insmntque(struct vnode *vp, struct mount *mp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void vbusy(struct vnode *vp); static void vdropl(struct vnode *vp); static void vinactive(struct vnode *, struct thread *); static void v_incr_usecount(struct vnode *); static void v_decr_usecount(struct vnode *); static void v_decr_useonly(struct vnode *); static void vfree(struct vnode *); static void vnlru_free(int); static void vdestroy(struct vnode *); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static int vfs_knllocked(void *arg); /* * Enable Giant pushdown based on whether or not the vm is mpsafe in this * build. Without mpsafevm the buffer cache can not run Giant free. */ #if defined(__alpha__) || defined(__amd64__) || defined(__i386__) || \ defined(__ia64__) || defined(__sparc64__) int mpsafe_vfs = 1; #else int mpsafe_vfs; #endif TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs); SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0, "MPSAFE VFS"); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, never decreased. */ static unsigned long numvnodes; SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; -int vttoif_tab[9] = { +int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, - S_IFSOCK, S_IFIFO, S_IFMT, + S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Free vnode target. Free vnodes may simply be files which have been stat'd * but not read. This is somewhat common, and a small cache of such files * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes; SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. 
*/ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* Set to 1 to print out reclaim of active vnodes */ int prtactive; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. 
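 *
 * (Illustrative, hypothetical consumer of this knob: a filesystem sizing
 * its inode hash along the lines of
 * "ihashtbl = hashinit(desiredvnodes, M_TEMP, &ihash);".)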
*/ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* Hook for calling soft updates. */ int (*softdep_process_worklist_hook)(struct mount *); /* * Macros to control when a vnode is freed and recycled. All require * the vnode interlock. */ #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt) /* * Initialize the vnode management data structures. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX 100000 #endif static void vntblinit(void *dummy __unused) { /* * Desiredvnodes is a function of the physical memory size and * the kernel's heap size. Specifically, desiredvnodes scales * in proportion to the physical memory size until two fifths * of the kernel's heap size is consumed by vnodes and vm * objects. */ desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. */ int vfs_busy(mp, flags, interlkp, td) struct mount *mp; int flags; struct mtx *interlkp; struct thread *td; { int lkflags; MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) { MNT_IUNLOCK(mp); return (ENOENT); } if (interlkp) mtx_unlock(interlkp); mp->mnt_kern_flag |= MNTK_MWAIT; /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ msleep(mp, MNT_MTX(mp), PVFS|PDROP, "vfs_busy", 0); if (interlkp) mtx_lock(interlkp); return (ENOENT); } if (interlkp) mtx_unlock(interlkp); lkflags = LK_SHARED | LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(mp, td) struct mount *mp; struct thread *td; { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); } /* * Lookup a mount point by filesystem identifier. 
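 *
 * Minimal lookup sketch (hypothetical caller turning a file handle into
 * its mount point):
 *
 *	struct mount *mp;
 *
 *	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL)
 *		return (ESTALE);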
*/ struct mount * vfs_getvfs(fsid) fsid_t *fsid; { struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access priveledged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if ((mp->mnt_flag & MNT_USER) == 0 || mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = suser(td)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(mp) struct mount *mp; { static u_int16_t mntid_base; fsid_t tfsid; int mtype; mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if (vfs_getvfs(&tfsid) == NULL) break; } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, ""); /* * Get a current timestamp. */ void vfs_timestamp(tsp) struct timespec *tsp; { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(vap) struct vattr *vap; { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. 
* the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desireable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int vlrureclaim(struct mount *mp) { struct thread *td; struct vnode *vp; int done; int trigger; int usevnodes; int count; /* * Calculate the trigger point, don't allow user * screwups to blow us up. This prevents us from * recycling vnodes with lots of resident pages. We * aren't trying to free memory, we are trying to * free vnodes. */ usevnodes = desiredvnodes; if (usevnodes <= 0) usevnodes = 1; trigger = cnt.v_page_count * 2 / usevnodes; done = 0; td = curthread; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize / 10 + 1; - while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) { + while (count != 0) { + vp = TAILQ_FIRST(&mp->mnt_nvnodelist); + while (vp != NULL && vp->v_type == VMARKER) + vp = TAILQ_NEXT(vp, v_nmntvnodes); + if (vp == NULL) + break; TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT, td)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK, td); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); vgonel(vp); VOP_UNLOCK(vp, 0, td); vdropl(vp); done++; next_iter_mntunlocked: if ((count % 256) != 0) goto relock_mnt; goto yield; next_iter: if ((count % 256) != 0) continue; MNT_IUNLOCK(mp); yield: uio_yield(); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } /* * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) { struct vnode *vp; mtx_assert(&vnode_free_list_mtx, MA_OWNED); for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (!vp) break; VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); /* * Don't recycle if we can't get the interlock. 
*/ if (!VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); continue; } VNASSERT(VCANRECYCLE(vp), vp, ("vp inconsistent on freelist")); freevnodes--; vp->v_iflag &= ~VI_FREE; vholdl(vp); mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vtryrecycle(vp); /* * If the recycled succeeded this vdrop will actually free * the vnode. If not it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; int done; struct proc *p = vnlruproc; struct thread *td = FIRST_THREAD_IN_PROC(p); mtx_lock(&Giant); EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); for (;;) { kthread_suspend_check(p); mtx_lock(&vnode_free_list_mtx); if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { int vfsunlocked; if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (!VFS_NEEDSGIANT(mp)) { mtx_unlock(&Giant); vfsunlocked = 1; } else vfsunlocked = 0; done += vlrureclaim(mp); if (vfsunlocked) mtx_lock(&Giant); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (done == 0) { #if 0 /* These messages are temporary debugging aids */ if (vnlru_nowhere < 5) printf("vnlru process getting nowhere..\n"); else if (vnlru_nowhere == 5) printf("vnlru process messages stopped.\n"); #endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else uio_yield(); } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp) /* * Routines having to do with the management of the vnode table. 
*/ static void vdestroy(struct vnode *vp) { struct bufobj *bo; CTR1(KTR_VFS, "vdestroy vp %p", vp); mtx_lock(&vnode_free_list_mtx); numvnodes--; mtx_unlock(&vnode_free_list_mtx); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VI_UNLOCK(vp); #ifdef MAC mac_destroy_vnode(vp); #endif if (vp->v_pollinfo != NULL) { knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note); mtx_destroy(&vp->v_pollinfo->vpi_lock); uma_zfree(vnodepoll_zone, vp->v_pollinfo); } #ifdef INVARIANTS /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); uma_zfree(vnode_zone, vp); } /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct thread *td = curthread; struct mount *vnmp; CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0) return (EWOULDBLOCK); /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0, td); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, LK_INTERLOCK, td); vn_finished_write(vnmp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) vgonel(vp); VOP_UNLOCK(vp, LK_INTERLOCK, td); vn_finished_write(vnmp); CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp); return (0); } /* * Return the next vnode from the free list. */ int getnewvnode(tag, mp, vops, vpp) const char *tag; struct mount *mp; struct vop_vector *vops; struct vnode **vpp; { struct vnode *vp = NULL; struct bufobj *bo; mtx_lock(&vnode_free_list_mtx); /* * Lend our context to reclaim vnodes if they've exceeded the max. */ if (freevnodes > wantfreevnodes) vnlru_free(1); /* * Wait for available vnodes. */ if (numvnodes > desiredvnodes) { if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. 
*/ if (numvnodes > desiredvnodes) { mtx_unlock(&vnode_free_list_mtx); return (ENFILE); } #endif } numvnodes++; mtx_unlock(&vnode_free_list_mtx); vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems * opt-in. */ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE); /* * Initialize bufobj. */ bo = &vp->v_bufobj; bo->__bo_vnode = vp; bo->bo_mtx = &vp->v_interlock; bo->bo_ops = &buf_ops_bio; bo->bo_private = vp; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Finalize various vnode identity bits. */ vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_incr_usecount(vp); vp->v_data = 0; #ifdef MAC mac_init_vnode(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_associate_vnode_singlelabel(mp, vp); else if (mp == NULL) printf("NULL mp in getnewvnode()\n"); #endif delmntque(vp); if (mp != NULL) { insmntque(vp, mp); bo->bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp); *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; if (vp->v_mount == NULL) return; mp = vp->v_mount; MNT_ILOCK(mp); vp->v_mount = NULL; VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_IUNLOCK(mp); } /* * Insert into list of vnodes for the new mount point, if available. */ static void insmntque(struct vnode *vp, struct mount *mp) { vp->v_mount = mp; VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); MNT_ILOCK(vp->v_mount); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, + ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; MNT_IUNLOCK(vp->v_mount); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. 
*/ do { bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_pip_wait(bo->bo_object, "bovlbx"); VM_OBJECT_UNLOCK(bo->bo_object); } BO_LOCK(bo); } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); VM_OBJECT_UNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo) { CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(bufv, flags, bo, slpflag, slptimeo) struct bufv *bufv; int flags; struct bufobj *bo; int slpflag, slptimeo; { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_LOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { continue; } lblkno = 0; xflags = 0; if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if (bp->b_bufobj != bo) { /* XXX: necessary ? */ BUF_UNLOCK(bp); BO_LOCK(bo); return (EAGAIN); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp != NULL && (nbp->b_bufobj != bo || nbp->b_lblkno != lblkno || (nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags)) break; /* nbp invalid */ } return (retval); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize) { struct buf *bp, *nbp; int anyfreed; int trunclbn; struct bufobj *bo; CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length); /* * Round up to the *next* lbn. 
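	 * E.g. (illustrative) with a 512 byte blksize: length 0 yields
	 * trunclbn 0, length 512 yields 1, and length 513 yields 2.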
*/ trunclbn = (length + blksize - 1) / blksize; ASSERT_VOP_LOCKED(vp, "vtruncbuf"); restart: VI_LOCK(vp); bo = &vp->v_bufobj; anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { goto restart; } VI_LOCK(vp); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { goto restart; } VI_LOCK(vp); } } if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) { goto restart; } VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); VI_LOCK(vp); goto restartsync; } } bufobj_wwait(bo, 0, 0); VI_UNLOCK(vp); vnode_pager_setsize(vp, length); return (0); } /* * buf_splay() - splay tree core for the clean/dirty list of buffers in * a vnode. * * NOTE: We have to deal with the special case of a background bitmap * buffer, a situation where two buffers will have the same logical * block offset. We want (1) only the foreground buffer to be accessed * in a lookup and (2) must differentiate between the foreground and * background buffer in the splay tree algorithm because the splay * tree cannot normally handle multiple entities with the same 'index'. * We accomplish this by adding differentiating flags to the splay tree's * numerical domain. */ static struct buf * buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root) { struct buf dummy; struct buf *lefttreemax, *righttreemin, *y; if (root == NULL) return (NULL); lefttreemax = righttreemin = &dummy; for (;;) { if (lblkno < root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_left) == NULL) break; if (lblkno < y->b_lblkno) { /* Rotate right. */ root->b_left = y->b_right; y->b_right = root; root = y; if ((y = root->b_left) == NULL) break; } /* Link into the new root's right tree. */ righttreemin->b_left = root; righttreemin = root; } else if (lblkno > root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_right) == NULL) break; if (lblkno > y->b_lblkno) { /* Rotate left. */ root->b_right = y->b_left; y->b_left = root; root = y; if ((y = root->b_right) == NULL) break; } /* Link into the new root's left tree. */ lefttreemax->b_right = root; lefttreemax = root; } else { break; } root = y; } /* Assemble the new root. 
*/ lefttreemax->b_right = root->b_left; righttreemin->b_left = root->b_right; root->b_left = dummy.b_right; root->b_right = dummy.b_left; return (root); } static void buf_vlist_remove(struct buf *bp) { struct buf *root; struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_LOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; if (bp != bv->bv_root) { root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); KASSERT(root == bp, ("splay lookup failed in remove")); } if (bp->b_left == NULL) { root = bp->b_right; } else { root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left); root->b_right = bp->b_right; } bv->bv_root = root; TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list using a * splay tree algorithm. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct buf *root; struct bufv *bv; ASSERT_BO_LOCKED(bo); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); if (root == NULL) { bp->b_left = NULL; bp->b_right = NULL; TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); } else if (bp->b_lblkno < root->b_lblkno || (bp->b_lblkno == root->b_lblkno && (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { bp->b_left = root->b_left; bp->b_right = root; root->b_left = NULL; TAILQ_INSERT_BEFORE(root, bp, b_bobufs); } else { bp->b_right = root->b_right; bp->b_left = root; root->b_right = NULL; TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs); } bv->bv_cnt++; bv->bv_root = bp; } /* * Lookup a buffer using the splay tree. Note that we specifically avoid * shadow buffers used in background bitmap writes. * * This code isn't quite efficient as it could be because we are maintaining * two sorted lists and do not know which list the block resides in. * * During a "make buildworld" the desired buffer is found at one of * the roots more than 60% of the time. Thus, checking both roots * before performing either splay eliminates unnecessary splays on the * first tree splayed. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); if ((bp = bo->bo_clean.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_dirty.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_clean.bv_root) != NULL) { bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } if ((bp = bo->bo_dirty.bv_root) != NULL) { bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } return (NULL); } /* * Associate a buffer with a vnode. 
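 *
 * Illustrative call pattern (hypothetical caller); bgetvp() asserts that
 * the vnode interlock is held when it is entered:
 *
 *	VI_LOCK(vp);
 *	bgetvp(vp, bp);
 *	VI_UNLOCK(vp);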
*/ void bgetvp(struct vnode *vp, struct buf *bp) { VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); ASSERT_VI_LOCKED(vp, "bgetvp"); vholdl(vp); bp->b_vp = vp; bp->b_bufobj = &vp->v_bufobj; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; vdropl(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_LOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) static int sync_vnode(struct bufobj *bo, struct thread *td) { struct vnode *vp; struct mount *mp; vp = bo->__bo_vnode; /* XXX */ if (VOP_ISLOCKED(vp, NULL) != 0) return (1); if (VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (1); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); VI_LOCK(vp); if ((bo->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(bo, syncdelay); } vdropl(vp); mtx_lock(&sync_mtx); return (0); } /* * System filesystem synchronizer daemon. 
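 *
 * (Illustrative arithmetic, assuming the default SYNCER_MAXDELAY of 32:
 * the pending table is a 32-slot ring, so a request queued while
 * syncer_delayno is 30 with a delay of 15 lands in slot
 * (30 + 15) & 31 == 13.)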
*/ static void sched_sync(void) { struct synclist *next; struct synclist *slp; struct bufobj *bo; long starttime; struct thread *td = FIRST_THREAD_IN_PROC(updateproc); static int dummychan; int last_work_seen; int net_worklist_len; int syncer_final_iter; int first_printf; int error; mtx_lock(&Giant); last_work_seen = 0; syncer_final_iter = 0; first_printf = 1; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); for (;;) { mtx_lock(&sync_mtx); if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kthread_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining..."); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while ((bo = LIST_FIRST(slp)) != NULL) { error = sync_vnode(bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; mtx_unlock(&sync_mtx); /* * Do soft update processing. */ if (softdep_process_worklist_hook != NULL) (*softdep_process_worklist_hook)(NULL); /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ mtx_lock(&sync_mtx); if (rushjob > 0) { rushjob -= 1; mtx_unlock(&sync_mtx); continue; } mtx_unlock(&sync_mtx); /* * Just sleep for a short period if time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. 
*/ if (syncer_state != SYNCER_RUNNING) tsleep(&dummychan, PPAUSE, "syncfnl", hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) tsleep(&lbolt, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer() { struct thread *td; int ret = 0; td = FIRST_THREAD_IN_PROC(updateproc); sleepq_remove(td, &lbolt); mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { struct thread *td; if (howto & RB_NOSYNC) return; td = FIRST_THREAD_IN_PROC(updateproc); sleepq_remove(td, &lbolt); mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); kproc_shutdown(arg, howto); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ VI_LOCK(vp); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif VI_UNLOCK(vp); } /* * Increment the use and hold counts on the vnode, taking care to reference * the driver's usecount if this is a chardev. The vholdl() will remove * the vnode from the free list if it is presently free. Requires the * vnode interlock and returns with it held. 
*/ static void v_incr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } vholdl(vp); } /* * Decrement the vnode use and hold count along with the driver's usecount * if this is a chardev. The vdropl() below releases the vnode interlock * as it may free the vnode. */ static void v_decr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_usecount: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } vdropl(vp); } /* * Decrement only the use count and driver use count. This is intended to * be paired with a follow on vdropl() to release the remaining hold count. * In this way we may vgone() a vnode with a 0 usecount without risk of * having it end up on a free list because the hold count is kept above 0. */ static void v_decr_useonly(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_useonly: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set if the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new filesystem type). */ int vget(vp, flags, td) struct vnode *vp; int flags; struct thread *td; { int oweinact; int oldflags; int usecount; int error; error = 0; oldflags = flags; oweinact = 0; if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); /* * If the inactive call was deferred because vput() was called * with a shared lock, we have to do it here before another thread * gets a reference to data that should be dead. */ if (vp->v_iflag & VI_OWEINACT) { if (flags & LK_NOWAIT) { VI_UNLOCK(vp); return (EBUSY); } flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; oweinact = 1; } usecount = vp->v_usecount; v_incr_usecount(vp); if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) { VI_LOCK(vp); /* * must expand vrele here because we do not want * to call VOP_INACTIVE if the reference count * drops back to zero since it was never really * active. */ v_decr_usecount(vp); /* * Print warning when race below occur: * * thread1 thread2 * ------- ------- * v_usecount=0 * vref(vp) v_usecount=1 * vget(vp) * v_incr_usecount(vp) v_usecount=2 * vn_lock(vp) * vrele(vp) v_usecount=1 * v_decr_usecount(vp) v_usecount=0 * * In such situation VOP_INACTIVE() will not be called for * the vnode vp. */ if (usecount > 0 && vp->v_usecount == 0) printf("vinactive() won't be called for vp=%p\n", vp); return (error); } if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); if (oweinact) { VI_LOCK(vp); if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VI_UNLOCK(vp); if ((oldflags & LK_TYPE_MASK) == 0) VOP_UNLOCK(vp, 0, td); } return (0); } /* * Increase the reference count of a vnode. 
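 *
 * Illustrative pairing (hypothetical caller): every vref() is balanced by
 * a later vrele() once the reference is no longer needed:
 *
 *	vref(vp);
 *	... use vp ...
 *	vrele(vp);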
*/ void vref(struct vnode *vp) { VI_LOCK(vp); v_incr_usecount(vp); VI_UNLOCK(vp); } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism other * than the VI lock is used to stop other processes from gaining references * to the vnode. This may be the case if the caller holds the only reference. * This is also useful when stale data is acceptable as race conditions may * be accounted for by some other means. */ int vrefcnt(struct vnode *vp) { int usecnt; VI_LOCK(vp); usecnt = vp->v_usecount; VI_UNLOCK(vp); return (usecnt); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(vp) struct vnode *vp; { struct thread *td = curthread; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vrele: missed vn_close")); if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); #endif VI_UNLOCK(vp); panic("vrele: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. */ if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) { VI_LOCK(vp); vinactive(vp, td); VOP_UNLOCK(vp, 0, td); } else VI_LOCK(vp); vdropl(vp); } /* * Release an already locked vnode. This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() aquires the lock internally.) */ void vput(vp) struct vnode *vp; { struct thread *td = curthread; /* XXX */ int error; KASSERT(vp != NULL, ("vput: null vp")); ASSERT_VOP_LOCKED(vp, "vput"); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vput: missed vn_close")); error = 0; if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { VOP_UNLOCK(vp, 0, td); v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); vp->v_iflag |= VI_OWEINACT; if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_EXCLUPGRADE|LK_INTERLOCK|LK_NOWAIT, td); VI_LOCK(vp); if (error) goto done; } if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0, td); done: vdropl(vp); } /* * Somebody doesn't want the vnode recycled. */ void vhold(struct vnode *vp) { VI_LOCK(vp); vholdl(vp); VI_UNLOCK(vp); } void vholdl(struct vnode *vp) { vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). */ void vdrop(struct vnode *vp) { VI_LOCK(vp); vdropl(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we will free it if it has been vgone'd otherwise it is * placed on the free list. 
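 *
 * Hold counts pair with vhold()/vholdl(); a typical pattern (illustrative
 * only, mirroring vflush() below) keeps the vnode from being recycled
 * while the caller sleeps waiting for its lock:
 *
 *	VI_LOCK(vp);
 *	vholdl(vp);
 *	error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
 *	...
 *	vdrop(vp);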
*/ static void vdropl(struct vnode *vp) { if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); vp->v_holdcnt--; if (vp->v_holdcnt == 0) { if (vp->v_iflag & VI_DOOMED) { vdestroy(vp); return; } else vfree(vp); } VI_UNLOCK(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. */ static void vinactive(struct vnode *vp, struct thread *td) { ASSERT_VOP_LOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush(mp, rootrefs, flags, td) struct mount *mp; int rootrefs; int flags; struct thread *td; { - struct vnode *vp, *nvp, *rootvp = NULL; + struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR1(KTR_VFS, "vflush: mp %p", mp); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0) return (error); vput(rootvp); } MNT_ILOCK(mp); loop: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); vholdl(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td); if (error) { vdrop(vp); MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0, td); vdrop(vp); MNT_ILOCK(mp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount == 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0, td); vdropl(vp); MNT_ILOCK(mp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. 
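 *
 * The per-mount traversal in this function uses the marker-based
 * MNT_VNODE_FOREACH(); whenever the scan must be restarted, the marker
 * has to be taken off the vnode list first.  Condensed sketch of the
 * pattern (illustrative only, following the code above):
 *
 *	MNT_ILOCK(mp);
 * loop:
 *	MNT_VNODE_FOREACH(vp, mp, mvp) {
 *		VI_LOCK(vp);
 *		vholdl(vp);
 *		MNT_IUNLOCK(mp);
 *		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
 *		if (error) {
 *			vdrop(vp);
 *			MNT_ILOCK(mp);
 *			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 *			goto loop;
 *		}
 *		... work on vp ...
 *		MNT_ILOCK(mp);
 *	}
 *	MNT_IUNLOCK(mp);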
*/ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { VNASSERT(vp->v_usecount == 0 || (vp->v_type != VCHR && vp->v_type != VBLK), vp, ("device VNODE %p is FORCECLOSED", vp)); vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif } VOP_UNLOCK(vp, 0, td); vdropl(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td); vgone(rootvp); VOP_UNLOCK(rootvp, 0, td); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) return (EBUSY); for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp, struct thread *td) { int recycled; ASSERT_VOP_LOCKED(vp, "vrecycle"); recycled = 0; VI_LOCK(vp); if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } VI_UNLOCK(vp); return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * vgone, with the vp interlock held. */ void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; CTR1(KTR_VFS, "vgonel: vp %p", vp); ASSERT_VOP_LOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); #if 0 /* XXX Need to fix ttyvp before I enable this. */ VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); #endif td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_write_suspend_wait(vp, NULL, V_WAIT); if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0) vinvalbuf(vp, 0, td, 0, 0); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(vp) struct vnode *vp; { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(dev) struct cdev *dev; { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. 
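 *
 * Illustrative call (not part of this change); the format string is
 * printed ahead of the vnode description:
 *
 *	vn_printf(vp, "vflush: busy vnode ");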
*/ static char *typename[] = -{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", + "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[96]; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strcat(buf, "|VV_ROOT"); if (vp->v_vflag & VV_TEXT) strcat(buf, "|VV_TEXT"); if (vp->v_vflag & VV_SYSTEM) strcat(buf, "|VV_SYSTEM"); if (vp->v_iflag & VI_DOOMED) strcat(buf, "|VI_DOOMED"); if (vp->v_iflag & VI_FREE) strcat(buf, "|VI_FREE"); printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count); printf(" "); lockmgr_printinfo(vp->v_vnlock); printf("\n"); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB #include /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp, *nmp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ printf("Locked vnodes\n"); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (VOP_ISLOCKED(vp, NULL)) vprint("", vp); } nmp = TAILQ_NEXT(mp, mnt_list); } } #endif /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static void vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp) { strcpy(xvfsp->vfc_name, vfsp->vfc_name); xvfsp->vfc_typenum = vfsp->vfc_typenum; xvfsp->vfc_refcount = vfsp->vfc_refcount; xvfsp->vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp->vfc_vfsops = NULL; xvfsp->vfc_next = NULL; } /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; struct xvfsconf xvfsp; int error; error = 0; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp); if (error) break; } return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; struct xvfsconf xvfsp; printf("WARNING: userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
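 *
 * The non-deprecated interface is the "vfs.conflist" node registered
 * above; a userland sketch (illustrative only) reads it with
 * sysctlbyname(3):
 *
 *	size_t len;
 *	struct xvfsconf *xvfsp;
 *
 *	sysctlbyname("vfs.conflist", NULL, &len, NULL, 0);
 *	xvfsp = malloc(len);
 *	sysctlbyname("vfs.conflist", xvfsp, &len, NULL, 0);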
*/ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct thread *td = req->td; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp, td); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall() { struct mount *mp; struct thread *td; int error; KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread")); td = curthread; /* * Since this only runs when rebooting, it is not interlocked. 
*/ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, td); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); /* * XXX: Due to the way in which we mount the root * file system off of devfs, devfs will generate a * "busy" warning when we try to unmount it before * the root. Don't print a warning as a result in * order to avoid false positive errors that may * cause needless upset. */ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } else { /* The unmount has removed mp from the mountlist */ } } } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { - struct vnode *vp, *nvp; + struct vnode *vp, *mvp; struct vm_object *obj; - int tries; - tries = 5; MNT_ILOCK(mp); -loop: - TAILQ_FOREACH_SAFE(vp, &mp->mnt_nvnodelist, v_nmntvnodes, nvp) { - if (vp->v_mount != mp) { - if (--tries > 0) - goto loop; - break; - } - + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OBJDIRTY) && (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { MNT_IUNLOCK(mp); if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); MNT_ILOCK(mp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_UNLOCK(obj); } vput(vp); } MNT_ILOCK(mp); - if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) { - if (--tries > 0) - goto loop; - break; - } } else VI_UNLOCK(vp); } MNT_IUNLOCK(mp); } /* * Mark a vnode as free, putting it up for recycling. */ static void vfree(struct vnode *vp) { CTR1(KTR_VFS, "vfree vp %p", vp); ASSERT_VI_LOCKED(vp, "vfree"); mtx_lock(&vnode_free_list_mtx); VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't")); VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp, ("vfree: Freeing doomed vnode")); if (vp->v_iflag & VI_AGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); } /* * Opposite of vfree() - mark a vnode as in use. */ static void vbusy(struct vnode *vp) { CTR1(KTR_VFS, "vbusy vp %p", vp); ASSERT_VI_LOCKED(vp, "vbusy"); VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed.")); mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; vp->v_iflag &= ~(VI_FREE|VI_AGE); mtx_unlock(&vnode_free_list_mtx); } /* * Initalize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; vi = uma_zalloc(vnodepoll_zone, M_WAITOK); if (vp->v_pollinfo != NULL) { uma_zfree(vnodepoll_zone, vi); return; } vp->v_pollinfo = vi; mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knllocked); } /* * Record a process's interest in events which might happen to * a vnode. 
Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(vp, td, events) struct vnode *vp; struct thread *td; short events; { if (vp->v_pollinfo == NULL) v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return events; } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return 0; } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ int vfs_allocate_syncvnode(mp) struct mount *mp; { struct vnode *vp; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } VI_LOCK(vp); vn_syncer_add_to_worklist(&vp->v_bufobj, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; mtx_unlock(&sync_mtx); VI_UNLOCK(vp); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; struct thread *td = ap->a_td; int error, asyncflag; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. 
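 *
 * Condensed flow of the lazy pass below (illustrative only):
 *
 *	vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td);
 *	vn_start_write(NULL, &mp, V_NOWAIT);
 *	vfs_msync(mp, MNT_NOWAIT);
 *	error = VFS_SYNC(mp, MNT_LAZY, td);
 *	vn_finished_write(mp);
 *	vfs_unbusy(mp, td);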
*/ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp, td); return (0); } asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY, td); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; vn_finished_write(mp); vfs_unbusy(mp, td); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct bufobj *bo; VI_LOCK(vp); bo = &vp->v_bufobj; vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } VI_UNLOCK(vp); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(vp, errp) struct vnode *vp; int *errp; { int error; error = 0; dev_lock(); if (vp->v_type != VCHR) error = ENOTBLK; else if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. */ int vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) enum vtype type; mode_t file_mode; uid_t file_uid; gid_t file_gid; mode_t acc_mode; struct ucred *cred; int *privused; { mode_t dac_granted; #ifdef CAPABILITIES mode_t cap_granted; #endif /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check everyone else. 
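 *
 * Worked example (illustrative only, not part of this change):
 *
 *	file_mode = 0644, file_uid = 1001, file_gid = 20
 *	cred: uid 2002, member of group 20
 *	acc_mode = VREAD | VWRITE
 *
 *	owner check:  skipped (uid mismatch)
 *	group check:  S_IRGRP grants VREAD only; VWRITE still missing,
 *	              so control transfers to privcheck
 *	privcheck:    succeeds only if suser_cred(cred, SUSER_ALLOWJAIL)
 *	              returns 0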
*/ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); privcheck: if (!suser_cred(cred, SUSER_ALLOWJAIL)) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } #ifdef CAPABILITIES /* * Build a capability mask to determine if the set of capabilities * satisfies the requirements when combined with the granted mask * from above. * For each capability, if the capability is required, bitwise * or the request type onto the cap_granted mask. */ cap_granted = 0; if (type == VDIR) { /* * For directories, use CAP_DAC_READ_SEARCH to satisfy * VEXEC requests, instead of CAP_DAC_EXECUTE. */ if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL)) cap_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL)) cap_granted |= VEXEC; } if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL)) cap_granted |= (VWRITE | VAPPEND); if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL)) cap_granted |= VADMIN; if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } #endif return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, int access) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly * manipulate system attributes. * * XXX What capability should apply here? * Probably CAP_SYS_SETFFLAG. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (suser_cred(cred, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, access, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to supress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, ""); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, ""); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, ""); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
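 *
 * These knobs are runtime sysctls under the debug tree, e.g.
 * (illustrative only):
 *
 *	sysctl debug.vfs_badlock_ddb=0
 *	sysctl debug.vfs_badlock_backtrace=1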
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, ""); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter("lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0) vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER) vfs_badlock("is not exclusive locked by another thread", str, vp); } void assert_vop_slocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_SHARED) vfs_badlock("is not locked shared but should be", str, vp); } #endif /* 0 */ #endif /* DEBUG_VFS_LOCKS */ void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp != a->a_fdvp) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp != a->a_fvp) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: tvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } void vop_strategy_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. 
*/ if ((bp->b_flags & B_CLUSTER) != 0) return; if (BUF_REFCNT(bp) < 1) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter("lock violation"); } #endif } void vop_lookup_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); #endif } void vop_lookup_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; struct vnode *vp; a = ap; dvp = a->a_dvp; vp = *(a->a_vpp); ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); if (!rc) ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)"); #endif } void vop_lock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lock_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_lock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lock_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_unlock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_unlock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; if (!rc) { VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init(&fs_knlist, NULL, NULL, NULL, NULL); } /* XXX - correct order? 
*/ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { return (EINVAL); } VCTLTOREQ(&vc, req); return (VFS_SYSCTL(mp, vc.vc_op, req)); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { 1, NULL, filt_vfsdetach, filt_vfsread }; static struct filterops vfswrite_filtops = { 1, NULL, filt_vfsdetach, filt_vfswrite }; static struct filterops vfsvnode_filtops = { 1, NULL, filt_vfsdetach, filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0, curthread); } static int vfs_knllocked(void *arg) { struct vnode *vp = arg; return (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE); } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; if (vp->v_pollinfo == NULL) v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. 
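 *
 * These filters back the EVFILT_READ / EVFILT_WRITE / EVFILT_VNODE
 * registrations made through vfs_kqfilter() above.  Minimal userland
 * sketch (illustrative only):
 *
 *	int kq = kqueue();
 *	struct kevent kev;
 *
 *	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 *	    NOTE_WRITE | NOTE_DELETE | NOTE_ATTRIB, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	register the knote
 *	kevent(kq, NULL, 0, &kev, 1, NULL);	wait for an event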
*/ if (hint == NOTE_REVOKE) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread)) return (0); kn->kn_data = va.va_size - kn->kn_fp->f_offset; return (kn->kn_data != 0); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; return (1); } return (kn->kn_fflags != 0); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VA_MARK_ATIME. This functionality is used by execve * and mmap, so we want to avoid the synchronous I/O implied by * directly setting va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct thread *td) { struct vattr atimeattr; if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) { VATTR_NULL(&atimeattr); atimeattr.va_vaflags |= VA_MARK_ATIME; (void)VOP_SETATTR(vp, &atimeattr, td->td_ucred, td); } } Index: head/sys/nfs4client/nfs4_vfsops.c =================================================================== --- head/sys/nfs4client/nfs4_vfsops.c (revision 154151) +++ head/sys/nfs4client/nfs4_vfsops.c (revision 154152) @@ -1,872 +1,873 @@ /* $FreeBSD$ */ /* $Id: nfs_vfsops.c,v 1.38 2003/11/05 14:59:01 rees Exp $ */ /*- * copyright (c) 2003 * the regents of the university of michigan * all rights reserved * * permission is granted to use, copy, create derivative works and redistribute * this software and such derivative works for any purpose, so long as the name * of the university of michigan is not used in any advertising or publicity * pertaining to the use or distribution of this software without specific, * written prior authorization. if the above copyright notice or any other * identification of the university of michigan is included in any copy of any * portion of this software, then the disclaimer below must also be included. * * this software is provided as is, without representation from the university * of michigan as to its fitness for any purpose, and without warranty by the * university of michigan of any kind, either express or implied, including * without limitation the implied warranties of merchantability and fitness for * a particular purpose. the regents of the university of michigan shall not be * liable for any damages, including special, indirect, incidental, or * consequential damages, with respect to any claim arising out of or in * connection with the use of the software, even if it has been or is hereafter * advised of the possibility of such damages. */ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. 
All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_vfsops.c 8.12 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_nfsroot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_vfs, OID_AUTO, nfs4, CTLFLAG_RW, 0, "NFS4 filesystem"); SYSCTL_STRUCT(_vfs_nfs4, NFS_NFSSTATS, nfsstats, CTLFLAG_RD, &nfsstats, nfsstats, "S,nfsstats"); static void nfs_decode_args(struct nfsmount *nmp, struct nfs_args *argp); static void nfs4_daemon(void *arg); static int mountnfs(struct nfs_args *, struct mount *, struct sockaddr *, char *, struct vnode **, struct ucred *cred); static int nfs4_do_setclientid(struct nfsmount *nmp, struct ucred *cred); static vfs_mount_t nfs_mount; static vfs_cmount_t nfs_cmount; static vfs_unmount_t nfs_unmount; static vfs_root_t nfs_root; static vfs_statfs_t nfs_statfs; static vfs_sync_t nfs_sync; /* * nfs vfs operations. */ static struct vfsops nfs_vfsops = { .vfs_init = nfs4_init, .vfs_mount = nfs_mount, .vfs_cmount = nfs_cmount, .vfs_root = nfs_root, .vfs_statfs = nfs_statfs, .vfs_sync = nfs_sync, .vfs_uninit = nfs4_uninit, .vfs_unmount = nfs_unmount, }; VFS_SET(nfs_vfsops, nfs4, VFCF_NETWORK); static struct nfs_rpcops nfs4_rpcops = { nfs4_readrpc, nfs4_writerpc, nfs4_writebp, nfs4_readlinkrpc, nfs4_invaldir, nfs4_commit, }; /* So that loader and kldload(2) can find us, wherever we are.. 
*/ MODULE_VERSION(nfs4, 1); void nfsargs_ntoh(struct nfs_args *); int nfs4_init(struct vfsconf *vfsp) { rpcclnt_init(); nfs4dev_init(); idmap_init(); nfsm_v4init(); return (0); } int nfs4_uninit(struct vfsconf *vfsp) { rpcclnt_uninit(); nfs4dev_uninit(); idmap_uninit(); return (0); } /* * nfs statfs call */ static int nfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { struct vnode *vp; struct nfs_statfs *sfp; caddr_t bpos, dpos; struct nfsmount *nmp = VFSTONFS(mp); int error = 0; struct mbuf *mreq, *mrep = NULL, *md, *mb; struct nfsnode *np; struct nfs4_compound cp; struct nfs4_oparg_getattr ga; struct nfsv4_fattr *fap = &ga.fa; #ifndef nolint sfp = NULL; #endif error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); if (error) return (error); vp = NFSTOV(np); nfsstats.rpccnt[NFSPROC_FSSTAT]++; mreq = nfsm_reqhead(vp, NFSV4PROC_COMPOUND, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); ga.bm = &nfsv4_fsattrbm; nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "statfs()"); nfsm_v4build_putfh(&cp, vp); nfsm_v4build_getattr(&cp, &ga); nfsm_v4build_finalize(&cp); nfsm_request(vp, NFSV4PROC_COMPOUND, td, td->td_ucred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putfh(&cp); nfsm_v4dissect_getattr(&cp, &ga); nfs4_vfsop_statfs(fap, sbp, mp); nfsmout: error = nfs_v4postop(&cp, error); vput(vp); if (mrep != NULL) m_freem(mrep); return (error); } static void nfs_decode_args(struct nfsmount *nmp, struct nfs_args *argp) { int s; int adjsock; int maxio; s = splnet(); /* * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes * no sense in that context. */ if (argp->sotype == SOCK_STREAM) nmp->nm_flag &= ~NFSMNT_NOCONN; nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* Re-bind if rsrvd port requested and wasn't on one */ adjsock = !(nmp->nm_flag & NFSMNT_RESVPORT) && (argp->flags & NFSMNT_RESVPORT); /* Also re-bind if we're switching to/from a connected UDP socket */ adjsock |= ((nmp->nm_flag & NFSMNT_NOCONN) != (argp->flags & NFSMNT_NOCONN)); /* Update flags atomically. Don't change the lock bits. 
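 *
 * Further down, user-supplied transfer sizes are sanity checked; a wsize
 * request is rounded down to a multiple of the NFS block size and then
 * clamped to maxio and MAXBSIZE.  For example (assuming NFS_FABLKSIZE is
 * 512, an assumption made here only for illustration):
 *
 *	wsize = 8500  ->  8500 & ~(NFS_FABLKSIZE - 1) = 8192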
*/ nmp->nm_flag = argp->flags | nmp->nm_flag; splx(s); if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; if (nmp->nm_timeo < NFS_MINTIMEO) nmp->nm_timeo = NFS_MINTIMEO; else if (nmp->nm_timeo > NFS_MAXTIMEO) nmp->nm_timeo = NFS_MAXTIMEO; } if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { nmp->nm_retry = argp->retrans; if (nmp->nm_retry > NFS_MAXREXMIT) nmp->nm_retry = NFS_MAXREXMIT; } if (argp->flags & NFSMNT_NFSV3) { if (argp->sotype == SOCK_DGRAM) maxio = NFS_MAXDGRAMDATA; else maxio = NFS_MAXDATA; } else maxio = NFS_V2MAXDATA; if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { nmp->nm_wsize = argp->wsize; /* Round down to multiple of blocksize */ nmp->nm_wsize &= ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize <= 0) nmp->nm_wsize = NFS_FABLKSIZE; } if (nmp->nm_wsize > maxio) nmp->nm_wsize = maxio; if (nmp->nm_wsize > MAXBSIZE) nmp->nm_wsize = MAXBSIZE; if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { nmp->nm_rsize = argp->rsize; /* Round down to multiple of blocksize */ nmp->nm_rsize &= ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize <= 0) nmp->nm_rsize = NFS_FABLKSIZE; } if (nmp->nm_rsize > maxio) nmp->nm_rsize = maxio; if (nmp->nm_rsize > MAXBSIZE) nmp->nm_rsize = MAXBSIZE; if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) { nmp->nm_readdirsize = argp->readdirsize; } if (nmp->nm_readdirsize > maxio) nmp->nm_readdirsize = maxio; if (nmp->nm_readdirsize > nmp->nm_rsize) nmp->nm_readdirsize = nmp->nm_rsize; if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0) nmp->nm_acregmin = argp->acregmin; else nmp->nm_acregmin = NFS_MINATTRTIMO; if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0) nmp->nm_acregmax = argp->acregmax; else nmp->nm_acregmax = NFS_MAXATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0) nmp->nm_acdirmin = argp->acdirmin; else nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0) nmp->nm_acdirmax = argp->acdirmax; else nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; if (nmp->nm_acdirmin > nmp->nm_acdirmax) nmp->nm_acdirmin = nmp->nm_acdirmax; if (nmp->nm_acregmin > nmp->nm_acregmax) nmp->nm_acregmin = nmp->nm_acregmax; if ((argp->flags & NFSMNT_MAXGRPS) && argp->maxgrouplist >= 0) { if (argp->maxgrouplist <= NFS_MAXGRPS) nmp->nm_numgrps = argp->maxgrouplist; else nmp->nm_numgrps = NFS_MAXGRPS; } if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0) { if (argp->readahead <= NFS_MAXRAHEAD) nmp->nm_readahead = argp->readahead; else nmp->nm_readahead = NFS_MAXRAHEAD; } if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) { if (argp->deadthresh <= NFS_MAXDEADTHRESH) nmp->nm_deadthresh = argp->deadthresh; else nmp->nm_deadthresh = NFS_MAXDEADTHRESH; } adjsock |= ((nmp->nm_sotype != argp->sotype) || (nmp->nm_soproto != argp->proto)); nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; if (nmp->nm_rpcclnt.rc_so && adjsock) { nfs_safedisconnect(nmp); if (nmp->nm_sotype == SOCK_DGRAM) { while (nfs4_connect(nmp)) { printf("nfs_args: retrying connect\n"); (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); } } } } /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the sockargs() call because sockargs() allocates an mbuf and * an error after that means that I have to release the mbuf. 
*/ /* ARGSUSED */ static int nfs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { struct nfs_args args; int error; error = copyin(data, (caddr_t)&args, sizeof (struct nfs_args)); if (error) return (error); ma = mount_arg(ma, "nfs_args", &args, sizeof args); error = kernel_mount(ma, flags); return (error); } static int nfs_mount(struct mount *mp, struct thread *td) { int error; struct nfs_args args; struct sockaddr *nam; struct vnode *vp; char hst[MNAMELEN]; size_t len; if (mp->mnt_flag & MNT_ROOTFS) { printf("NFSv4: nfs_mountroot not supported\n"); return EINVAL; } error = vfs_copyopt(mp->mnt_optnew, "nfs_args", &args, sizeof args); if (error) return (error); if (args.version != NFS_ARGSVERSION) return (EPROGMISMATCH); if (mp->mnt_flag & MNT_UPDATE) { struct nfsmount *nmp = VFSTONFS(mp); if (nmp == NULL) return (EIO); /* * When doing an update, we can't change from or to * v3, switch lockd strategies or change cookie translation */ args.flags = (args.flags & ~(NFSMNT_NFSV3 | NFSMNT_NFSV4 | NFSMNT_NOLOCKD)) | (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4 | NFSMNT_NOLOCKD)); nfs_decode_args(nmp, &args); return (0); } error = copyinstr(args.hostname, hst, MNAMELEN-1, &len); if (error) return (error); bzero(&hst[len], MNAMELEN - len); /* sockargs() call must be after above copyin() calls */ error = getsockaddr(&nam, (caddr_t)args.addr, args.addrlen); if (error) return (error); error = mountnfs(&args, mp, nam, hst, &vp, td->td_ucred); return (error); } /* * renew should be done async * should re-scan mount queue each time */ struct proc *nfs4_daemonproc; static int nfs4_do_renew(struct nfsmount *nmp, struct ucred *cred) { struct nfs4_compound cp; struct mbuf *mreq, *mrep = NULL, *md, *mb; caddr_t bpos, dpos; int error; mreq = nfsm_reqhead(NULL, NFSV4PROC_COMPOUND, sizeof(uint64_t)); mb = mreq; bpos = mtod(mb, caddr_t); nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_do_renew()"); nfsm_v4build_renew(&cp, nmp->nm_clientid); nfsm_v4build_finalize(&cp); nfsm_request_mnt(nmp, NFSV4PROC_COMPOUND, curthread, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_renew(&cp); nmp->nm_last_renewal = time_second; return (0); nfsmout: error = nfs_v4postop(&cp, error); /* XXX */ if (mrep != NULL) m_freem(mrep); return (error); } static void nfs4_daemon(void *arg) { struct mount *mp; struct nfsmount *nmp; int nmounts; while (1) { nmounts = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (strcmp(mp->mnt_vfc->vfc_name, "nfs4") != 0) continue; nmounts++; nmp = VFSTONFS(mp); if (time_second < nmp->nm_last_renewal + nmp->nm_lease_time - 4) continue; mtx_unlock(&mountlist_mtx); mtx_lock(&Giant); nfs4_do_renew(nmp, (struct ucred *) arg); mtx_unlock(&Giant); mtx_lock(&mountlist_mtx); } mtx_unlock(&mountlist_mtx); /* Must kill the daemon here, or module unload will cause a panic */ if (nmounts == 0) { mtx_lock(&Giant); nfs4_daemonproc = NULL; mtx_unlock(&Giant); /*printf("nfsv4 renewd exiting\n");*/ kthread_exit(0); } tsleep(&nfs4_daemonproc, PVFS, "nfs4", 2 * hz); } } /* * Common code for mount and mountroot */ static int mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, char *hst, struct vnode **vpp, struct ucred *cred) { struct nfsmount *nmp; char *rpth, *cp1, *cp2; int nlkup = 0, error; struct nfs4_compound cp; struct mbuf *mreq, *mrep = NULL, *md, *mb; caddr_t bpos, dpos; struct nfs4_oparg_lookup lkup; struct nfs4_oparg_getfh gfh; struct nfs4_oparg_getattr ga; struct thread *td = curthread; /* XXX */ if 
(mp->mnt_flag & MNT_UPDATE) { nmp = VFSTONFS(mp); /* update paths, file handles, etc, here XXX */ FREE(nam, M_SONAME); return (0); } else { nmp = uma_zalloc(nfsmount_zone, M_WAITOK); bzero((caddr_t)nmp, sizeof (struct nfsmount)); TAILQ_INIT(&nmp->nm_bufq); mp->mnt_data = (qaddr_t)nmp; } vfs_getnewfsid(mp); nmp->nm_mountp = mp; nmp->nm_maxfilesize = 0xffffffffLL; nmp->nm_timeo = NFS_TIMEO; nmp->nm_retry = NFS_RETRANS; nmp->nm_wsize = NFS_WSIZE; nmp->nm_rsize = NFS_RSIZE; nmp->nm_readdirsize = NFS_READDIRSIZE; nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_readahead = NFS_DEFRAHEAD; nmp->nm_deadthresh = NFS_MAXDEADTHRESH; vfs_mountedfrom(mp, hst); nmp->nm_nam = nam; /* Set up the sockets and per-host congestion */ nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; nmp->nm_rpcops = &nfs4_rpcops; /* XXX */ mp->mnt_stat.f_iosize = PAGE_SIZE; argp->flags |= (NFSMNT_NFSV3 | NFSMNT_NFSV4); nfs_decode_args(nmp, argp); if ((error = nfs4_connect(nmp))) goto bad; mreq = nfsm_reqhead(NULL, NFSV4PROC_COMPOUND, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); ga.bm = &nfsv4_fsinfobm; nfs_v4initcompound(&cp); /* Get remote path */ rpth = hst; strsep(&rpth, ":"); nfsm_v4build_compound(&cp, "mountnfs()"); nfsm_v4build_putrootfh(&cp); for (cp1 = rpth; cp1 && *cp1; cp1 = cp2) { while (*cp1 == '/') cp1++; if (!*cp1) break; for (cp2 = cp1; *cp2 && *cp2 != '/'; cp2++) ; lkup.name = cp1; lkup.namelen = cp2 - cp1; nfsm_v4build_lookup(&cp, &lkup); nlkup++; } nfsm_v4build_getfh(&cp, &gfh); nfsm_v4build_getattr(&cp, &ga); nfsm_v4build_finalize(&cp); nfsm_request_mnt(nmp, NFSV4PROC_COMPOUND, td, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_putrootfh(&cp); while (nlkup--) nfsm_v4dissect_lookup(&cp); nfsm_v4dissect_getfh(&cp, &gfh); nfsm_v4dissect_getattr(&cp, &ga); nfs4_vfsop_fsinfo(&ga.fa, nmp); nmp->nm_state |= NFSSTA_GOTFSINFO; /* Copy root fh into nfsmount. */ nmp->nm_fhsize = gfh.fh_len; bcopy(&gfh.fh_val, nmp->nm_fh, nmp->nm_fhsize); nmp->nm_last_renewal = time_second; if ((error = nfs4_do_setclientid(nmp, cred)) != 0) goto nfsmout; /* Start renewd if it isn't already running */ if (nfs4_daemonproc == NULL) kthread_create(nfs4_daemon, crdup(cred), &nfs4_daemonproc, (RFPROC|RFMEM), 0, "nfs4rd"); return (0); nfsmout: error = nfs_v4postop(&cp, error); /* XXX */ if (mrep != NULL) m_freem(mrep); bad: nfs_disconnect(nmp); uma_zfree(nfsmount_zone, nmp); FREE(nam, M_SONAME); return (error); } /* * unmount system call */ static int nfs_unmount(struct mount *mp, int mntflags, struct thread *td) { struct nfsmount *nmp; int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; nmp = VFSTONFS(mp); /* * Goes something like this.. * - Call vflush(, td) to clear out vnodes for this filesystem * - Close the socket * - Free up the data structures */ /* In the forced case, cancel any outstanding requests. */ if (flags & FORCECLOSE) { error = nfs_nmcancelreqs(nmp); if (error) return (error); nfs4dev_purge(); } error = vflush(mp, 0, flags, td); if (error) return (error); /* * We are now committed to the unmount. 
*/ nfs_disconnect(nmp); FREE(nmp->nm_nam, M_SONAME); /* XXX there's a race condition here for SMP */ wakeup(&nfs4_daemonproc); uma_zfree(nfsmount_zone, nmp); return (0); } /* * Return root of a filesystem */ static int nfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) { struct vnode *vp; struct nfsmount *nmp; struct nfsnode *np; int error; nmp = VFSTONFS(mp); error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); if (error) return (error); vp = NFSTOV(np); if (vp->v_type == VNON) vp->v_type = VDIR; vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } /* * Flush out the buffer cache */ /* ARGSUSED */ static int nfs_sync(struct mount *mp, int waitfor, struct thread *td) { - struct vnode *vp, *nvp; + struct vnode *vp, *mvp; int error, allerror = 0; /* * Force stale buffer cache information to be flushed. */ MNT_ILOCK(mp); loop: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (VOP_ISLOCKED(vp, NULL) || vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (allerror); } static int nfs4_do_setclientid(struct nfsmount *nmp, struct ucred *cred) { struct nfs4_oparg_setclientid scid; struct nfs4_compound cp; struct mbuf *mreq, *mrep = NULL, *md, *mb; caddr_t bpos, dpos; struct route ro; char *ipsrc = NULL, uaddr[24], name[24]; int try = 0; static unsigned long seq; int error; #ifndef NFS4_USE_RPCCLNT return (0); #endif if (nmp->nm_clientid) { printf("nfs4_do_setclientid: already have clientid!\n"); error = 0; goto nfsmout; } /* Try not to re-use clientids */ if (seq == 0) seq = time_second; #ifdef NFS4_USE_RPCCLNT scid.cb_netid = (nmp->nm_rpcclnt.rc_sotype == SOCK_STREAM) ? 
"tcp" : "udp"; #endif scid.cb_netid = "tcp"; scid.cb_netidlen = 3; scid.cb_prog = 0x1234; /* XXX */ /* Do a route lookup to find our source address for talking to this server */ bzero(&ro, sizeof ro); #ifdef NFS4_USE_RPCCLNT ro.ro_dst = *nmp->nm_rpcclnt.rc_name; #endif rtalloc(&ro); if (ro.ro_rt == NULL) { error = EHOSTUNREACH; goto nfsmout; } ipsrc = inet_ntoa(IA_SIN(ifatoia(ro.ro_rt->rt_ifa))->sin_addr); sprintf(uaddr, "%s.12.48", ipsrc); scid.cb_univaddr = uaddr; scid.cb_univaddrlen = strlen(uaddr); RTFREE(ro.ro_rt); try_again: sprintf(name, "%s-%d", ipsrc, (int) ((seq + try) % 1000000L)); scid.namelen = strlen(name); scid.name = name; nfs_v4initcompound(&cp); mreq = nfsm_reqhead(NULL, NFSV4PROC_COMPOUND, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_v4build_compound(&cp, "nfs4_do_setclientid()"); nfsm_v4build_setclientid(&cp, &scid); nfsm_v4build_finalize(&cp); nfsm_request_mnt(nmp, NFSV4PROC_COMPOUND, curthread, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_setclientid(&cp, &scid); nmp->nm_clientid = scid.clientid; error = nfs_v4postop(&cp, error); /* Confirm */ m_freem(mrep); mreq = nfsm_reqhead(NULL, NFSV4PROC_COMPOUND, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); nfs_v4initcompound(&cp); nfsm_v4build_compound(&cp, "nfs4_do_setclientid() (confirm)"); nfsm_v4build_setclientid_confirm(&cp, &scid); nfsm_v4build_finalize(&cp); nfsm_request_mnt(nmp, NFSV4PROC_COMPOUND, curthread, cred); if (error != 0) goto nfsmout; nfsm_v4dissect_compound(&cp); nfsm_v4dissect_setclientid_confirm(&cp); nfsmout: error = nfs_v4postop(&cp, error); if (mrep) m_freem(mrep); if (error == NFSERR_CLID_INUSE && (++try < NFS4_SETCLIENTID_MAXTRIES)) goto try_again; return (error); } Index: head/sys/nfsclient/nfs_vfsops.c =================================================================== --- head/sys/nfsclient/nfs_vfsops.c (revision 154151) +++ head/sys/nfsclient/nfs_vfsops.c (revision 154152) @@ -1,1079 +1,1080 @@ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_vfsops.c 8.12 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_nfsroot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_NFSREQ, "nfsclient_req", "NFS request header"); MALLOC_DEFINE(M_NFSBIGFH, "nfsclient_bigfh", "NFS version 3 file handle"); MALLOC_DEFINE(M_NFSDIROFF, "nfsclient_diroff", "NFS directory offset data"); MALLOC_DEFINE(M_NFSHASH, "nfsclient_hash", "NFS hash tables"); MALLOC_DEFINE(M_NFSDIRECTIO, "nfsclient_directio", "NFS Direct IO async write state"); uma_zone_t nfsmount_zone; struct nfsstats nfsstats; SYSCTL_NODE(_vfs, OID_AUTO, nfs, CTLFLAG_RW, 0, "NFS filesystem"); SYSCTL_STRUCT(_vfs_nfs, NFS_NFSSTATS, nfsstats, CTLFLAG_RD, &nfsstats, nfsstats, "S,nfsstats"); static int nfs_ip_paranoia = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_ip_paranoia, CTLFLAG_RW, &nfs_ip_paranoia, 0, ""); #ifdef NFS_DEBUG int nfs_debug; SYSCTL_INT(_vfs_nfs, OID_AUTO, debug, CTLFLAG_RW, &nfs_debug, 0, ""); #endif static int nfs_tprintf_initial_delay = NFS_TPRINTF_INITIAL_DELAY; SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_INITIAL_DELAY, downdelayinitial, CTLFLAG_RW, &nfs_tprintf_initial_delay, 0, ""); /* how long between console messages "nfs server foo not responding" */ static int nfs_tprintf_delay = NFS_TPRINTF_DELAY; SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_DELAY, downdelayinterval, CTLFLAG_RW, &nfs_tprintf_delay, 0, ""); static int nfs_iosize(struct nfsmount *nmp); static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp); static int mountnfs(struct nfs_args *, struct mount *, struct sockaddr *, char *, struct vnode **, struct ucred *cred); static vfs_mount_t nfs_mount; static vfs_cmount_t nfs_cmount; static vfs_unmount_t nfs_unmount; static vfs_root_t nfs_root; static vfs_statfs_t nfs_statfs; static vfs_sync_t nfs_sync; static vfs_sysctl_t nfs_sysctl; /* * nfs vfs operations. */ static struct vfsops nfs_vfsops = { .vfs_init = nfs_init, .vfs_mount = nfs_mount, .vfs_cmount = nfs_cmount, .vfs_root = nfs_root, .vfs_statfs = nfs_statfs, .vfs_sync = nfs_sync, .vfs_uninit = nfs_uninit, .vfs_unmount = nfs_unmount, .vfs_sysctl = nfs_sysctl, }; VFS_SET(nfs_vfsops, nfs, VFCF_NETWORK); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_VERSION(nfs, 1); static struct nfs_rpcops nfs_rpcops = { nfs_readrpc, nfs_writerpc, nfs_writebp, nfs_readlinkrpc, nfs_invaldir, nfs_commit, }; /* * This structure must be filled in by a primary bootstrap or bootstrap * server for a diskless/dataless machine. It is initialized below just * to ensure that it is allocated to initialized data (.data not .bss). 
*/ struct nfs_diskless nfs_diskless = { { { 0 } } }; struct nfsv3_diskless nfsv3_diskless = { { { 0 } } }; int nfs_diskless_valid = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, diskless_valid, CTLFLAG_RD, &nfs_diskless_valid, 0, ""); SYSCTL_STRING(_vfs_nfs, OID_AUTO, diskless_rootpath, CTLFLAG_RD, nfsv3_diskless.root_hostnam, 0, ""); SYSCTL_OPAQUE(_vfs_nfs, OID_AUTO, diskless_rootaddr, CTLFLAG_RD, &nfsv3_diskless.root_saddr, sizeof nfsv3_diskless.root_saddr, "%Ssockaddr_in", ""); void nfsargs_ntoh(struct nfs_args *); static int nfs_mountdiskless(char *, int, struct sockaddr_in *, struct nfs_args *, struct thread *, struct vnode **, struct mount *); static void nfs_convert_diskless(void); static void nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs); static int nfs_iosize(struct nfsmount *nmp) { int iosize; /* * Calculate the size used for io buffers. Use the larger * of the two sizes to minimise nfs requests but make sure * that it is at least one VM page to avoid wasting buffer * space. */ iosize = max(nmp->nm_rsize, nmp->nm_wsize); if (iosize < PAGE_SIZE) iosize = PAGE_SIZE; return iosize; } static void nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs) { args->version = NFS_ARGSVERSION; args->addr = oargs->addr; args->addrlen = oargs->addrlen; args->sotype = oargs->sotype; args->proto = oargs->proto; args->fh = oargs->fh; args->fhsize = oargs->fhsize; args->flags = oargs->flags; args->wsize = oargs->wsize; args->rsize = oargs->rsize; args->readdirsize = oargs->readdirsize; args->timeo = oargs->timeo; args->retrans = oargs->retrans; args->maxgrouplist = oargs->maxgrouplist; args->readahead = oargs->readahead; args->deadthresh = oargs->deadthresh; args->hostname = oargs->hostname; } static void nfs_convert_diskless(void) { bcopy(&nfs_diskless.myif, &nfsv3_diskless.myif, sizeof(struct ifaliasreq)); bcopy(&nfs_diskless.mygateway, &nfsv3_diskless.mygateway, sizeof(struct sockaddr_in)); nfs_convert_oargs(&nfsv3_diskless.root_args,&nfs_diskless.root_args); nfsv3_diskless.root_fhsize = NFSX_V2FH; bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V2FH); bcopy(&nfs_diskless.root_saddr,&nfsv3_diskless.root_saddr, sizeof(struct sockaddr_in)); bcopy(nfs_diskless.root_hostnam, nfsv3_diskless.root_hostnam, MNAMELEN); nfsv3_diskless.root_time = nfs_diskless.root_time; bcopy(nfs_diskless.my_hostnam, nfsv3_diskless.my_hostnam, MAXHOSTNAMELEN); nfs_diskless_valid = 3; } /* * nfs statfs call */ static int nfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { struct vnode *vp; struct nfs_statfs *sfp; caddr_t bpos, dpos; struct nfsmount *nmp = VFSTONFS(mp); int error = 0, v3 = (nmp->nm_flag & NFSMNT_NFSV3), retattr; struct mbuf *mreq, *mrep, *md, *mb; struct nfsnode *np; u_quad_t tquad; #ifndef nolint sfp = NULL; #endif error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); if (error) return (error); vp = NFSTOV(np); if (v3 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) (void)nfs_fsinfo(nmp, vp, td->td_ucred, td); nfsstats.rpccnt[NFSPROC_FSSTAT]++; mreq = nfsm_reqhead(vp, NFSPROC_FSSTAT, NFSX_FH(v3)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, v3); nfsm_request(vp, NFSPROC_FSSTAT, td, td->td_ucred); if (v3) nfsm_postop_attr(vp, retattr); if (error) { if (mrep != NULL) m_freem(mrep); goto nfsmout; } sfp = nfsm_dissect(struct nfs_statfs *, NFSX_STATFS(v3)); sbp->f_iosize = nfs_iosize(nmp); if (v3) { sbp->f_bsize = NFS_FABLKSIZE; tquad = fxdr_hyper(&sfp->sf_tbytes); sbp->f_blocks = tquad / NFS_FABLKSIZE; tquad = fxdr_hyper(&sfp->sf_fbytes); 
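/*
 * sf_fbytes and sf_abytes, like sf_tbytes above, are byte counts in the
 * v3 FSSTAT reply; they are scaled down to NFS_FABLKSIZE units here so
 * they fit the block-sized fields of struct statfs.
 */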
sbp->f_bfree = tquad / NFS_FABLKSIZE; tquad = fxdr_hyper(&sfp->sf_abytes); sbp->f_bavail = tquad / NFS_FABLKSIZE; sbp->f_files = (fxdr_unsigned(int32_t, sfp->sf_tfiles.nfsuquad[1]) & 0x7fffffff); sbp->f_ffree = (fxdr_unsigned(int32_t, sfp->sf_ffiles.nfsuquad[1]) & 0x7fffffff); } else { sbp->f_bsize = fxdr_unsigned(int32_t, sfp->sf_bsize); sbp->f_blocks = fxdr_unsigned(int32_t, sfp->sf_blocks); sbp->f_bfree = fxdr_unsigned(int32_t, sfp->sf_bfree); sbp->f_bavail = fxdr_unsigned(int32_t, sfp->sf_bavail); sbp->f_files = 0; sbp->f_ffree = 0; } m_freem(mrep); nfsmout: vput(vp); return (error); } /* * nfs version 3 fsinfo rpc call */ int nfs_fsinfo(struct nfsmount *nmp, struct vnode *vp, struct ucred *cred, struct thread *td) { struct nfsv3_fsinfo *fsp; u_int32_t pref, max; caddr_t bpos, dpos; int error = 0, retattr; struct mbuf *mreq, *mrep, *md, *mb; u_int64_t maxfsize; nfsstats.rpccnt[NFSPROC_FSINFO]++; mreq = nfsm_reqhead(vp, NFSPROC_FSINFO, NFSX_FH(1)); mb = mreq; bpos = mtod(mb, caddr_t); nfsm_fhtom(vp, 1); nfsm_request(vp, NFSPROC_FSINFO, td, cred); nfsm_postop_attr(vp, retattr); if (!error) { fsp = nfsm_dissect(struct nfsv3_fsinfo *, NFSX_V3FSINFO); pref = fxdr_unsigned(u_int32_t, fsp->fs_wtpref); if (pref < nmp->nm_wsize && pref >= NFS_FABLKSIZE) nmp->nm_wsize = (pref + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); max = fxdr_unsigned(u_int32_t, fsp->fs_wtmax); if (max < nmp->nm_wsize && max > 0) { nmp->nm_wsize = max & ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize == 0) nmp->nm_wsize = max; } pref = fxdr_unsigned(u_int32_t, fsp->fs_rtpref); if (pref < nmp->nm_rsize && pref >= NFS_FABLKSIZE) nmp->nm_rsize = (pref + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); max = fxdr_unsigned(u_int32_t, fsp->fs_rtmax); if (max < nmp->nm_rsize && max > 0) { nmp->nm_rsize = max & ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize == 0) nmp->nm_rsize = max; } pref = fxdr_unsigned(u_int32_t, fsp->fs_dtpref); if (pref < nmp->nm_readdirsize && pref >= NFS_DIRBLKSIZ) nmp->nm_readdirsize = (pref + NFS_DIRBLKSIZ - 1) & ~(NFS_DIRBLKSIZ - 1); if (max < nmp->nm_readdirsize && max > 0) { nmp->nm_readdirsize = max & ~(NFS_DIRBLKSIZ - 1); if (nmp->nm_readdirsize == 0) nmp->nm_readdirsize = max; } maxfsize = fxdr_hyper(&fsp->fs_maxfilesize); if (maxfsize > 0 && maxfsize < nmp->nm_maxfilesize) nmp->nm_maxfilesize = maxfsize; nmp->nm_mountp->mnt_stat.f_iosize = nfs_iosize(nmp); nmp->nm_state |= NFSSTA_GOTFSINFO; } m_freem(mrep); nfsmout: return (error); } /* * Mount a remote root fs via. nfs. This depends on the info in the * nfs_diskless structure that has been filled in properly by some primary * bootstrap. * It goes something like this: * - do enough of "ifconfig" by calling ifioctl() so that the system * can talk to the server * - If nfs_diskless.mygateway is filled in, use that address as * a default gateway. * - build the rootfs mount point and call mountnfs() to do the rest. * * It is assumed to be safe to read, modify, and write the nfsv3_diskless * structure, as well as other global NFS client variables here, as * nfs_mountroot() will be called once in the boot before any other NFS * client activity occurs. 
*/ int nfs_mountroot(struct mount *mp, struct thread *td) { struct nfsv3_diskless *nd = &nfsv3_diskless; struct socket *so; struct vnode *vp; int error, i; u_long l; char buf[128]; NET_ASSERT_GIANT(); #if defined(BOOTP_NFSROOT) && defined(BOOTP) bootpc_init(); /* use bootp to get nfs_diskless filled in */ #elif defined(NFS_ROOT) nfs_setup_diskless(); #endif if (nfs_diskless_valid == 0) return (-1); if (nfs_diskless_valid == 1) nfs_convert_diskless(); /* * XXX splnet, so networks will receive... */ splnet(); /* * Do enough of ifconfig(8) so that the critical net interface can * talk to the server. */ error = socreate(nd->myif.ifra_addr.sa_family, &so, SOCK_DGRAM, 0, td->td_ucred, td); if (error) panic("nfs_mountroot: socreate(%04x): %d", nd->myif.ifra_addr.sa_family, error); #if 0 /* XXX Bad idea */ /* * We might not have been told the right interface, so we pass * over the first ten interfaces of the same kind, until we get * one of them configured. */ for (i = strlen(nd->myif.ifra_name) - 1; nd->myif.ifra_name[i] >= '0' && nd->myif.ifra_name[i] <= '9'; nd->myif.ifra_name[i] ++) { error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td); if(!error) break; } #endif error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td); if (error) panic("nfs_mountroot: SIOCAIFADDR: %d", error); soclose(so); /* * If the gateway field is filled in, set it as the default route. * Note that pxeboot will set a default route of 0 if the route * is not set by the DHCP server. Check also for a value of 0 * to avoid panicking inappropriately in that situation. */ if (nd->mygateway.sin_len != 0 && nd->mygateway.sin_addr.s_addr != 0) { struct sockaddr_in mask, sin; bzero((caddr_t)&mask, sizeof(mask)); sin = mask; sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); error = rtrequest(RTM_ADD, (struct sockaddr *)&sin, (struct sockaddr *)&nd->mygateway, (struct sockaddr *)&mask, RTF_UP | RTF_GATEWAY, NULL); if (error) panic("nfs_mountroot: RTM_ADD: %d", error); } /* * Create the rootfs mount point. */ nd->root_args.fh = nd->root_fh; nd->root_args.fhsize = nd->root_fhsize; l = ntohl(nd->root_saddr.sin_addr.s_addr); snprintf(buf, sizeof(buf), "%ld.%ld.%ld.%ld:%s", (l >> 24) & 0xff, (l >> 16) & 0xff, (l >> 8) & 0xff, (l >> 0) & 0xff, nd->root_hostnam); printf("NFS ROOT: %s\n", buf); if ((error = nfs_mountdiskless(buf, MNT_RDONLY, &nd->root_saddr, &nd->root_args, td, &vp, mp)) != 0) { return (error); } /* * This is not really an nfs issue, but it is much easier to * set hostname here and then let the "/etc/rc.xxx" files * mount the right /var based upon its preset value. */ bcopy(nd->my_hostnam, hostname, MAXHOSTNAMELEN); hostname[MAXHOSTNAMELEN - 1] = '\0'; for (i = 0; i < MAXHOSTNAMELEN; i++) if (hostname[i] == '\0') break; inittodr(ntohl(nd->root_time)); return (0); } /* * Internal version of mount system call for diskless setup. */ static int nfs_mountdiskless(char *path, int mountflag, struct sockaddr_in *sin, struct nfs_args *args, struct thread *td, struct vnode **vpp, struct mount *mp) { struct sockaddr *nam; int error; mp->mnt_kern_flag = 0; mp->mnt_flag = mountflag; nam = sodupsockaddr((struct sockaddr *)sin, M_WAITOK); if ((error = mountnfs(args, mp, nam, path, vpp, td->td_ucred)) != 0) { printf("nfs_mountroot: mount %s on /: %d\n", path, error); return (error); } return (0); } static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp) { int s; int adjsock; int maxio; s = splnet(); /* * Set read-only flag if requested; otherwise, clear it if this is * an update. 
If this is not an update, then either the read-only * flag is already clear, or this is a root mount and it was set * intentionally at some previous point. */ if (vfs_getopt(mp->mnt_optnew, "ro", NULL, NULL) == 0) mp->mnt_flag |= MNT_RDONLY; else if (mp->mnt_flag & MNT_UPDATE) mp->mnt_flag &= ~MNT_RDONLY; /* * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes * no sense in that context. */ if (argp->sotype == SOCK_STREAM) nmp->nm_flag &= ~NFSMNT_NOCONN; /* Also clear RDIRPLUS if not NFSv3, it crashes some servers */ if ((argp->flags & NFSMNT_NFSV3) == 0) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* Re-bind if rsrvd port requested and wasn't on one */ adjsock = !(nmp->nm_flag & NFSMNT_RESVPORT) && (argp->flags & NFSMNT_RESVPORT); /* Also re-bind if we're switching to/from a connected UDP socket */ adjsock |= ((nmp->nm_flag & NFSMNT_NOCONN) != (argp->flags & NFSMNT_NOCONN)); /* Update flags atomically. Don't change the lock bits. */ nmp->nm_flag = argp->flags | nmp->nm_flag; splx(s); if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; if (nmp->nm_timeo < NFS_MINTIMEO) nmp->nm_timeo = NFS_MINTIMEO; else if (nmp->nm_timeo > NFS_MAXTIMEO) nmp->nm_timeo = NFS_MAXTIMEO; } if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { nmp->nm_retry = argp->retrans; if (nmp->nm_retry > NFS_MAXREXMIT) nmp->nm_retry = NFS_MAXREXMIT; } if (argp->flags & NFSMNT_NFSV3) { if (argp->sotype == SOCK_DGRAM) maxio = NFS_MAXDGRAMDATA; else maxio = NFS_MAXDATA; } else maxio = NFS_V2MAXDATA; if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { nmp->nm_wsize = argp->wsize; /* Round down to multiple of blocksize */ nmp->nm_wsize &= ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize <= 0) nmp->nm_wsize = NFS_FABLKSIZE; } if (nmp->nm_wsize > maxio) nmp->nm_wsize = maxio; if (nmp->nm_wsize > MAXBSIZE) nmp->nm_wsize = MAXBSIZE; if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { nmp->nm_rsize = argp->rsize; /* Round down to multiple of blocksize */ nmp->nm_rsize &= ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize <= 0) nmp->nm_rsize = NFS_FABLKSIZE; } if (nmp->nm_rsize > maxio) nmp->nm_rsize = maxio; if (nmp->nm_rsize > MAXBSIZE) nmp->nm_rsize = MAXBSIZE; if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) { nmp->nm_readdirsize = argp->readdirsize; } if (nmp->nm_readdirsize > maxio) nmp->nm_readdirsize = maxio; if (nmp->nm_readdirsize > nmp->nm_rsize) nmp->nm_readdirsize = nmp->nm_rsize; if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0) nmp->nm_acregmin = argp->acregmin; else nmp->nm_acregmin = NFS_MINATTRTIMO; if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0) nmp->nm_acregmax = argp->acregmax; else nmp->nm_acregmax = NFS_MAXATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0) nmp->nm_acdirmin = argp->acdirmin; else nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0) nmp->nm_acdirmax = argp->acdirmax; else nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; if (nmp->nm_acdirmin > nmp->nm_acdirmax) nmp->nm_acdirmin = nmp->nm_acdirmax; if (nmp->nm_acregmin > nmp->nm_acregmax) nmp->nm_acregmin = nmp->nm_acregmax; if ((argp->flags & NFSMNT_MAXGRPS) && argp->maxgrouplist >= 0) { if (argp->maxgrouplist <= NFS_MAXGRPS) nmp->nm_numgrps = argp->maxgrouplist; else nmp->nm_numgrps = NFS_MAXGRPS; } if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0) { if (argp->readahead <= NFS_MAXRAHEAD) nmp->nm_readahead = argp->readahead; else nmp->nm_readahead = NFS_MAXRAHEAD; } if ((argp->flags & NFSMNT_WCOMMITSIZE) 
&& argp->wcommitsize >= 0) { if (argp->wcommitsize < nmp->nm_wsize) nmp->nm_wcommitsize = nmp->nm_wsize; else nmp->nm_wcommitsize = argp->wcommitsize; } if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) { if (argp->deadthresh <= NFS_MAXDEADTHRESH) nmp->nm_deadthresh = argp->deadthresh; else nmp->nm_deadthresh = NFS_MAXDEADTHRESH; } adjsock |= ((nmp->nm_sotype != argp->sotype) || (nmp->nm_soproto != argp->proto)); nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; if (nmp->nm_so && adjsock) { nfs_safedisconnect(nmp); if (nmp->nm_sotype == SOCK_DGRAM) while (nfs_connect(nmp, NULL)) { printf("nfs_args: retrying connect\n"); (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); } } } static const char *nfs_opts[] = { "from", "nfs_args", NULL }; /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the sockargs() call because sockargs() allocates an mbuf and * an error after that means that I have to release the mbuf. */ /* ARGSUSED */ static int nfs_mount(struct mount *mp, struct thread *td) { int error; struct nfs_args args; struct sockaddr *nam; struct vnode *vp; char hst[MNAMELEN]; size_t len; u_char nfh[NFSX_V3FHMAX]; if (vfs_filteropt(mp->mnt_optnew, nfs_opts)) return (EINVAL); if (mp->mnt_flag & MNT_ROOTFS) return (nfs_mountroot(mp, td)); error = vfs_copyopt(mp->mnt_optnew, "nfs_args", &args, sizeof args); if (error) return (error); if (args.version != NFS_ARGSVERSION) { return (EPROGMISMATCH); } if (mp->mnt_flag & MNT_UPDATE) { struct nfsmount *nmp = VFSTONFS(mp); if (nmp == NULL) return (EIO); /* * When doing an update, we can't change from or to * v3, switch lockd strategies or change cookie translation */ args.flags = (args.flags & ~(NFSMNT_NFSV3 | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)) | (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)); nfs_decode_args(mp, nmp, &args); return (0); } /* * Make the nfs_ip_paranoia sysctl serve as the default connection * or no-connection mode for those protocols that support * no-connection mode (the flag will be cleared later for protocols * that do not support no-connection mode). This will allow a client * to receive replies from a different IP then the request was * sent to. Note: default value for nfs_ip_paranoia is 1 (paranoid), * not 0. */ if (nfs_ip_paranoia == 0) args.flags |= NFSMNT_NOCONN; if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX) return (EINVAL); error = copyin((caddr_t)args.fh, (caddr_t)nfh, args.fhsize); if (error) return (error); error = copyinstr(args.hostname, hst, MNAMELEN-1, &len); if (error) return (error); bzero(&hst[len], MNAMELEN - len); /* sockargs() call must be after above copyin() calls */ error = getsockaddr(&nam, (caddr_t)args.addr, args.addrlen); if (error) return (error); args.fh = nfh; error = mountnfs(&args, mp, nam, hst, &vp, td->td_ucred); return (error); } /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the sockargs() call because sockargs() allocates an mbuf and * an error after that means that I have to release the mbuf. 
*/ /* ARGSUSED */ static int nfs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { int error; struct nfs_args args; error = copyin(data, &args, sizeof (struct nfs_args)); if (error) return (error); ma = mount_arg(ma, "nfs_args", &args, sizeof args); error = kernel_mount(ma, flags); return (error); } /* * Common code for mount and mountroot */ static int mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, char *hst, struct vnode **vpp, struct ucred *cred) { struct nfsmount *nmp; struct nfsnode *np; int error; struct vattr attrs; if (mp->mnt_flag & MNT_UPDATE) { nmp = VFSTONFS(mp); /* update paths, file handles, etc, here XXX */ FREE(nam, M_SONAME); return (0); } else { nmp = uma_zalloc(nfsmount_zone, M_WAITOK); bzero((caddr_t)nmp, sizeof (struct nfsmount)); TAILQ_INIT(&nmp->nm_bufq); mp->mnt_data = (qaddr_t)nmp; } vfs_getnewfsid(mp); nmp->nm_mountp = mp; /* * V2 can only handle 32 bit filesizes. A 4GB-1 limit may be too * high, depending on whether we end up with negative offsets in * the client or server somewhere. 2GB-1 may be safer. * * For V3, nfs_fsinfo will adjust this as necessary. Assume maximum * that we can handle until we find out otherwise. * XXX Our "safe" limit on the client is what we can store in our * buffer cache using signed(!) block numbers. */ if ((argp->flags & NFSMNT_NFSV3) == 0) nmp->nm_maxfilesize = 0xffffffffLL; else nmp->nm_maxfilesize = (u_int64_t)0x80000000 * DEV_BSIZE - 1; nmp->nm_timeo = NFS_TIMEO; nmp->nm_retry = NFS_RETRANS; if ((argp->flags & NFSMNT_NFSV3) && argp->sotype == SOCK_STREAM) { nmp->nm_wsize = nmp->nm_rsize = NFS_MAXDATA; } else { nmp->nm_wsize = NFS_WSIZE; nmp->nm_rsize = NFS_RSIZE; } nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000); nmp->nm_readdirsize = NFS_READDIRSIZE; nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_readahead = NFS_DEFRAHEAD; nmp->nm_deadthresh = NFS_MAXDEADTHRESH; nmp->nm_tprintf_delay = nfs_tprintf_delay; if (nmp->nm_tprintf_delay < 0) nmp->nm_tprintf_delay = 0; nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay; if (nmp->nm_tprintf_initial_delay < 0) nmp->nm_tprintf_initial_delay = 0; nmp->nm_fhsize = argp->fhsize; bcopy((caddr_t)argp->fh, (caddr_t)nmp->nm_fh, argp->fhsize); bcopy(hst, mp->mnt_stat.f_mntfromname, MNAMELEN); nmp->nm_nam = nam; /* Set up the sockets and per-host congestion */ nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; nmp->nm_rpcops = &nfs_rpcops; nfs_decode_args(mp, nmp, argp); if (nmp->nm_sotype == SOCK_STREAM) mtx_init(&nmp->nm_nfstcpstate.mtx, "NFS/TCP state lock", NULL, MTX_DEF); /* * For Connection based sockets (TCP,...) defer the connect until * the first request, in case the server is not responding. */ if (nmp->nm_sotype == SOCK_DGRAM && (error = nfs_connect(nmp, NULL))) goto bad; /* * This is silly, but it has to be set so that vinifod() works. * We do not want to do an nfs_statfs() here since we can get * stuck on a dead server and we are holding a lock on the mount * point. */ mp->mnt_stat.f_iosize = nfs_iosize(nmp); /* * A reference count is needed on the nfsnode representing the * remote root. If this object is not persistent, then backward * traversals of the mount point (i.e. "..") will not work if * the nfsnode gets flushed out of the cache. Ufs does not have * this problem, because one can identify root inodes by their * number == ROOTINO (2). */ error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); if (error) goto bad; *vpp = NFSTOV(np); /* * Get file attributes and transfer parameters for the * mountpoint. 
This has the side effect of filling in * (*vpp)->v_type with the correct value. */ if (argp->flags & NFSMNT_NFSV3) nfs_fsinfo(nmp, *vpp, curthread->td_ucred, curthread); else VOP_GETATTR(*vpp, &attrs, curthread->td_ucred, curthread); /* * Lose the lock but keep the ref. */ VOP_UNLOCK(*vpp, 0, curthread); return (0); bad: if (nmp->nm_sotype == SOCK_STREAM) mtx_destroy(&nmp->nm_nfstcpstate.mtx); nfs_disconnect(nmp); uma_zfree(nfsmount_zone, nmp); FREE(nam, M_SONAME); return (error); } /* * unmount system call */ static int nfs_unmount(struct mount *mp, int mntflags, struct thread *td) { struct nfsmount *nmp; int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; nmp = VFSTONFS(mp); /* * Goes something like this.. * - Call vflush() to clear out vnodes for this filesystem * - Close the socket * - Free up the data structures */ /* In the forced case, cancel any outstanding requests. */ if (flags & FORCECLOSE) { error = nfs_nmcancelreqs(nmp); if (error) return (error); } /* We hold 1 extra ref on the root vnode; see comment in mountnfs(). */ error = vflush(mp, 1, flags, td); if (error) return (error); /* * We are now committed to the unmount. */ nfs_disconnect(nmp); FREE(nmp->nm_nam, M_SONAME); if (nmp->nm_sotype == SOCK_STREAM) mtx_destroy(&nmp->nm_nfstcpstate.mtx); uma_zfree(nfsmount_zone, nmp); return (0); } /* * Return root of a filesystem */ static int nfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) { struct vnode *vp; struct nfsmount *nmp; struct nfsnode *np; int error; nmp = VFSTONFS(mp); error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); if (error) return (error); vp = NFSTOV(np); /* * Get transfer parameters and attributes for root vnode once. */ if ((nmp->nm_state & NFSSTA_GOTFSINFO) == 0 && (nmp->nm_flag & NFSMNT_NFSV3)) { nfs_fsinfo(nmp, vp, curthread->td_ucred, curthread); } if (vp->v_type == VNON) vp->v_type = VDIR; vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } /* * Flush out the buffer cache */ /* ARGSUSED */ static int nfs_sync(struct mount *mp, int waitfor, struct thread *td) { - struct vnode *vp, *nvp; + struct vnode *vp, *mvp; int error, allerror = 0; /* * Force stale buffer cache information to be flushed. */ MNT_ILOCK(mp); loop: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (VOP_ISLOCKED(vp, NULL) || vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (allerror); } static int nfs_sysctl(struct mount *mp, fsctlop_t op, struct sysctl_req *req) { struct nfsmount *nmp = VFSTONFS(mp); struct vfsquery vq; int error; bzero(&vq, sizeof(vq)); switch (op) { #if 0 case VFS_CTL_NOLOCKS: val = (nmp->nm_flag & NFSMNT_NOLOCKS) ? 
1 : 0; if (req->oldptr != NULL) { error = SYSCTL_OUT(req, &val, sizeof(val)); if (error) return (error); } if (req->newptr != NULL) { error = SYSCTL_IN(req, &val, sizeof(val)); if (error) return (error); if (val) nmp->nm_flag |= NFSMNT_NOLOCKS; else nmp->nm_flag &= ~NFSMNT_NOLOCKS; } break; #endif case VFS_CTL_QUERY: if (nmp->nm_state & NFSSTA_TIMEO) vq.vq_flags |= VQ_NOTRESP; #if 0 if (!(nmp->nm_flag & NFSMNT_NOLOCKS) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) vq.vq_flags |= VQ_NOTRESPLOCK; #endif error = SYSCTL_OUT(req, &vq, sizeof(vq)); break; case VFS_CTL_TIMEO: if (req->oldptr != NULL) { error = SYSCTL_OUT(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) return (error); } if (req->newptr != NULL) { error = vfs_suser(mp, req->td); if (error) return (error); error = SYSCTL_IN(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) return (error); if (nmp->nm_tprintf_initial_delay < 0) nmp->nm_tprintf_initial_delay = 0; } break; default: return (ENOTSUP); } return (0); } Index: head/sys/sys/mount.h =================================================================== --- head/sys/sys/mount.h (revision 154151) +++ head/sys/sys/mount.h (revision 154152) @@ -1,688 +1,702 @@ /*- * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mount.h 8.21 (Berkeley) 5/20/95 * $FreeBSD$ */ #ifndef _SYS_MOUNT_H_ #define _SYS_MOUNT_H_ #include #include #ifdef _KERNEL #include #include #include #endif typedef struct fsid { int32_t val[2]; } fsid_t; /* filesystem id type */ /* * File identifier. * These are unique per filesystem on a single machine. 
*/ #define MAXFIDSZ 16 struct fid { u_short fid_len; /* length of data in bytes */ u_short fid_reserved; /* force longword alignment */ char fid_data[MAXFIDSZ]; /* data (variable length) */ }; /* * filesystem statistics */ #define MFSNAMELEN 16 /* length of type name including null */ #define MNAMELEN 88 /* size of on/from name bufs */ #define STATFS_VERSION 0x20030518 /* current version number */ struct statfs { uint32_t f_version; /* structure version number */ uint32_t f_type; /* type of filesystem */ uint64_t f_flags; /* copy of mount exported flags */ uint64_t f_bsize; /* filesystem fragment size */ uint64_t f_iosize; /* optimal transfer block size */ uint64_t f_blocks; /* total data blocks in filesystem */ uint64_t f_bfree; /* free blocks in filesystem */ int64_t f_bavail; /* free blocks avail to non-superuser */ uint64_t f_files; /* total file nodes in filesystem */ int64_t f_ffree; /* free nodes avail to non-superuser */ uint64_t f_syncwrites; /* count of sync writes since mount */ uint64_t f_asyncwrites; /* count of async writes since mount */ uint64_t f_syncreads; /* count of sync reads since mount */ uint64_t f_asyncreads; /* count of async reads since mount */ uint64_t f_spare[10]; /* unused spare */ uint32_t f_namemax; /* maximum filename length */ uid_t f_owner; /* user that mounted the filesystem */ fsid_t f_fsid; /* filesystem id */ char f_charspare[80]; /* spare string space */ char f_fstypename[MFSNAMELEN]; /* filesystem type name */ char f_mntfromname[MNAMELEN]; /* mounted filesystem */ char f_mntonname[MNAMELEN]; /* directory on which mounted */ }; #ifdef _KERNEL #define OMFSNAMELEN 16 /* length of fs type name, including null */ #define OMNAMELEN (88 - 2 * sizeof(long)) /* size of on/from name bufs */ /* XXX getfsstat.2 is out of date with write and read counter changes here. */ /* XXX statfs.2 is out of date with read counter changes here. */ struct ostatfs { long f_spare2; /* placeholder */ long f_bsize; /* fundamental filesystem block size */ long f_iosize; /* optimal transfer block size */ long f_blocks; /* total data blocks in filesystem */ long f_bfree; /* free blocks in fs */ long f_bavail; /* free blocks avail to non-superuser */ long f_files; /* total file nodes in filesystem */ long f_ffree; /* free file nodes in fs */ fsid_t f_fsid; /* filesystem id */ uid_t f_owner; /* user that mounted the filesystem */ int f_type; /* type of filesystem */ int f_flags; /* copy of mount exported flags */ long f_syncwrites; /* count of sync writes since mount */ long f_asyncwrites; /* count of async writes since mount */ char f_fstypename[OMFSNAMELEN]; /* fs type name */ char f_mntonname[OMNAMELEN]; /* directory on which mounted */ long f_syncreads; /* count of sync reads since mount */ long f_asyncreads; /* count of async reads since mount */ short f_spares1; /* unused spare */ char f_mntfromname[OMNAMELEN];/* mounted filesystem */ short f_spares2; /* unused spare */ /* * XXX on machines where longs are aligned to 8-byte boundaries, there * is an unnamed int32_t here. This spare was after the apparent end * of the struct until we bit off the read counters from f_mntonname. */ long f_spare[2]; /* unused spare */ }; #define MMAXOPTIONLEN 65536 /* maximum length of a mount option */ TAILQ_HEAD(vnodelst, vnode); struct vfsoptlist; struct vfsopt; /* * Structure per mounted filesystem. Each mounted filesystem has an * array of operations and an instance record. The filesystems are * put on a doubly linked list. 
* */ struct mount { TAILQ_ENTRY(mount) mnt_list; /* mount list */ struct vfsops *mnt_op; /* operations on fs */ struct vfsconf *mnt_vfc; /* configuration info */ struct vnode *mnt_vnodecovered; /* vnode we mounted on */ struct vnode *mnt_syncer; /* syncer vnode */ struct vnodelst mnt_nvnodelist; /* list of vnodes this mount */ struct lock mnt_lock; /* mount structure lock */ struct mtx mnt_mtx; /* mount structure interlock */ int mnt_writeopcount; /* write syscalls in progress */ u_int mnt_flag; /* flags shared with user */ struct vfsoptlist *mnt_opt; /* current mount options */ struct vfsoptlist *mnt_optnew; /* new options passed to fs */ int mnt_kern_flag; /* kernel only flags */ int mnt_maxsymlinklen; /* max size of short symlink */ struct statfs mnt_stat; /* cache of filesystem stats */ struct ucred *mnt_cred; /* credentials of mounter */ void * mnt_data; /* private data */ time_t mnt_time; /* last time written*/ int mnt_iosize_max; /* max size for clusters, etc */ struct netexport *mnt_export; /* export list */ struct label *mnt_mntlabel; /* MAC label for the mount */ struct label *mnt_fslabel; /* MAC label for the fs */ int mnt_nvnodelistsize; /* # of vnodes on this mount */ u_int mnt_hashseed; /* Random seed for vfs_hash */ + int mnt_markercnt; /* marker vnodes in use */ + int mnt_holdcnt; /* hold count */ + int mnt_holdcntwaiters; /* waits on hold count */ }; -struct vnode *__mnt_vnode_next(struct vnode **nvp, struct mount *mp); +struct vnode *__mnt_vnode_next(struct vnode **mvp, struct mount *mp); +struct vnode *__mnt_vnode_first(struct vnode **mvp, struct mount *mp); +void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp); -#define MNT_VNODE_FOREACH(vp, mp, vp2) \ - for ((vp2) = TAILQ_FIRST(&(mp)->mnt_nvnodelist); \ - (vp = __mnt_vnode_next(&(vp2), (mp))) != NULL;) +#define MNT_VNODE_FOREACH(vp, mp, mvp) \ + for (vp = __mnt_vnode_first(&(mvp), (mp)); \ + (vp) != NULL; vp = __mnt_vnode_next(&(mvp), (mp))) +#define MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp) \ + __mnt_vnode_markerfree(&(mvp), (mp)) + +#define MNT_VNODE_FOREACH_ABORT(mp, mvp) \ + do { \ + MNT_ILOCK(mp); \ + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); \ + MNT_IUNLOCK(mp); \ + } while (0) #define MNT_ILOCK(mp) mtx_lock(&(mp)->mnt_mtx) #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) #define MNT_MTX(mp) (&(mp)->mnt_mtx) #endif /* _KERNEL */ /* * User specifiable flags. */ #define MNT_RDONLY 0x00000001 /* read only filesystem */ #define MNT_SYNCHRONOUS 0x00000002 /* filesystem written synchronously */ #define MNT_NOEXEC 0x00000004 /* can't exec from filesystem */ #define MNT_NOSUID 0x00000008 /* don't honor setuid bits on fs */ #define MNT_UNION 0x00000020 /* union with underlying filesystem */ #define MNT_ASYNC 0x00000040 /* filesystem written asynchronously */ #define MNT_SUIDDIR 0x00100000 /* special handling of SUID on dirs */ #define MNT_SOFTDEP 0x00200000 /* soft updates being done */ #define MNT_NOSYMFOLLOW 0x00400000 /* do not follow symlinks */ #define MNT_MULTILABEL 0x04000000 /* MAC support for individual objects */ #define MNT_ACLS 0x08000000 /* ACL support enabled */ #define MNT_NOATIME 0x10000000 /* disable update of file access time */ #define MNT_NOCLUSTERR 0x40000000 /* disable cluster read */ #define MNT_NOCLUSTERW 0x80000000 /* disable cluster write */ /* * NFS export related mount flags. 
*/ #define MNT_EXRDONLY 0x00000080 /* exported read only */ #define MNT_EXPORTED 0x00000100 /* filesystem is exported */ #define MNT_DEFEXPORTED 0x00000200 /* exported to the world */ #define MNT_EXPORTANON 0x00000400 /* use anon uid mapping for everyone */ #define MNT_EXKERB 0x00000800 /* exported with Kerberos uid mapping */ #define MNT_EXPUBLIC 0x20000000 /* public export (WebNFS) */ /* * Flags set by internal operations, * but visible to the user. * XXX some of these are not quite right.. (I've never seen the root flag set) */ #define MNT_LOCAL 0x00001000 /* filesystem is stored locally */ #define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */ #define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */ #define MNT_USER 0x00008000 /* mounted by a user */ #define MNT_IGNORE 0x00800000 /* do not show entry in df */ /* * Mask of flags that are visible to statfs(). * XXX I think that this could now become (~(MNT_CMDFLAGS)) * but the 'mount' program may need changing to handle this. */ #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ MNT_NOSUID | MNT_UNION | \ MNT_ASYNC | MNT_EXRDONLY | MNT_EXPORTED | \ MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ MNT_LOCAL | MNT_USER | MNT_QUOTA | \ MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ MNT_MULTILABEL | MNT_ACLS) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | \ MNT_NOATIME | \ MNT_NOSYMFOLLOW | MNT_IGNORE | \ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR | \ MNT_ACLS | MNT_USER) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. * XXX: These are not STATES and really should be somewhere else. * XXX: MNT_BYFSID collides with MNT_ACLS, but because MNT_ACLS is only used for * mount(2) and MNT_BYFSID is only used for unmount(2) it's harmless. */ #define MNT_UPDATE 0x00010000 /* not a real mount, just an update */ #define MNT_DELEXPORT 0x00020000 /* delete export host lists */ #define MNT_RELOAD 0x00040000 /* reload filesystem data */ #define MNT_FORCE 0x00080000 /* force unmount or readonly change */ #define MNT_SNAPSHOT 0x01000000 /* snapshot the filesystem */ #define MNT_BYFSID 0x08000000 /* specify filesystem by ID. */ #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_BYFSID) /* * Still available. */ #define MNT_SPARE_0x00000010 0x00000010 #define MNT_SPARE_0x02000000 0x02000000 /* * Internal filesystem control flags stored in mnt_kern_flag. * * MNTK_UNMOUNT locks the mount entry so that name lookup cannot proceed * past the mount point. This keeps the subtree stable during mounts * and unmounts. * * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while * dounmount() is still waiting to lock the mountpoint. This allows * the filesystem to cancel operations that might otherwise deadlock * with the unmount attempt (used by NFS). */ #define MNTK_UNMOUNTF 0x00000001 /* forced unmount in progress */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ #define MNTK_SUSPENDED 0x10000000 /* write operations are suspended */ #define MNTK_MPSAFE 0x20000000 /* Filesystem is MPSAFE. */ #define MNTK_NOKNOTE 0x80000000 /* Don't send KNOTEs from VOP hooks */ /* * Sysctl CTL_VFS definitions. 
* * Second level identifier specifies which filesystem. Second level * identifier VFS_VFSCONF returns information about all filesystems. * Second level identifier VFS_GENERIC is non-terminal. */ #define VFS_VFSCONF 0 /* get configured filesystems */ #define VFS_GENERIC 0 /* generic filesystem information */ /* * Third level identifiers for VFS_GENERIC are given below; third * level identifiers for specific filesystems are given in their * mount specific header files. */ #define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */ #define VFS_CONF 2 /* struct: vfsconf for filesystem given as next argument */ /* * Flags for various system call interfaces. * * waitfor flags to vfs_sync() and getfsstat() */ #define MNT_WAIT 1 /* synchronously wait for I/O to complete */ #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ #define MNT_LAZY 3 /* push data not written by filesystem syncer */ /* * Generic file handle */ struct fhandle { fsid_t fh_fsid; /* Filesystem id of mount point */ struct fid fh_fid; /* Filesys specific id */ }; typedef struct fhandle fhandle_t; /* * Export arguments for local filesystem mount calls. */ struct export_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ }; /* * Structure holding information for a publicly exported filesystem * (WebNFS). Currently the specs allow just for one such filesystem. */ struct nfs_public { int np_valid; /* Do we hold valid information */ fhandle_t np_handle; /* Filehandle for pub fs (internal) */ struct mount *np_mount; /* Mountpoint of exported fs */ char *np_index; /* Index file */ }; /* * Filesystem configuration information. One of these exists for each * type of filesystem supported by the kernel. These are searched at * mount time to identify the requested filesystem. * * XXX: Never change the first two arguments! */ struct vfsconf { u_int vfc_version; /* ABI version number */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ struct vfsops *vfc_vfsops; /* filesystem operations vector */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsoptdecl *vfc_opts; /* mount options */ TAILQ_ENTRY(vfsconf) vfc_list; /* list of vfscons */ }; /* Userland version of the struct vfsconf. */ struct xvfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsconf *vfc_next; /* next in list */ }; #ifndef BURN_BRIDGES struct ovfsconf { void *vfc_vfsops; char vfc_name[32]; int vfc_index; int vfc_refcount; int vfc_flags; }; #endif /* * NB: these flags refer to IMPLEMENTATION properties, not properties of * any actual mounts; i.e., it does not make sense to change the flags. 
*/ #define VFCF_STATIC 0x00010000 /* statically compiled into kernel */ #define VFCF_NETWORK 0x00020000 /* may get data over the network */ #define VFCF_READONLY 0x00040000 /* writes are not implemented */ #define VFCF_SYNTHETIC 0x00080000 /* data does not represent real files */ #define VFCF_LOOPBACK 0x00100000 /* aliases some other mounted FS */ #define VFCF_UNICODE 0x00200000 /* stores file names as Unicode*/ typedef uint32_t fsctlop_t; struct vfsidctl { int vc_vers; /* should be VFSIDCTL_VERS1 (below) */ fsid_t vc_fsid; /* fsid to operate on. */ char vc_fstypename[MFSNAMELEN]; /* type of fs 'nfs' or '*' */ fsctlop_t vc_op; /* operation VFS_CTL_* (below) */ void *vc_ptr; /* pointer to data structure. */ size_t vc_len; /* sizeof said structure. */ u_int32_t vc_spare[12]; /* spare (must be zero). */ }; /* vfsidctl API version. */ #define VFS_CTL_VERS1 0x01 /* * New style VFS sysctls, do not reuse/conflict with the namespace for * private sysctls. * All "global" sysctl ops have the 33rd bit set: * 0x...1.... * Private sysctl ops should have the 33rd bit unset. */ #define VFS_CTL_QUERY 0x00010001 /* anything wrong? (vfsquery) */ #define VFS_CTL_TIMEO 0x00010002 /* set timeout for vfs notification */ #define VFS_CTL_NOLOCKS 0x00010003 /* disable file locking */ struct vfsquery { u_int32_t vq_flags; u_int32_t vq_spare[31]; }; /* vfsquery flags */ #define VQ_NOTRESP 0x0001 /* server down */ #define VQ_NEEDAUTH 0x0002 /* server bad auth */ #define VQ_LOWDISK 0x0004 /* we're low on space */ #define VQ_MOUNT 0x0008 /* new filesystem arrived */ #define VQ_UNMOUNT 0x0010 /* filesystem has left */ #define VQ_DEAD 0x0020 /* filesystem is dead, needs force unmount */ #define VQ_ASSIST 0x0040 /* filesystem needs assistance from external program */ #define VQ_NOTRESPLOCK 0x0080 /* server lockd down */ #define VQ_FLAG0100 0x0100 /* placeholder */ #define VQ_FLAG0200 0x0200 /* placeholder */ #define VQ_FLAG0400 0x0400 /* placeholder */ #define VQ_FLAG0800 0x0800 /* placeholder */ #define VQ_FLAG1000 0x1000 /* placeholder */ #define VQ_FLAG2000 0x2000 /* placeholder */ #define VQ_FLAG4000 0x4000 /* placeholder */ #define VQ_FLAG8000 0x8000 /* placeholder */ #ifdef _KERNEL /* Point a sysctl request at a vfsidctl's data. */ #define VCTLTOREQ(vc, req) \ do { \ (req)->newptr = (vc)->vc_ptr; \ (req)->newlen = (vc)->vc_len; \ (req)->newidx = 0; \ } while (0) #endif struct iovec; struct uio; #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_MOUNT); #endif extern int maxvfsconf; /* highest defined filesystem type */ extern int nfs_mount_type; /* vfc_typenum for nfs, or -1 */ TAILQ_HEAD(vfsconfhead, vfsconf); extern struct vfsconfhead vfsconf; /* * Operations supported on mounted filesystem. 
*/ struct mount_args; struct nameidata; struct sysctl_req; struct mntarg; typedef int vfs_cmount_t(struct mntarg *ma, void *data, int flags, struct thread *td); typedef int vfs_unmount_t(struct mount *mp, int mntflags, struct thread *td); typedef int vfs_root_t(struct mount *mp, int flags, struct vnode **vpp, struct thread *td); typedef int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid, void *arg, struct thread *td); typedef int vfs_statfs_t(struct mount *mp, struct statfs *sbp, struct thread *td); typedef int vfs_sync_t(struct mount *mp, int waitfor, struct thread *td); typedef int vfs_vget_t(struct mount *mp, ino_t ino, int flags, struct vnode **vpp); typedef int vfs_fhtovp_t(struct mount *mp, struct fid *fhp, struct vnode **vpp); typedef int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp); typedef int vfs_vptofh_t(struct vnode *vp, struct fid *fhp); typedef int vfs_init_t(struct vfsconf *); typedef int vfs_uninit_t(struct vfsconf *); typedef int vfs_extattrctl_t(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname, struct thread *td); typedef int vfs_mount_t(struct mount *mp, struct thread *td); typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op, struct sysctl_req *req); struct vfsops { vfs_mount_t *vfs_mount; vfs_cmount_t *vfs_cmount; vfs_unmount_t *vfs_unmount; vfs_root_t *vfs_root; vfs_quotactl_t *vfs_quotactl; vfs_statfs_t *vfs_statfs; vfs_sync_t *vfs_sync; vfs_vget_t *vfs_vget; vfs_fhtovp_t *vfs_fhtovp; vfs_checkexp_t *vfs_checkexp; vfs_vptofh_t *vfs_vptofh; vfs_init_t *vfs_init; vfs_uninit_t *vfs_uninit; vfs_extattrctl_t *vfs_extattrctl; vfs_sysctl_t *vfs_sysctl; }; vfs_statfs_t __vfs_statfs; #define VFS_MOUNT(MP, P) (*(MP)->mnt_op->vfs_mount)(MP, P) #define VFS_UNMOUNT(MP, FORCE, P) (*(MP)->mnt_op->vfs_unmount)(MP, FORCE, P) #define VFS_ROOT(MP, FLAGS, VPP, P) \ (*(MP)->mnt_op->vfs_root)(MP, FLAGS, VPP, P) #define VFS_QUOTACTL(MP,C,U,A,P) (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A, P) #define VFS_STATFS(MP, SBP, P) __vfs_statfs((MP), (SBP), (P)) #define VFS_SYNC(MP, WAIT, P) (*(MP)->mnt_op->vfs_sync)(MP, WAIT, P) #define VFS_VGET(MP, INO, FLAGS, VPP) \ (*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP) #define VFS_FHTOVP(MP, FIDP, VPP) \ (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, VPP) #define VFS_VPTOFH(VP, FIDP) (*(VP)->v_mount->mnt_op->vfs_vptofh)(VP, FIDP) #define VFS_CHECKEXP(MP, NAM, EXFLG, CRED) \ (*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED) #define VFS_EXTATTRCTL(MP, C, FN, NS, N, P) \ (*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N, P) #define VFS_SYSCTL(MP, OP, REQ) \ (*(MP)->mnt_op->vfs_sysctl)(MP, OP, REQ) extern int mpsafe_vfs; #define VFS_NEEDSGIANT(MP) \ (!mpsafe_vfs || ((MP) != NULL && ((MP)->mnt_kern_flag & MNTK_MPSAFE) == 0)) #define VFS_LOCK_GIANT(MP) __extension__ \ ({ \ int _locked; \ if (VFS_NEEDSGIANT((MP))) { \ mtx_lock(&Giant); \ _locked = 1; \ } else \ _locked = 0; \ _locked; \ }) #define VFS_UNLOCK_GIANT(locked) if ((locked)) mtx_unlock(&Giant); #define VFS_ASSERT_GIANT(MP) do \ { \ if (VFS_NEEDSGIANT((MP))) \ mtx_assert(&Giant, MA_OWNED); \ } while (0) #define VFS_KNOTE_LOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 1); \ } while (0) #define VFS_KNOTE_UNLOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 0); \ } while (0) #include /* * Version numbers. 
*/ #define VFS_VERSION_00 0x19660120 #define VFS_VERSION VFS_VERSION_00 #define VFS_SET(vfsops, fsname, flags) \ static struct vfsconf fsname ## _vfsconf = { \ .vfc_version = VFS_VERSION, \ .vfc_name = #fsname, \ .vfc_vfsops = &vfsops, \ .vfc_typenum = -1, \ .vfc_flags = flags, \ }; \ static moduledata_t fsname ## _mod = { \ #fsname, \ vfs_modevent, \ & fsname ## _vfsconf \ }; \ DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE) extern char *mountrootfsname; /* * exported vnode operations */ int dounmount(struct mount *, int, struct thread *); int kernel_mount(struct mntarg *ma, int flags); int kernel_vmount(int flags, ...); struct mntarg *mount_arg(struct mntarg *ma, const char *name, const void *val, int len); struct mntarg *mount_argb(struct mntarg *ma, int flag, const char *name); struct mntarg *mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...); struct mntarg *mount_argsu(struct mntarg *ma, const char *name, const void *val, int len); struct vfsconf *vfs_byname(const char *); struct vfsconf *vfs_byname_kld(const char *, struct thread *td, int *); void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); int vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val); int vfs_getopt(struct vfsoptlist *, const char *, void **, int *); char *vfs_getopts(struct vfsoptlist *, const char *, int *error); int vfs_copyopt(struct vfsoptlist *, const char *, void *, int); int vfs_filteropt(struct vfsoptlist *, const char **legal); int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...); int vfs_setpublicfs /* set publicly exported fs */ (struct mount *, struct netexport *, struct export_args *); int vfs_lock(struct mount *); /* lock a vfs */ void vfs_msync(struct mount *, int); void vfs_unlock(struct mount *); /* unlock a vfs */ int vfs_busy(struct mount *, int, struct mtx *, struct thread *); int vfs_export /* process mount export info */ (struct mount *, struct export_args *); int vfs_allocate_syncvnode(struct mount *); void vfs_getnewfsid(struct mount *); struct cdev *vfs_getrootfsid(struct mount *); struct mount *vfs_getvfs(fsid_t *); /* return vfs given fsid */ int vfs_modevent(module_t, int, void *); void vfs_mount_error(struct mount *, const char *, ...); void vfs_mountroot(void); /* mount our root filesystem */ void vfs_mountedfrom(struct mount *, const char *from); int vfs_suser(struct mount *, struct thread *); void vfs_unbusy(struct mount *, struct thread *); void vfs_unmountall(void); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx mountlist_mtx; extern struct nfs_public nfs_pub; /* * Declarations for these vfs default operations are located in * kern/vfs_default.c, they should be used instead of making "dummy" * functions or casting entries in the VFS op table to "enopnotsupp()". */ vfs_root_t vfs_stdroot; vfs_quotactl_t vfs_stdquotactl; vfs_statfs_t vfs_stdstatfs; vfs_sync_t vfs_stdsync; vfs_sync_t vfs_stdnosync; vfs_vget_t vfs_stdvget; vfs_fhtovp_t vfs_stdfhtovp; vfs_checkexp_t vfs_stdcheckexp; vfs_vptofh_t vfs_stdvptofh; vfs_init_t vfs_stdinit; vfs_uninit_t vfs_stduninit; vfs_extattrctl_t vfs_stdextattrctl; vfs_sysctl_t vfs_stdsysctl; /* XXX - these should be indirect functions!!! 
*/ int softdep_fsync(struct vnode *); int softdep_process_worklist(struct mount *); #else /* !_KERNEL */ #include struct stat; __BEGIN_DECLS int fhopen(const struct fhandle *, int); int fhstat(const struct fhandle *, struct stat *); int fhstatfs(const struct fhandle *, struct statfs *); int fstatfs(int, struct statfs *); int getfh(const char *, fhandle_t *); int getfsstat(struct statfs *, long, int); int getmntinfo(struct statfs **, int); int lgetfh(const char *, fhandle_t *); int mount(const char *, const char *, int, void *); int nmount(struct iovec *, unsigned int, int); int statfs(const char *, struct statfs *); int unmount(const char *, int); /* C library stuff */ int getvfsbyname(const char *, struct xvfsconf *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_MOUNT_H_ */ Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 154151) +++ head/sys/sys/vnode.h (revision 154152) @@ -1,730 +1,731 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ /* * XXX - compatability until lockmgr() goes away or all the #includes are * updated. */ #include #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ -enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD }; +enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, + VMARKER }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). 
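/*
 * Sketch only, with hypothetical "examplefs" names: how a filesystem
 * typically allocates its private per-file node and hangs it from v_data
 * after getnewvnode(), so its later vnode operations can recover it.
 */
static int
examplefs_newnode(struct mount *mp, struct vnode **vpp)
{
	struct examplefs_node *np;
	struct vnode *vp;
	int error;

	error = getnewvnode("examplefs", mp, &examplefs_vnodeops, &vp);
	if (error)
		return (error);
	np = malloc(sizeof(*np), M_TEMP, M_WAITOK | M_ZERO);
	vp->v_data = np;		/* recovered by the fs in its later VOPs */
	*vpp = vp;
	return (0);
}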
*/ struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * f - freelist mutex * G - Giant * i - interlock * m - mntvnodes mutex * p - pollinfo lock * s - spechash mutex * S - syncer mutex * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). * * XXX Not all fields are locked yet and some fields that are marked are not * locked consistently. This is a work in progress. Requires Giant! */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) */ enum vtype v_type; /* u vnode type */ const char *v_tag; /* u type of underlying data */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. * See #defines below for renaming to v_* namespace. */ union { struct mount *vu_mount; /* v ptr to mountpoint (VDIR) */ struct socket *vu_socket; /* v unix domain net (VSOCK) */ struct cdev *vu_cdev; /* v device (VCHR, VBLK) */ struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ } v_un; /* * vfs_hash: (mount + inode) -> vnode hash. */ LIST_ENTRY(vnode) v_hashlist; u_int v_hash; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct vnode *v_dd; /* c .. vnode */ /* * clustering stuff */ daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ int v_holdcnt; /* i prevents recycling. */ int v_usecount; /* i ref count of users */ u_long v_iflag; /* i vnode flags (see below) */ u_long v_vflag; /* v vnode flags */ int v_writecount; /* v ref count of writers */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_freelist; /* f vnode freelist */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* G Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define v_mountedhere v_un.vu_mount #define v_socket v_un.vu_socket #define v_rdev v_un.vu_cdev #define v_fifoinfo v_un.vu_fifoinfo /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. 
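/*
 * A rough, illustrative rendering of the list-walking protocol described
 * in the locking comment above, using a mount point's vnode list; vp, mp
 * and td are assumed to be in scope.  Because the list lock is dropped in
 * step 3, a bare TAILQ_FOREACH like this can lose its place, which is why
 * the MNT_VNODE_FOREACH() iterator uses a marker vnode (the VMARKER type
 * added by this change).
 */
	MNT_ILOCK(mp);					/* 1) lock the list */
	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
		VI_LOCK(vp);				/* 2) pin it with the interlock */
		MNT_IUNLOCK(mp);			/* 3) drop the list lock */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
			MNT_ILOCK(mp);			/* 4) vget failed; skip it */
			continue;
		}
		/* 6) ... operate on vp ... */
		vput(vp);
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);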
*/ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* socket, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), (a)); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, 1) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag */ #define VI_MOUNT 0x0020 /* Mount in progress */ #define VI_AGE 0x0040 /* Insert vnode at head of free list */ #define VI_DOOMED 0x0080 /* This vnode is being recycled */ #define VI_FREE 0x0100 /* This vnode is on the freelist */ #define VI_OBJDIRTY 0x0400 /* object might be dirty */ #define VI_DOINGINACT 0x0800 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x1000 /* Need to call inactive */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_TEXT 0x0020 /* vnode is a pure text prototype */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ dev_t va_fsid; /* filesystem id */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. 
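/*
 * Minimal illustration of the VNOVAL convention used by struct vattr:
 * VATTR_NULL() marks every field "no change", and only the fields being
 * set are filled in before VOP_SETATTR().  Sketch only; vp, td and the
 * credential are assumed to be in scope.
 */
	struct vattr va;
	int error;

	VATTR_NULL(&va);
	va.va_mode = 0644;		/* everything else stays VNOVAL */
	error = VOP_SETATTR(vp, &va, td->td_ucred, td);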
*/ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_MARK_ATIME 0x04 /* setting atime for execve/mmap */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather then bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Modes. Some values same as Ixxx entries from inode.h for now. */ #define VEXEC 000100 /* execute/search permission */ #define VWRITE 000200 /* write permission */ #define VREAD 000400 /* read permission */ #define VSVTX 001000 /* save swapped text even after use */ #define VSGID 002000 /* set group id on execution */ #define VSUID 004000 /* set user id on execution */ #define VADMIN 010000 /* permission to administer */ #define VSTAT 020000 /* permission to retrieve attrs */ #define VAPPEND 040000 /* permission to write/append */ #define VALLPERM (VEXEC | VWRITE | VREAD | VADMIN | VSTAT | VAPPEND) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define DOCLOSE 0x0008 /* vclean: close active files */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. 
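/*
 * Quick illustration of the vtype/inode-format conversion macros above;
 * the values noted in the comments are what the macros evaluate to.
 */
	enum vtype t;
	int imode;

	t = IFTOVT(S_IFDIR | 0755);	/* S_IFDIR maps to VDIR */
	imode = MAKEIMODE(VREG, 0644);	/* VREG maps back to S_IFREG | 0644 */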
"/") vnode */ extern int async_io_version; /* 0 or POSIX version of AIO i'face */ extern int desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern int prtactive; /* nonzero to call vprint() */ extern struct vattr va_null; /* predefined null vattr structure */ /* * Macro/function to check for client cache inconsistency w.r.t. leasing. */ #define LEASE_READ 0x1 /* Check lease for readers */ #define LEASE_WRITE 0x2 /* Check lease for modifiers */ extern void (*lease_updatetime)(int deltat); #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. 
*/ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str); #endif void assert_vop_locked(struct vnode *vp, const char *str); #if 0 void assert_vop_slocked(struct vnode *vp, const char *str); #endif void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #if 0 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) assert_vop_locked_other((vp), (str)) #endif #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #if 0 #define ASSERT_VOP_SLOCKED(vp, str) assert_vop_slocked((vp), (str)) #endif #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) #define ASSERT_VI_UNLOCKED(vp, str) #define ASSERT_VOP_ELOCKED(vp, str) #if 0 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) #endif #define ASSERT_VOP_LOCKED(vp, str) #if 0 #define ASSERT_VOP_SLOCKED(vp, str) #endif #define ASSERT_VOP_UNLOCKED(vp, str) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ #include "vnode_if.h" /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vnode; extern int (*lease_check_hook)(struct vop_lease_args *); extern int (*softdep_process_worklist_hook)(struct mount *); /* cache_* may belong in namei.h. 
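/*
 * Typical, purely illustrative use of the lock assertions above inside a
 * vnode operation; the surrounding function and filesystem are hypothetical.
 * With DEBUG_VFS_LOCKS the assertion fires if the caller broke the locking
 * protocol; without it the macro compiles away to nothing.
 */
static int
examplefs_read(struct vop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;

	ASSERT_VOP_LOCKED(vp, "examplefs_read");
	/* ... perform the read ... */
	return (0);
}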
*/ void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp); void cache_purge(struct vnode *vp); void cache_purgevfs(struct mount *mp); int cache_leaf_test(struct vnode *vp); int change_dir(struct vnode *vp, struct thread *td); int change_root(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void cvtnstat(struct stat *sb, struct nstat *nsb); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); u_quad_t init_va_filerev(void); int lease_check(struct vop_lease_args *ap); int speedup_syncer(void); #define textvp_fullpath(p, rb, rfb) \ vn_fullpath(FIRST_THREAD_IN_PROC(p), (p)->p_textvp, rb, rfb) int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); int vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid, mode_t acc_mode, struct ucred *cred, int *privused); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused); void vattr_null(struct vattr *vap); int vcount(struct vnode *vp); void vdrop(struct vnode *); void vfs_add_vnodeops(const void *); void vfs_rm_vnodeops(const void *); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int lockflag, struct thread *td); void vgone(struct vnode *vp); void vhold(struct vnode *); void vholdl(struct vnode *); int vinvalbuf(struct vnode *vp, int save, struct thread *td, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize); void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); #define vprint(label, vp) vn_printf((vp), "%s\n", (label)) int vrecycle(struct vnode *vp, struct thread *td); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); void vn_finished_write(struct mount *mp); int vn_isdisk(struct vnode *vp, int *errp); int vn_lock(struct vnode *vp, int flags, struct thread *td); int vn_open(struct nameidata *ndp, int *flagp, int cmode, int fdidx); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, struct ucred *cred, int fdidx); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, int *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_write_suspend_wait(struct vnode *vp, struct mount *mp, int flags); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); void 
vfs_write_resume(struct mount *mp); int vfs_write_suspend(struct mount *mp); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); /* These are called from within the actual VOPS. */ void vop_create_post(void *a, int rc); void vop_link_post(void *a, int rc); void vop_lock_pre(void *a); void vop_lock_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_post(void *a, int rc); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_post(void *a, int rc); void vop_strategy_pre(void *a); void vop_symlink_post(void *a, int rc); void vop_unlock_post(void *a, int rc); void vop_unlock_pre(void *a); #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error, osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred, \ curthread); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? 
NOTE_EXTEND : 0)); \ } void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); int vrefcnt(struct vnode *vp); void v_addpollinfo(struct vnode *vp); int vnode_create_vobject(struct vnode *vp, size_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); int vfs_hash_get(struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); void vfs_mark_atime(struct vnode *vp, struct thread *td); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: head/sys/ufs/ffs/ffs_snapshot.c =================================================================== --- head/sys/ufs/ffs/ffs_snapshot.c (revision 154151) +++ head/sys/ufs/ffs/ffs_snapshot.c (revision 154152) @@ -1,2231 +1,2233 @@ /*- * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNCRED thread0.td_ucred #define DEBUG 1 #include "opt_ffs.h" #ifdef NO_FFS_SNAPSHOT int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { return (EINVAL); } int ffs_snapblkfree(fs, devvp, bno, size, inum) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; { return (EINVAL); } void ffs_snapremove(vp) struct vnode *vp; { } void ffs_snapshot_mount(mp) struct mount *mp; { } void ffs_snapshot_unmount(mp) struct mount *mp; { } void ffs_snapgone(ip) struct inode *ip; { } int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { return (EINVAL); } #else TAILQ_HEAD(snaphead, inode); struct snapdata { struct snaphead sn_head; daddr_t sn_listsize; daddr_t *sn_blklist; struct lock sn_lock; }; static int cgaccount(int, struct vnode *, struct buf *, int); static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int indiracct_ufs1(struct vnode *, struct vnode *, int, ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int indiracct_ufs2(struct vnode *, struct vnode *, int, ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); /* * To ensure the consistency of snapshots across crashes, we must * synchronously write out copied blocks before allowing the * originals to be modified. Because of the rather severe speed * penalty that this imposes, the following flag allows this * crash persistence to be disabled. */ int dopersistence = 0; #ifdef DEBUG #include SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); static int snapdebug = 0; SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); int collectsnapstats = 0; SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 0, ""); #endif /* DEBUG */ /* * Create a snapshot file and initialize it for the filesystem. 
*/ int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; int error, cg, snaploc; int i, size, len, loc; int flag = mp->mnt_flag; struct timespec starttime = {0, 0}, endtime; char saved_nice = 0; long redo = 0, snaplistsize = 0; int32_t *lp; void *space; struct fs *copy_fs = NULL, *fs; struct thread *td = curthread; struct inode *ip, *xp; struct buf *bp, *nbp, *ibp, *sbp = NULL; struct nameidata nd; struct mount *wrtmp; struct vattr vat; - struct vnode *vp, *xvp, *nvp, *devvp; + struct vnode *vp, *xvp, *mvp, *devvp; struct uio auio; struct iovec aiov; struct snapdata *sn; struct ufsmount *ump; ump = VFSTOUFS(mp); fs = ump->um_fs; /* * XXX: make sure we don't go to out1 before we setup sn */ sn = (void *)0xdeadbeef; /* * Need to serialize access to snapshot code per filesystem. */ /* * Assign a snapshot slot in the superblock. */ UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; UFS_UNLOCK(ump); if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Create the snapshot file. */ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); vput(nd.ni_dvp); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); return (error); } vp = nd.ni_vp; ip = VTOI(vp); devvp = ip->i_devvp; /* * Allocate and copy the last block contents so as to be able * to set size to that of the filesystem. */ numblks = howmany(fs->fs_size, fs->fs_frag); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) goto out; ip->i_size = lblktosize(fs, (off_t)numblks); DIP_SET(ip, i_size, ip->i_size); ip->i_flag |= IN_CHANGE | IN_UPDATE; if ((error = readblock(vp, bp, numblks - 1)) != 0) goto out; bawrite(bp); /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks and mark all of them as not * needing to be copied. */ for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); if (error) goto out; bawrite(ibp); } /* * Allocate copies for the superblock and its summary information. 
*/ error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Copy all the cylinder group maps. Although the * filesystem is still active, we hope that only a few * cylinder groups will change between now and when we * suspend operations. Thus, we will be able to quickly * touch up the few cylinder groups that changed during * the suspension period. */ len = howmany(fs->fs_ncg, NBBY); MALLOC(space, void *, len, M_DEVBUF, M_WAITOK|M_ZERO); UFS_LOCK(ump); fs->fs_active = space; UFS_UNLOCK(ump); for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; error = cgaccount(cg, vp, nbp, 1); bawrite(nbp); if (error) goto out; } /* * Change inode to snapshot type file. */ ip->i_flags |= SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * Ensure that the snapshot is completely on disk. * Since we have marked it as a snapshot it is safe to * unlock it as no process will be allowed to write to it. */ if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) goto out; VOP_UNLOCK(vp, 0, td); /* * All allocations are done, so we can now snapshot the system. * * Recind nice scheduling while running with the filesystem suspended. */ if (td->td_proc->p_nice > 0) { PROC_LOCK(td->td_proc); mtx_lock_spin(&sched_lock); saved_nice = td->td_proc->p_nice; sched_nice(td->td_proc, 0); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); } /* * Suspend operation on filesystem. */ for (;;) { vn_finished_write(wrtmp); if ((error = vfs_write_suspend(vp->v_mount)) != 0) { vn_start_write(NULL, &wrtmp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); goto out; } if (mp->mnt_kern_flag & MNTK_SUSPENDED) break; vn_start_write(NULL, &wrtmp, V_WAIT); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (collectsnapstats) nanotime(&starttime); /* * First, copy all the cylinder group maps that have changed. */ for (cg = 0; cg < fs->fs_ncg; cg++) { if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) continue; redo++; error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; error = cgaccount(cg, vp, nbp, 2); bawrite(nbp); if (error) goto out1; } /* * Grab a copy of the superblock and its summary information. * We delay writing it until the suspension is released below. */ error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, KERNCRED, &sbp); if (error) { brelse(sbp); sbp = NULL; goto out1; } loc = blkoff(fs, fs->fs_sblockloc); copy_fs = (struct fs *)(sbp->b_data + loc); bcopy(fs, copy_fs, fs->fs_sbsize); if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) copy_fs->fs_clean = 1; size = fs->fs_bsize < SBLOCKSIZE ? 
fs->fs_bsize : SBLOCKSIZE; if (fs->fs_sbsize < size) bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize); size = blkroundup(fs, fs->fs_cssize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); copy_fs->fs_csp = space; bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); space = (char *)space + fs->fs_cssize; loc = howmany(fs->fs_cssize, fs->fs_fsize); i = fs->fs_frag - loc % fs->fs_frag; len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; if (len > 0) { if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), len, KERNCRED, &bp)) != 0) { brelse(bp); free(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); sbp = NULL; goto out1; } bcopy(bp->b_data, space, (u_int)len); space = (char *)space + len; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } if (fs->fs_contigsumsize > 0) { copy_fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } /* * We must check for active files that have been unlinked * (e.g., with a zero link count). We have to expunge all * trace of these files from the snapshot so that they are * not reclaimed prematurely by fsck or unnecessarily dumped. * We turn off the MNTK_SUSPENDED flag to avoid a panic from * spec_strategy about writing on a suspended filesystem. * Note that we skip unlinked snapshot files as they will * be handled separately below. * * We also calculate the needed size for the snapshot list. */ snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_SUSPENDED; loop: - MNT_VNODE_FOREACH(xvp, mp, nvp) { + MNT_VNODE_FOREACH(xvp, mp, mvp) { VI_LOCK(xvp); MNT_IUNLOCK(mp); if ((xvp->v_iflag & VI_DOOMED) || xvp->v_usecount == 0 || xvp->v_type == VNON || (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { VI_UNLOCK(xvp); MNT_ILOCK(mp); continue; } /* * We can skip parent directory vnode because it must have * this snapshot file in it. */ if (xvp == nd.ni_dvp) { VI_UNLOCK(xvp); MNT_ILOCK(mp); continue; } if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) { MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } if (snapdebug) vprint("ffs_snapshot: busy vnode", xvp); if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 && vat.va_nlink > 0) { VOP_UNLOCK(xvp, 0, td); MNT_ILOCK(mp); continue; } xp = VTOI(xvp); if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { VOP_UNLOCK(xvp, 0, td); MNT_ILOCK(mp); continue; } /* * If there is a fragment, clear it here. */ blkno = 0; loc = howmany(xp->i_size, fs->fs_bsize) - 1; if (loc < NDADDR) { len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } } snaplistsize += 1; if (xp->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY); if (blkno) DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode); VOP_UNLOCK(xvp, 0, td); if (error) { free(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); sbp = NULL; + MNT_VNODE_FOREACH_ABORT(mp, mvp); goto out1; } MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * If there already exist snapshots on this filesystem, grab a * reference to their shared lock. If this is the first snapshot * on this filesystem, we need to allocate a lock for the snapshots * to share. 
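/*
 * Sketch of the marker-based iteration used in the loop above (names are
 * illustrative).  MNT_VNODE_FOREACH() keeps the scan position with a
 * marker vnode of type VMARKER, so the mount interlock can be dropped
 * while a real vnode is examined; leaving the loop early must remove the
 * marker with MNT_VNODE_FOREACH_ABORT(), or the _ILOCKED variant when the
 * mount interlock is still held.
 */
	struct vnode *vp, *mvp;

	MNT_ILOCK(mp);
	MNT_VNODE_FOREACH(vp, mp, mvp) {
		VI_LOCK(vp);
		MNT_IUNLOCK(mp);
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) {
			MNT_ILOCK(mp);
			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
			goto bail;		/* or restart the whole scan */
		}
		/* ... examine vp ... */
		vput(vp);
		MNT_ILOCK(mp);
	}
bail:
	MNT_IUNLOCK(mp);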
In either case, acquire the snapshot lock and give * up our original private lock. */ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn != NULL) { xp = TAILQ_FIRST(&sn->sn_head); VI_UNLOCK(devvp); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; } else { VI_UNLOCK(devvp); sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; mp_fixme("si_snapdata setting is racey."); devvp->v_rdev->si_snapdata = sn; xp = NULL; } lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(vp), td); transferlockers(&vp->v_lock, vp->v_vnlock); lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); /* * If this is the first snapshot on this filesystem, then we need * to allocate the space for the list of preallocated snapshot blocks. * This list will be refined below, but this preliminary one will * keep us out of deadlock until the full one is ready. */ if (xp == NULL) { MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); blkp = &snapblklist[1]; *blkp++ = lblkno(fs, fs->fs_sblockloc); blkno = fragstoblks(fs, fs->fs_csaddr); for (cg = 0; cg < fs->fs_ncg; cg++) { if (fragstoblks(fs, cgtod(fs, cg) > blkno)) break; *blkp++ = fragstoblks(fs, cgtod(fs, cg)); } len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) *blkp++ = blkno + loc; for (; cg < fs->fs_ncg; cg++) *blkp++ = fragstoblks(fs, cgtod(fs, cg)); snapblklist[0] = blkp - snapblklist; VI_LOCK(devvp); if (sn->sn_blklist != NULL) panic("ffs_snapshot: non-empty list"); sn->sn_blklist = snapblklist; sn->sn_listsize = blkp - snapblklist; VI_UNLOCK(devvp); } /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list. */ VI_LOCK(devvp); fs->fs_snapinum[snaploc] = ip->i_number; if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot: %d already on list", ip->i_number); TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); vp->v_vflag |= VV_SYSTEM; out1: KASSERT(sn != (void *)0xdeadbeef, ("email phk@ and mckusick@")); /* * Resume operation on filesystem. */ vfs_write_resume(vp->v_mount); vn_start_write(NULL, &wrtmp, V_WAIT); if (collectsnapstats && starttime.tv_sec > 0) { nanotime(&endtime); timespecsub(&endtime, &starttime); printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, endtime.tv_nsec / 1000000, redo, fs->fs_ncg); } if (sbp == NULL) goto out; /* * Copy allocation information from all the snapshots in * this snapshot and then expunge them from its view. */ TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { if (xp == ip) break; if (xp->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, BLK_SNAP); else error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, BLK_SNAP); if (error) { fs->fs_snapinum[snaploc] = 0; goto done; } } /* * Allocate space for the full list of preallocated snapshot blocks. */ MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); ip->i_snapblklist = &snapblklist[1]; /* * Expunge the blocks used by the snapshots from the set of * blocks marked as used in the snapshot bitmaps. Also, collect * the list of allocated blocks in i_snapblklist. 
*/ if (ip->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); else error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); if (error) { fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } if (snaplistsize < ip->i_snapblklist - snapblklist) panic("ffs_snapshot: list too small"); snaplistsize = ip->i_snapblklist - snapblklist; snapblklist[0] = snaplistsize; ip->i_snapblklist = 0; /* * Write out the list of allocated blocks to the end of the snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)snapblklist; aiov.iov_len = snaplistsize * sizeof(daddr_t); auio.uio_resid = aiov.iov_len;; auio.uio_offset = ip->i_size; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } /* * Write the superblock and its summary information * to the snapshot. */ blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); space = copy_fs->fs_csp; for (loc = 0; loc < len; loc++) { error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); fs->fs_snapinum[snaploc] = 0; FREE(snapblklist, M_UFSMNT); goto done; } bcopy(space, nbp->b_data, fs->fs_bsize); space = (char *)space + fs->fs_bsize; bawrite(nbp); } /* * As this is the newest list, it is the most inclusive, so * should replace the previous list. */ VI_LOCK(devvp); space = sn->sn_blklist; sn->sn_blklist = snapblklist; sn->sn_listsize = snaplistsize; VI_UNLOCK(devvp); if (space != NULL) FREE(space, M_UFSMNT); /* * If another process is currently writing the buffer containing * the inode for this snapshot then a deadlock can occur. Drop * the snapshot lock until the buffer has been written. */ VOP_UNLOCK(vp, 0, td); (void) bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int) fs->fs_bsize, NOCRED, &nbp); brelse(nbp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); done: FREE(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); out: if (saved_nice > 0) { PROC_LOCK(td->td_proc); mtx_lock_spin(&sched_lock); sched_nice(td->td_proc, saved_nice); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); } UFS_LOCK(ump); if (fs->fs_active != 0) { FREE(fs->fs_active, M_DEVBUF); fs->fs_active = 0; } UFS_UNLOCK(ump); mp->mnt_flag = flag; if (error) (void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td); (void) ffs_syncvnode(vp, MNT_WAIT); if (error) vput(vp); else VOP_UNLOCK(vp, 0, td); vn_finished_write(wrtmp); return (error); } /* * Copy a cylinder group map. All the unallocated blocks are marked * BLK_NOCOPY so that the snapshot knows that it need not copy them * if they are later written. If passno is one, then this is a first * pass, so only setting needs to be done. If passno is 2, then this * is a revision to a previous pass which must be undone as the * replacement pass is done. 
*/ static int cgaccount(cg, vp, nbp, passno) int cg; struct vnode *vp; struct buf *nbp; int passno; { struct buf *bp, *ibp; struct inode *ip; struct cg *cgp; struct fs *fs; ufs2_daddr_t base, numblks; int error, len, loc, indiroff; ip = VTOI(vp); fs = ip->i_fs; error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, KERNCRED, &bp); if (error) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (EIO); } UFS_LOCK(ip->i_ump); ACTIVESET(fs, cg); UFS_UNLOCK(ip->i_ump); bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); cgp = (struct cg *)nbp->b_data; bqrelse(bp); if (passno == 2) nbp->b_flags |= B_VALIDSUSPWRT; numblks = howmany(fs->fs_size, fs->fs_frag); len = howmany(fs->fs_fpg, fs->fs_frag); base = cgbase(fs, cg) / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < NDADDR) { for ( ; loc < NDADDR; loc++) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) DIP_SET(ip, i_db[loc], BLK_NOCOPY); else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) DIP_SET(ip, i_db[loc], 0); else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) panic("ffs_snapshot: lost direct block"); } } error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { return (error); } indiroff = (base + loc - NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { return (error); } indiroff = 0; } if (ip->i_ump->um_fstype == UFS1) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); continue; } if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); } if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs1_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. 
*/ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < NDADDR) { blkno = VTOI(snapvp)->i_din1->di_db[lbn]; } else { td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (expungetype == BLK_NOCOPY) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct_ufs1(snapvp, ITOV(cancelip), i, cancelip->i_din1->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs1_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[NIADDR + 2]; ufs1_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs1: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs1: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? 
rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: FREE(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs1_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < NDADDR) { blkp = &ip->i_din1->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs1_daddr_t *)(ibp->b_data)) [(lbn - NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs1: bad block"); *blkp = expungetype; if (lbn >= NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs1_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); } return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. 
*/ static int expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs2_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < NDADDR) { blkno = VTOI(snapvp)->i_din2->di_db[lbn]; } else { td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (expungetype == BLK_NOCOPY) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct_ufs2(snapvp, ITOV(cancelip), i, cancelip->i_din2->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs2_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[NIADDR + 2]; ufs2_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs2: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs2: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. 
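 * That is, getblk() obtains a buffer for the indirect block, b_blkno is
 * filled in by hand from the blkno the caller already knows, and
 * readblock() performs the raw read, so no logical-to-physical lookup is
 * needed even when the block is not in the cache.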
*/ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: FREE(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs2_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < NDADDR) { blkp = &ip->i_din2->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs2_daddr_t *)(ibp->b_data)) [(lbn - NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs2: bad block"); *blkp = expungetype; if (lbn >= NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs2_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); } return (0); } /* * Decrement extra reference on snapshot when last name is removed. 
* It will not be freed until the last open reference goes away. */ void ffs_snapgone(ip) struct inode *ip; { struct inode *xp; struct fs *fs; int snaploc; struct snapdata *sn; struct ufsmount *ump; /* * Find snapshot in incore list. */ xp = NULL; sn = ip->i_devvp->v_rdev->si_snapdata; if (sn != NULL) TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) if (xp == ip) break; if (xp != NULL) vrele(ITOV(ip)); else if (snapdebug) printf("ffs_snapgone: lost snapshot vnode %d\n", ip->i_number); /* * Delete snapshot inode from superblock. Keep list dense. */ fs = ip->i_fs; ump = ip->i_ump; UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } UFS_UNLOCK(ump); } /* * Prepare a snapshot file for being removed. */ void ffs_snapremove(vp) struct vnode *vp; { struct inode *ip; struct vnode *devvp; struct lock *lkp; struct buf *ibp; struct fs *fs; struct thread *td = curthread; ufs2_daddr_t numblks, blkno, dblk, *snapblklist; int error, loc, last; struct snapdata *sn; ip = VTOI(vp); fs = ip->i_fs; devvp = ip->i_devvp; sn = devvp->v_rdev->si_snapdata; /* * If active, delete from incore list (this snapshot may * already have been in the process of being deleted, so * would not have been active). * * Clear copy-on-write flag if last snapshot. */ if (ip->i_nextsnap.tqe_prev != 0) { lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td); VI_LOCK(devvp); TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); ip->i_nextsnap.tqe_prev = 0; lkp = vp->v_vnlock; vp->v_vnlock = &vp->v_lock; lockmgr(lkp, LK_RELEASE, NULL, td); if (TAILQ_FIRST(&sn->sn_head) != 0) { VI_UNLOCK(devvp); } else { snapblklist = sn->sn_blklist; sn->sn_blklist = 0; sn->sn_listsize = 0; devvp->v_rdev->si_snapdata = NULL; devvp->v_vflag &= ~VV_COPYONWRITE; lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td); lockmgr(lkp, LK_RELEASE, NULL, td); lockdestroy(lkp); free(sn, M_UFSMNT); FREE(snapblklist, M_UFSMNT); } } /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). 
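 * A claim is recognizable because the stored address equals the logical
 * block number converted to fragments, hence the
 * dblk == blkstofrags(fs, blkno) tests below.  As an illustration with
 * hypothetical geometry (fs_frag == 8), direct block 5 is claimed only if
 * its pointer reads back as blkstofrags(fs, 5) == 40; such blocks are
 * offered to ffs_snapblkfree() so another snapshot may take them over.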
*/ for (blkno = 1; blkno < NDADDR; blkno++) { dblk = DIP(ip, i_db[blkno]); if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) DIP_SET(ip, i_db[blkno], 0); else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, ip->i_number))) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - btodb(fs->fs_bsize)); DIP_SET(ip, i_db[blkno], 0); } } numblks = howmany(ip->i_size, fs->fs_bsize); for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) continue; if (fs->fs_size - blkno > NINDIR(fs)) last = NINDIR(fs); else last = fs->fs_size - blkno; for (loc = 0; loc < last; loc++) { if (ip->i_ump->um_fstype == UFS1) { dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, ip->i_number))) { ip->i_din1->di_blocks -= btodb(fs->fs_bsize); ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; } continue; } dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, ip->i_number))) { ip->i_din2->di_blocks -= btodb(fs->fs_bsize); ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; } } bawrite(ibp); } /* * Clear snapshot flag and drop reference. */ ip->i_flags &= ~SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assurred that they will all see the same unmodified * image. When deleting a snapshot file (see ffs_snapremove above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because they * must always have been allocated from a BLK_NOCOPY location. */ int ffs_snapblkfree(fs, devvp, bno, size, inum) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; { struct buf *ibp, *cbp, *savedcbp = 0; struct thread *td = curthread; struct inode *ip; struct vnode *vp = NULL; ufs_lbn_t lbn; ufs2_daddr_t blkno; int indiroff = 0, error = 0, claimedblk = 0; struct snapdata *sn; lbn = fragstoblks(fs, bno); retry: VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return (0); } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp), td) != 0) goto retry; TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); /* * Lookup block being written. 
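 * Direct blocks (lbn < NDADDR) are read straight from the inode; for
 * larger lbns the containing indirect block is fetched with BA_METAONLY
 * and the pointer is picked out at index (lbn - NDADDR) % NINDIR(fs).
 * As a worked example with hypothetical geometry (NDADDR == 12,
 * NINDIR(fs) == 2048), lbn 5000 sits at index (5000 - 12) % 2048 == 892
 * of its indirect block.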
*/ if (lbn < NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); if (ip->i_ump->um_fstype == UFS1) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; } /* * Check to see if block needs to be copied. */ if (blkno == 0) { /* * A block that we map is being freed. If it has not * been claimed yet, we will claim or copy it (below). */ claimedblk = 1; } else if (blkno == BLK_SNAP) { /* * No previous snapshot claimed the block, * so it will be freed and become a BLK_NOCOPY * (don't care) for us. */ if (claimedblk) panic("snapblkfree: inconsistent block type"); if (lbn < NDADDR) { DIP_SET(ip, i_db[lbn], BLK_NOCOPY); ip->i_flag |= IN_CHANGE | IN_UPDATE; } else if (ip->i_ump->um_fstype == UFS1) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } continue; } else /* BLK_NOCOPY or default */ { /* * If the snapshot has already copied the block * (default), or does not care about the block, * it is not needed. */ if (lbn >= NDADDR) bqrelse(ibp); continue; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. */ if (size == fs->fs_bsize) { #ifdef DEBUG if (snapdebug) printf("%s %d lbn %jd from inum %d\n", "Grabonremove: snapino", ip->i_number, (intmax_t)lbn, inum); #endif if (lbn < NDADDR) { DIP_SET(ip, i_db[lbn], bno); } else if (ip->i_ump->um_fstype == UFS1) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); ip->i_flag |= IN_CHANGE | IN_UPDATE; lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); return (1); } if (lbn >= NDADDR) bqrelse(ibp); /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. */ td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n", "Copyonremove: snapino ", ip->i_number, (intmax_t)lbn, "for inum", inum, size, (intmax_t)cbp->b_blkno); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. 
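 * The savedcbp flushed below holds the old contents read for the first
 * snapshot that needed a copy; any later snapshots were satisfied by
 * bcopy()ing from it, so the device is read at most once per copied block.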
*/ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if (dopersistence && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); } /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); return (error); } /* * Associate snapshot files when mounting. */ void ffs_snapshot_mount(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *devvp = ump->um_devvp; struct fs *fs = ump->um_fs; struct thread *td = curthread; struct snapdata *sn; struct vnode *vp; struct inode *ip; struct uio auio; struct iovec aiov; void *snapblklist; char *reason; daddr_t snaplistsize; int error, snaploc, loc; /* * XXX The following needs to be set before ffs_truncate or * VOP_READ can be called. */ mp->mnt_stat.f_iosize = fs->fs_bsize; /* * Process each snapshot listed in the superblock. */ vp = NULL; sn = devvp->v_rdev->si_snapdata; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], LK_EXCLUSIVE, &vp)) != 0){ printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size == lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { if ((ip->i_flags & SF_SNAPSHOT) == 0) { reason = "non-snapshot"; } else { reason = "old format snapshot"; (void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td); (void)ffs_syncvnode(vp, MNT_WAIT); } printf("ffs_snapshot_mount: %s inode %d\n", reason, fs->fs_snapinum[snaploc]); vput(vp); vp = NULL; for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } /* * If there already exist snapshots on this filesystem, grab a * reference to their shared lock. If this is the first snapshot * on this filesystem, we need to allocate a lock for the * snapshots to share. In either case, acquire the snapshot * lock and give up our original private lock. */ VI_LOCK(devvp); if (sn != NULL) { VI_UNLOCK(devvp); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; } else { VI_UNLOCK(devvp); sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; devvp->v_rdev->si_snapdata = sn; } lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(vp), td); transferlockers(&vp->v_lock, vp->v_vnlock); lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); /* * Link it onto the active snapshot list. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot_mount: %d already on list", ip->i_number); else TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); vp->v_vflag |= VV_SYSTEM; VI_UNLOCK(devvp); VOP_UNLOCK(vp, 0, td); } /* * No usable snapshots found. */ if (vp == NULL) return; /* * Allocate the space for the block hints list. We always want to * use the list from the newest snapshot. 
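 * The list is stored just past the end of the snapshot image, at byte
 * offset lblktosize(fs, howmany(fs_size, fs_frag)).  The first daddr_t
 * there is the list length itself, which is why the second read below
 * backs uio_offset up by sizeof(snaplistsize) and then pulls in
 * snaplistsize * sizeof(daddr_t) bytes, count word included.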
*/ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)&snaplistsize; aiov.iov_len = sizeof(snaplistsize); auio.uio_resid = aiov.iov_len; auio.uio_offset = lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_1 failed %d\n", error); VOP_UNLOCK(vp, 0, td); return; } MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); auio.uio_iovcnt = 1; aiov.iov_base = snapblklist; aiov.iov_len = snaplistsize * sizeof (daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset -= sizeof(snaplistsize); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_2 failed %d\n", error); VOP_UNLOCK(vp, 0, td); FREE(snapblklist, M_UFSMNT); return; } VOP_UNLOCK(vp, 0, td); VI_LOCK(devvp); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); sn->sn_listsize = snaplistsize; sn->sn_blklist = (daddr_t *)snapblklist; devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); } /* * Disassociate snapshot files when unmounting. */ void ffs_snapshot_unmount(mp) struct mount *mp; { struct vnode *devvp = VFSTOUFS(mp)->um_devvp; struct snapdata *sn; struct inode *xp; struct vnode *vp; sn = devvp->v_rdev->si_snapdata; VI_LOCK(devvp); while ((xp = TAILQ_FIRST(&sn->sn_head)) != 0) { vp = ITOV(xp); vp->v_vnlock = &vp->v_lock; TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); xp->i_nextsnap.tqe_prev = 0; if (xp->i_effnlink > 0) { VI_UNLOCK(devvp); vrele(vp); VI_LOCK(devvp); } } devvp->v_rdev->si_snapdata = NULL; devvp->v_vflag &= ~VV_COPYONWRITE; VI_UNLOCK(devvp); if (sn->sn_blklist != NULL) { FREE(sn->sn_blklist, M_UFSMNT); sn->sn_blklist = NULL; sn->sn_listsize = 0; } lockdestroy(&sn->sn_lock); free(sn, M_UFSMNT); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); } /* * Check for need to copy block that is about to be written, * copying the block if necessary. */ int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct buf *ibp, *cbp, *savedcbp = 0; struct thread *td = curthread; struct fs *fs; struct inode *ip; struct vnode *vp = 0; ufs2_daddr_t lbn, blkno, *snapblklist; int lower, upper, mid, indiroff, error = 0; int launched_async_io, prev_norunningbuf; if ((VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0) return (0); /* Update on a snapshot file */ if (td->td_pflags & TDP_COWINPROGRESS) panic("ffs_copyonwrite: recursive call"); /* * First check to see if it is in the preallocated list. * By doing this check we avoid several potential deadlocks. */ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) { VI_UNLOCK(devvp); return (0); /* No snapshot */ } ip = TAILQ_FIRST(&sn->sn_head); fs = ip->i_fs; lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) { VI_UNLOCK(devvp); return (0); } launched_async_io = 0; prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; /* * Since I/O on bp isn't yet in progress and it may be blocked * for a long time waiting on snaplk, back it out of * runningbufspace, possibly waking other threads waiting for space. */ runningbufwakeup(bp); /* * Not in the precomputed list, so check the snapshots. 
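 * The check above is a binary search of the sorted sn_blklist; entry 0
 * holds the list length (see ffs_snapshot_mount), so the search starts at
 * index 1.  A hit means no copy is required for this logical block and
 * the write proceeds immediately, avoiding the snapshot lock and the
 * per-snapshot lookups below.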
*/ while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp), td) != 0) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) { VI_UNLOCK(devvp); if (bp->b_runningbufspace) atomic_add_int(&runningbufspace, bp->b_runningbufspace); return (0); /* Snapshot gone */ } } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in UFS_BALLOC. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. We do not have * to hold the snapshot lock while doing this lookup as it * will never require any additional allocations for the * snapshot inode. */ if (lbn < NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); if (ip->i_ump->um_fstype == UFS1) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; bqrelse(ibp); } #ifdef DIAGNOSTIC if (blkno == BLK_SNAP && bp->b_lblkno >= 0) panic("ffs_copyonwrite: bad copy block"); #endif if (blkno != 0) continue; /* * Allocate the block into which to do the copy. Since * multiple processes may all try to copy the same block, * we have to recheck our need to do a copy if we sleep * waiting for the lock. * * Because all snapshots on a filesystem share a single * lock, we ensure that we will never be in competition * with another process to allocate a block. */ td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) { printf("Copyonwrite: snapino %d lbn %jd for ", ip->i_number, (intmax_t)lbn); if (bp->b_vp == devvp) printf("fs metadata"); else printf("inum %d", VTOI(bp->b_vp)->i_number); printf(" lblkno %jd to blkno %jd\n", (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. 
*/ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if (dopersistence && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; } lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | prev_norunningbuf; if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) waitrunningbufspace(); /* * I/O on bp will now be started, so count it in runningbufspace. */ if (bp->b_runningbufspace) atomic_add_int(&runningbufspace, bp->b_runningbufspace); return (error); } /* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). */ static int readblock(vp, bp, lbn) struct vnode *vp; struct buf *bp; ufs2_daddr_t lbn; { struct inode *ip = VTOI(vp); struct bio *bip; bip = g_alloc_bio(); bip->bio_cmd = BIO_READ; bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); bip->bio_data = bp->b_data; bip->bio_length = bp->b_bcount; g_io_request(bip, ip->i_devvp->v_bufobj.bo_private); do msleep(bip, NULL, PRIBIO, "snaprdb", hz/10); while (!(bip->bio_flags & BIO_DONE)); bp->b_error = bip->bio_error; g_destroy_bio(bip); return (bp->b_error); } #endif Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c (revision 154151) +++ head/sys/ufs/ffs/ffs_vfsops.c (revision 154152) @@ -1,1740 +1,1744 @@ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; static int ffs_sbupdate(struct ufsmount *, int); static int ffs_reload(struct mount *, struct thread *); static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_vptofh_t ffs_vptofh; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_vptofh = ffs_vptofh, }; VFS_SET(ufs_vfsops, ufs, 0); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, }; static const char *ffs_opts[] = { "acls", "async", "atime", "clusterr", "clusterw", "exec", "errmsg", "export", "force", "from", "multilabel", "snapshot", "suid", "suiddir", "symfollow", "sync", "update", "union", NULL }; static int ffs_mount(struct mount *mp, struct thread *td) { struct vnode *devvp; struct ufsmount *ump = 0; struct fs *fs; int error, flags; mode_t accessmode; struct nameidata ndp; struct export_args export; char *fspec; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mp->mnt_flag |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "async", NULL, NULL) == 0) mp->mnt_flag |= MNT_ASYNC; if (vfs_getopt(mp->mnt_optnew, "force", NULL, NULL) == 0) mp->mnt_flag |= MNT_FORCE; if (vfs_getopt(mp->mnt_optnew, "multilabel", NULL, NULL) == 0) mp->mnt_flag |= MNT_MULTILABEL; if (vfs_getopt(mp->mnt_optnew, "noasync", NULL, NULL) == 0) mp->mnt_flag &= ~MNT_ASYNC; if (vfs_getopt(mp->mnt_optnew, "noatime", NULL, NULL) == 0) mp->mnt_flag |= MNT_NOATIME; if (vfs_getopt(mp->mnt_optnew, "noclusterr", NULL, NULL) == 0) mp->mnt_flag |= MNT_NOCLUSTERR; if (vfs_getopt(mp->mnt_optnew, "noclusterw", NULL, NULL) == 0) mp->mnt_flag |= MNT_NOCLUSTERW; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) mp->mnt_flag |= MNT_SNAPSHOT; if (vfs_getopt(mp->mnt_optnew, "update", NULL, 
NULL) == 0) mp->mnt_flag |= MNT_UPDATE; export.ex_root = -2; /* DEFAULT_ROOTID */ if (mp->mnt_flag & MNT_RDONLY) export.ex_flags = MNT_EXRDONLY; else export.ex_flags = 0; /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); /* * Flush any dirty data. */ if ((error = ffs_sync(mp, MNT_WAIT, td)) != 0) { vn_finished_write(mp); return (error); } /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (mp->mnt_flag & MNT_SOFTDEP) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vn_finished_write(mp); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: %s: blocks %jd files %d\n", fs->fs_fsmnt, "update error", (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vn_finished_write(mp); return (error); } vn_finished_write(mp); DROP_GIANT(); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); fs->fs_ronly = 1; mp->mnt_flag |= MNT_RDONLY; } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ if (suser(td)) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if ((error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td)) != 0) { VOP_UNLOCK(devvp, 0, td); return (error); } VOP_UNLOCK(devvp, 0, td); } fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not %s\n", fs->fs_fsmnt, "properly dismounted"); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); return (EPERM); } } DROP_GIANT(); g_topology_lock(); /* * If we're the root device, we may not have an E count * yet, get it now. */ if (ump->um_cp->ace == 0) error = g_access(ump->um_cp, 0, 1, 1); else error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); fs->fs_ronly = 0; mp->mnt_flag &= ~MNT_RDONLY; fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT)) != 0) { vn_finished_write(mp); return (error); } /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ vn_finished_write(mp); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vn_finished_write(mp); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (mp->mnt_flag & MNT_SOFTDEP) mp->mnt_flag &= ~MNT_ASYNC; /* * Keep MNT_ACLS flag if it is stored in superblock. 
*/ if ((fs->fs_flags & FS_ACLS) != 0) mp->mnt_flag |= MNT_ACLS; /* * If not updating name, process export requests. */ error = 0; if (vfs_getopt(mp->mnt_optnew, "export", NULL, NULL) == 0) { error = vfs_copyopt(mp->mnt_optnew, "export", &export, sizeof export); } if (error == 0 && export.ex_flags != 0) return (vfs_export(mp, &export)); /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, fspec)); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(&ndp)) != 0) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ if (suser(td)) { accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; if ((error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td))!= 0){ vput(devvp); return (error); } } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } } vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { struct ufs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof args.export); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ffs_reload(struct mount *mp, struct thread *td) { - struct vnode *vp, *nvp, *devvp; + struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, size, error; int32_t *lp; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); ump = VFSTOUFS(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if (vinvalbuf(devvp, 0, td, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp, 0, td); /* * Step 2: re-read superblock from disk. 
*/ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ newfs->fs_csp = fs->fs_csp; newfs->fs_maxcluster = fs->fs_maxcluster; newfs->fs_contigdirs = fs->fs_contigdirs; newfs->fs_active = fs->fs_active; /* The file system is still read-only. */ newfs->fs_ronly = 1; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: reload pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { lp = fs->fs_maxcluster; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } loop: MNT_ILOCK(mp); - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + MNT_VNODE_FOREACH_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, td, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0, td); vrele(vp); + MNT_VNODE_FOREACH_ABORT(mp, mvp); return (error); } ffs_load_inode(bp, ip, fs, ip->i_number); ip->i_effnlink = ip->i_nlink; brelse(bp); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; /* * Common code for mount and mountroot */ static int ffs_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct buf *bp; struct fs *fs; struct cdev *dev; void *space; ufs2_daddr_t sblockloc; int error, i, blks, size, ronly; int32_t *lp; struct ucred *cred; struct g_consumer *cp; dev = devvp->v_rdev; cred = td ? td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); /* * If we are a root mount, drop the E flag so fsck can do its magic. * We will pick it up again when we remount R/W. 
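 * g_access() takes deltas for the consumer's read, write and exclusive
 * counts, so g_access(cp, 0, 0, -1) below gives back only the exclusive
 * bit taken by g_vfs_open(); the device stays open read-only while fsck
 * is free to open it for writing.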
*/ if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS)) error = g_access(cp, 0, 0, -1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0, td); if (error) return (error); if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; devvp->v_bufobj.bo_private = cp; devvp->v_bufobj.bo_ops = &ffs_ops; bp = NULL; ump = NULL; fs = NULL; sblockloc = 0; /* * Try reading the superblock in each of its possible locations. */ for (i = 0; sblock_try[i] != -1; i++) { if ((error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE, cred, &bp)) != 0) goto out; fs = (struct fs *)bp->b_data; sblockloc = sblock_try[i]; if ((fs->fs_magic == FS_UFS1_MAGIC || (fs->fs_magic == FS_UFS2_MAGIC && (fs->fs_sblockloc == sblockloc || (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) && fs->fs_bsize <= MAXBSIZE && fs->fs_bsize >= sizeof(struct fs)) break; brelse(bp); bp = NULL; } if (sblock_try[i] == -1) { error = EINVAL; /* XXX needs translation */ goto out; } fs->fs_fmod = 0; fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indicies */ fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: mount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBLOCKSIZE) bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); bp = NULL; fs = ump->um_fs; ffs_oldfscompat_read(fs, ump, sblockloc); fs->fs_ronly = ronly; size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, cred, &bp)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg 
* sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); fs->fs_active = NULL; mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || vfs_getvfs(&mp->mnt_stat.f_fsid)) vfs_getnewfsid(mp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; mp->mnt_flag |= MNT_LOCAL; if ((fs->fs_flags & FS_MULTILABEL) != 0) #ifdef MAC mp->mnt_flag |= MNT_MULTILABEL; #else printf( "WARNING: %s: multilabel flag on fs but no MAC support\n", fs->fs_fsmnt); #endif if ((fs->fs_flags & FS_ACLS) != 0) #ifdef UFS_ACL mp->mnt_flag |= MNT_ACLS; #else printf( "WARNING: %s: ACLs flag on fs but no ACLs support\n", fs->fs_fsmnt); #endif ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); if( mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT); } /* * Initialize filesystem stat information in mount struct. */ #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ #ifndef QUOTA mp->mnt_kern_flag |= MNTK_MPSAFE; #endif return (0); out: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); } if (ump) { mtx_destroy(UFS_MTX(ump)); free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; } return (error); } #include static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. 
*/ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags; flags = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("ffs_unmount: ufs_extattr_stop returned %d\n", error); } else { ufs_extattr_uepm_destroy(&ump->um_extattr); } #endif if (mp->mnt_flag & MNT_SOFTDEP) { if ((error = softdep_flushfiles(mp, flags, td)) != 0) return (error); } else { if ((error = ffs_flushfiles(mp, flags, td)) != 0) return (error); } fs = ump->um_fs; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: unmount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT); if (error) { fs->fs_clean = 0; return (error); } } DROP_GIANT(); g_topology_lock(); g_vfs_close(ump->um_cp, td); g_topology_unlock(); PICKUP_GIANT(); vrele(ump->um_devvp); mtx_destroy(UFS_MTX(ump)); free(fs->fs_csp, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Flush out all the files in a filesystem. 
*/ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int error; ump = VFSTOUFS(mp); #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP) continue; quotaoff(td, mp, i); } /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Flush all the files. */ if ((error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp, 0, td); return (error); } /* * Get filesystem statistics. */ static int ffs_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = NAME_MAX; return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ffs_sync(mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { - struct vnode *nvp, *vp, *devvp; + struct vnode *mvp, *vp, *devvp; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, wait, lockreq, allerror = 0; struct bufobj *bo; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("ffs_sync: rofs mod"); } /* * Write back each (modified) inode. */ wait = 0; lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_WAIT) { wait = 1; lockreq = LK_EXCLUSIVE; } lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; MNT_ILOCK(mp); loop: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { /* * Depend on the mntvnode_slock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. 
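 * The marker vnode (mvp) passed to MNT_VNODE_FOREACH keeps our place on
 * the mount's vnode list across the points where the interlock is
 * dropped; whenever the scan is abandoned early (the goto loop cases) the
 * MNT_VNODE_FOREACH_ABORT*() calls pull the marker back off the list
 * before the iteration is restarted.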
*/ VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if (vp->v_type == VNON || ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0)) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); if ((error = vget(vp, lockreq, td)) != 0) { MNT_ILOCK(mp); - if (error == ENOENT || error == ENOLCK) + if (error == ENOENT || error == ENOLCK) { + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; + } continue; } if ((error = ffs_syncvnode(vp, waitfor)) != 0) allerror = error; vput(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) { MNT_ILOCK(mp); goto loop; } } #ifdef QUOTA qsync(mp); #endif devvp = ump->um_devvp; VI_LOCK(devvp); bo = &devvp->v_bufobj; if (waitfor != MNT_LAZY && (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td); if ((error = VOP_FSYNC(devvp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(devvp, 0, td); if (allerror == 0 && waitfor == MNT_WAIT) { MNT_ILOCK(mp); goto loop; } } else VI_UNLOCK(devvp); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor)) != 0) allerror = error; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; struct cdev *dev; int error; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); dev = ump->um_dev; fs = ump->um_fs; /* * If this MALLOC() is performed after the getnewvnode() * it might block, leaving a vnode with a NULL v_data to be * found by ffs_sync() if a sync happens to fire right then, * which will cause a panic because ffs_sync() blindly * dereferences vp->v_data (as well it should). */ ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if (fs->fs_magic == FS_UFS1_MAGIC) error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp); else error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree(uma_inode, ip); return (error); } /* * FFS supports recursive and shared locking. */ vp->v_vnlock->lk_flags |= LK_CANRECURSE; vp->v_vnlock->lk_flags &= ~LK_NOSHARE; vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_fs = fs; ip->i_dev = dev; ip->i_number = ino; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. 
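 * ino_to_fsba() yields the filesystem block holding the on-disk inode and
 * ino_to_fsbo() its slot within that block.  As a rough illustration with
 * hypothetical geometry (16K blocks, 256-byte UFS2 dinodes, 64 inodes per
 * block), inode 100 would be slot 100 % 64 == 36 of its inode block.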
*/ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } if (ip->i_ump->um_fstype == UFS1) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); ffs_load_inode(bp, ip, fs, ino); if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if (ip->i_ump->um_fstype == UFS1) error = ufs_vinit(mp, &ffs_fifoops1, &vp); else error = ufs_vinit(mp, &ffs_fifoops2, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = arc4random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { ip->i_flag |= IN_MODIFIED; DIP_SET(ip, i_gen, ip->i_gen); } } /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. */ if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_uid = ip->i_din1->di_ouid; /* XXX */ ip->i_gid = ip->i_din1->di_ogid; /* XXX */ } /* XXX */ #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_associate_vnode_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { struct ufid *ufhp; struct fs *fs; ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); return (ufs_fhtovp(mp, ufhp, vpp)); } /* * Vnode pointer to File handle */ /* ARGSUSED */ static int ffs_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { struct inode *ip; struct ufid *ufhp; ip = VTOI(vp); ufhp = (struct ufid *)fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); return (ret); } /* * Write a superblock and associated information back to disk. 
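[Editor's sketch, not part of the change] ffs_vptofh() packs only the inode number and generation into the NFS file handle, and ffs_fhtovp() re-validates the number against the superblock range before handing off to ufs_fhtovp(), where the generation is compared. A small user-space version of that round trip; struct myfid and the EXAMPLE_* constants are made up for illustration, the real bounds come from ROOTINO and fs_ncg * fs_ipg.

	#include <errno.h>
	#include <stdio.h>

	#define EXAMPLE_ROOTINO	2		/* first valid inode number */
	#define EXAMPLE_MAXINO	(64 * 1024)	/* stands in for ncg * ipg */

	struct myfid {
		unsigned long	fid_ino;
		unsigned long	fid_gen;
	};

	static int
	handle_to_inode(const struct myfid *fh, unsigned long gen_on_disk)
	{
		/* Range check first, as ffs_fhtovp() does. */
		if (fh->fid_ino < EXAMPLE_ROOTINO || fh->fid_ino >= EXAMPLE_MAXINO)
			return (ESTALE);
		/* A mismatched generation means the inode was freed and reused. */
		if (fh->fid_gen != gen_on_disk)
			return (ESTALE);
		return (0);
	}

	int
	main(void)
	{
		struct myfid fh = { .fid_ino = 100, .fid_gen = 12345 };

		printf("fresh handle: %d\n", handle_to_inode(&fh, 12345));
		printf("reused inode: %d\n", handle_to_inode(&fh, 99999));
		return (0);
	}

This is also why ffs_vget() forces a nonzero i_gen with arc4random() on old filesystems: a zero generation would defeat the staleness check.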
*/ static int ffs_sbupdate(mp, waitfor) struct ufsmount *mp; int waitfor; { struct fs *fs = mp->um_fs; struct buf *sbbp; struct buf *bp; int blks; void *space; int i, size, error, allerror = 0; if (fs->fs_ronly == 1 && (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE)) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * First write back the summary information. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0, 0); bcopy(space, bp->b_data, (u_int)size); space = (char *)space + size; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; } /* * Now write back the superblock itself. If any errors occurred * up to this point, then fail so that the superblock avoids * being written out as clean. */ if (allerror) { brelse(sbbp); return (allerror); } bp = sbbp; if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("%s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("%s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } fs->fs_fmod = 0; fs->fs_time = time_second; bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, mp); if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; return (allerror); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname, struct thread *td) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* Grab an extra reference to be dropped by the bufdone() below. */ bufobj_wrefl(bufobj); BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ if (LIST_FIRST(&bp->b_dep) != NULL) buf_complete(bp); #ifdef SOFTUPDATES if (LIST_FIRST(&bp->b_dep) != NULL) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. 
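[Editor's sketch, not part of the change] The summary-information loop in ffs_sbupdate() writes fs_cssize bytes in full-block chunks of fs_frag fragments, trimming only the final chunk. The arithmetic is easy to get wrong, so here is a stand-alone version with example geometry (16k blocks, 2k fragments, 11 fragments of summary data); the numbers are illustrative only.

	#include <stdio.h>

	#define howmany(x, y)	(((x) + ((y) - 1)) / (y))

	int
	main(void)
	{
		int fsize = 2048, bsize = 16384, frag = bsize / fsize;
		int cssize = 11 * 2048;			/* bytes of summary info */
		int blks = howmany(cssize, fsize);	/* fragments to write */
		int i, size;

		for (i = 0; i < blks; i += frag) {
			size = bsize;
			if (i + frag > blks)
				size = (blks - i) * fsize;	/* short final chunk */
			printf("write %d bytes starting at fragment %d\n", size, i);
		}
		return (0);
	}

With these numbers the loop issues one 16384-byte write followed by one 6144-byte write, covering exactly cssize bytes.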
*/ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { int oldflags, s; struct buf *newbp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (BUF_REFCNT(bp) == 0) panic("bufwrite: buffer is not busy???"); s = splbio(); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); splx(s); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } BO_UNLOCK(bp->b_bufobj); /* Mark the buffer clean */ bundirty(bp); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize); /* * set it to be identical to the old block. We have to * set b_lblkno and BKGRDMARKER before calling bgetvp() * to avoid confusing the splay tree and gbincore(). */ memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); newbp->b_lblkno = bp->b_lblkno; newbp->b_xflags |= BX_BKGRDMARKER; BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; bgetvp(bp->b_vp, newbp); BO_UNLOCK(bp->b_bufobj); newbp->b_bufobj = &bp->b_vp->v_bufobj; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; #ifdef SOFTUPDATES /* move over the dependencies */ if (LIST_FIRST(&bp->b_dep) != NULL) softdep_move_dependencies(bp, newbp); #endif /* * Initiate write on the copy, release the original to * the B_LOCKED queue so that it cannot go away until * the background write completes. If not locked it could go * away and then be reconstituted while it was being written. * If the reconstituted buffer were written, we could end up * with two background copies being written at the same time. 
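[Editor's sketch, not part of the change] The core of ffs_bufwrite() is a decision: if the buffer is eligible for background writing and the caller is asynchronous, clone the data, mark the original BV_BKGRDINPROG, and write the clone so the original stays usable; a synchronous caller that finds a background write in flight must wait for it. The sketch below is a simplified, single-threaded illustration of that decision only; struct mybuf and the MYB_* flags track the kernel names loosely, and the real code does the waiting with msleep() and the completion in ffs_backgroundwritedone().

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define MYB_ASYNC	0x01	/* caller does not wait for the write */
	#define MYB_BKGRDWRITE	0x02	/* block may be written in the background */
	#define MYB_BKGRDINPROG	0x04	/* a background copy is in flight */

	struct mybuf {
		int	flags;
		size_t	size;
		char	*data;
	};

	static void
	issue_write(const struct mybuf *bp, const char *how)
	{
		printf("%s write of %zu bytes\n", how, bp->size);
	}

	static void
	my_bufwrite(struct mybuf *bp)
	{
		struct mybuf copy;

		if ((bp->flags & MYB_BKGRDINPROG) != 0 &&
		    (bp->flags & MYB_ASYNC) == 0) {
			/* Synchronous caller: would sleep until the copy lands. */
			printf("waiting for background write to finish\n");
			bp->flags &= ~MYB_BKGRDINPROG;
		}
		if ((bp->flags & (MYB_BKGRDWRITE | MYB_ASYNC)) ==
		    (MYB_BKGRDWRITE | MYB_ASYNC)) {
			/* Clone and write the clone; the original stays usable. */
			copy = *bp;
			copy.data = malloc(bp->size);
			memcpy(copy.data, bp->data, bp->size);
			bp->flags |= MYB_BKGRDINPROG;
			issue_write(&copy, "background");
			free(copy.data);
			return;
		}
		issue_write(bp, "foreground");
	}

	int
	main(void)
	{
		char payload[512] = "dirty block";
		struct mybuf bp =
		    { MYB_BKGRDWRITE | MYB_ASYNC, sizeof(payload), payload };

		my_bufwrite(&bp);	/* takes the background path */
		bp.flags &= ~MYB_ASYNC;
		my_bufwrite(&bp);	/* sync caller waits, then writes */
		return (0);
	}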
*/ bqrelse(bp); bp = newbp; } /* Let the normal bufwrite do the rest for us */ return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; int error; vp = bo->__bo_vnode; if (bp->b_iocmd == BIO_WRITE) { #ifdef SOFTUPDATES if (LIST_FIRST(&bp->b_dep) != NULL) buf_start(bp); #endif if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); bp->b_flags &= ~B_VALIDSUSPWRT; if ((vp->v_vflag & VV_COPYONWRITE) && vp->v_rdev->si_snapdata != NULL && (error = (ffs_copyonwrite)(vp, bp)) != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } g_vfs_strategy(bo, bp); } Index: head/sys/ufs/ufs/ufs_quota.c =================================================================== --- head/sys/ufs/ufs/ufs_quota.c (revision 154151) +++ head/sys/ufs/ufs/ufs_quota.c (revision 154152) @@ -1,1041 +1,1047 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_security_bsd); static int unprivileged_get_quota = 0; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_get_quota, CTLFLAG_RW, &unprivileged_get_quota, 0, "Unprivileged processes may retrieve quotas for other uids and gids"); static MALLOC_DEFINE(M_DQUOT, "ufs_quota", "UFS quota entries"); /* * Quota name to error message mapping. 
*/ static char *quotatypes[] = INITQFNAMES; static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int); static int chkiqchg(struct inode *, ino_t, struct ucred *, int); static int dqget(struct vnode *, u_long, struct ufsmount *, int, struct dquot **); static int dqsync(struct vnode *, struct dquot *); static void dqflush(struct vnode *); #ifdef DIAGNOSTIC static void dqref(struct dquot *); static void chkdquot(struct inode *); #endif /* * Set up the quotas for an inode. * * This routine completely defines the semantics of quotas. * If other criterion want to be used to establish quotas, the * MAXQUOTAS value in quotas.h should be increased, and the * additional dquots set up here. */ int getinoquota(ip) struct inode *ip; { struct ufsmount *ump; struct vnode *vp = ITOV(ip); int error; ump = VFSTOUFS(vp->v_mount); /* * Set up the user quota based on file uid. * EINVAL means that quotas are not enabled. */ if (ip->i_dquot[USRQUOTA] == NODQUOT && (error = dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) && error != EINVAL) return (error); /* * Set up the group quota based on file gid. * EINVAL means that quotas are not enabled. */ if (ip->i_dquot[GRPQUOTA] == NODQUOT && (error = dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) && error != EINVAL) return (error); return (0); } /* * Update disk usage, and take corrective action. */ int chkdq(ip, change, cred, flags) struct inode *ip; ufs2_daddr_t change; struct ucred *cred; int flags; { struct dquot *dq; ufs2_daddr_t ncurblocks; int i, error; #ifdef DIAGNOSTIC if ((flags & CHOWN) == 0) chkdquot(ip); #endif if (change == 0) return (0); if (change < 0) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "chkdq1", 0); } ncurblocks = dq->dq_curblocks + change; if (ncurblocks >= 0) dq->dq_curblocks = ncurblocks; else dq->dq_curblocks = 0; dq->dq_flags &= ~DQ_BLKS; dq->dq_flags |= DQ_MOD; } return (0); } if ((flags & FORCE) == 0 && suser_cred(cred, 0)) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; error = chkdqchg(ip, change, cred, i); if (error) return (error); } } for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "chkdq2", 0); } /* Reset timer when crossing soft limit */ if (dq->dq_curblocks + change >= dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_btime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_btime[i]; dq->dq_curblocks += change; dq->dq_flags |= DQ_MOD; } return (0); } /* * Check for a valid change to a users allocation. * Issue an error message if appropriate. */ static int chkdqchg(ip, change, cred, type) struct inode *ip; ufs2_daddr_t change; struct ucred *cred; int type; { struct dquot *dq = ip->i_dquot[type]; ufs2_daddr_t ncurblocks = dq->dq_curblocks + change; /* * If user would exceed their hard limit, disallow space allocation. */ if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) { if ((dq->dq_flags & DQ_BLKS) == 0 && ip->i_uid == cred->cr_uid) { uprintf("\n%s: write failed, %s disk limit reached\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type]); dq->dq_flags |= DQ_BLKS; } return (EDQUOT); } /* * If user is over their soft limit for too long, disallow space * allocation. Reset time limit as they cross their soft limit. 
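[Editor's sketch, not part of the change] chkdqchg() implements the usual two-tier quota policy: a hard limit can never be crossed, while crossing a soft limit only starts a grace timer that turns into a hard refusal once it expires. A stand-alone version of just that policy; the function name, the one-week grace period, and the sample numbers are illustrative, the real values live in struct dquot and ump->um_btime.

	#include <errno.h>
	#include <stdio.h>
	#include <time.h>

	#define GRACE_PERIOD	(7 * 24 * 60 * 60)	/* example: one week */

	/*
	 * Return 0 if "change" more blocks may be charged, EDQUOT otherwise.
	 * "*btime" is the running grace-period deadline, as in dq_btime.
	 */
	static int
	check_block_quota(long cur, long change, long soft, long hard,
	    time_t *btime, time_t now)
	{
		long ncur = cur + change;

		if (hard != 0 && ncur >= hard)
			return (EDQUOT);		/* hard limit: always refuse */
		if (soft != 0 && ncur >= soft) {
			if (cur < soft) {
				/* Just crossed the soft limit: start the clock. */
				*btime = now + GRACE_PERIOD;
				return (0);
			}
			if (now > *btime)
				return (EDQUOT);	/* grace period ran out */
		}
		return (0);
	}

	int
	main(void)
	{
		time_t btime = 0, now = time(NULL);

		printf("%d\n", check_block_quota(95, 10, 100, 200, &btime, now));
		printf("%d\n", check_block_quota(105, 10, 100, 200, &btime, now));
		printf("%d\n", check_block_quota(150, 60, 100, 200, &btime, now));
		return (0);
	}

The first call crosses the soft limit and starts the timer, the second succeeds inside the grace period, and the third fails against the hard limit, mirroring the order of checks in chkdqchg().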
*/ if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { if (dq->dq_curblocks < dq->dq_bsoftlimit) { dq->dq_btime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_btime[type]; if (ip->i_uid == cred->cr_uid) uprintf("\n%s: warning, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "disk quota exceeded"); return (0); } if (time_second > dq->dq_btime) { if ((dq->dq_flags & DQ_BLKS) == 0 && ip->i_uid == cred->cr_uid) { uprintf("\n%s: write failed, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "disk quota exceeded for too long"); dq->dq_flags |= DQ_BLKS; } return (EDQUOT); } } return (0); } /* * Check the inode limit, applying corrective action. */ int chkiq(ip, change, cred, flags) struct inode *ip; ino_t change; struct ucred *cred; int flags; { struct dquot *dq; ino_t ncurinodes; int i, error; #ifdef DIAGNOSTIC if ((flags & CHOWN) == 0) chkdquot(ip); #endif if (change == 0) return (0); /* XXX: change is unsigned */ if (change < 0) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "chkiq1", 0); } ncurinodes = dq->dq_curinodes + change; /* XXX: ncurinodes is unsigned */ if (ncurinodes >= 0) dq->dq_curinodes = ncurinodes; else dq->dq_curinodes = 0; dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; } return (0); } if ((flags & FORCE) == 0 && suser_cred(cred, 0)) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; error = chkiqchg(ip, change, cred, i); if (error) return (error); } } for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "chkiq2", 0); } /* Reset timer when crossing soft limit */ if (dq->dq_curinodes + change >= dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_itime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_itime[i]; dq->dq_curinodes += change; dq->dq_flags |= DQ_MOD; } return (0); } /* * Check for a valid change to a users allocation. * Issue an error message if appropriate. */ static int chkiqchg(ip, change, cred, type) struct inode *ip; ino_t change; struct ucred *cred; int type; { struct dquot *dq = ip->i_dquot[type]; ino_t ncurinodes = dq->dq_curinodes + change; /* * If user would exceed their hard limit, disallow inode allocation. */ if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { if ((dq->dq_flags & DQ_INODS) == 0 && ip->i_uid == cred->cr_uid) { uprintf("\n%s: write failed, %s inode limit reached\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type]); dq->dq_flags |= DQ_INODS; } return (EDQUOT); } /* * If user is over their soft limit for too long, disallow inode * allocation. Reset time limit as they cross their soft limit. 
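[Editor's note] The "XXX: change is unsigned" comments in chkiq() flag a genuine wrinkle: ino_t is unsigned, so a comparison like "change < 0" can never be true, and a caller that passes -1 actually delivers a huge positive value by wrap-around. A two-line demonstration, with a local typedef standing in for ino_t:

	#include <stdio.h>

	typedef unsigned int example_ino_t;	/* ino_t is unsigned on FreeBSD */

	int
	main(void)
	{
		example_ino_t change = (example_ino_t)-1;	/* caller passed -1 */

		/* Always false: an unsigned value is never below zero
		 * (most compilers warn about exactly this comparison). */
		if (change < 0)
			printf("negative change\n");
		else
			printf("wrapped to %u\n", change);
		return (0);
	}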
*/ if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { if (dq->dq_curinodes < dq->dq_isoftlimit) { dq->dq_itime = time_second + VFSTOUFS(ITOV(ip)->v_mount)->um_itime[type]; if (ip->i_uid == cred->cr_uid) uprintf("\n%s: warning, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "inode quota exceeded"); return (0); } if (time_second > dq->dq_itime) { if ((dq->dq_flags & DQ_INODS) == 0 && ip->i_uid == cred->cr_uid) { uprintf("\n%s: write failed, %s %s\n", ITOV(ip)->v_mount->mnt_stat.f_mntonname, quotatypes[type], "inode quota exceeded for too long"); dq->dq_flags |= DQ_INODS; } return (EDQUOT); } } return (0); } #ifdef DIAGNOSTIC /* * On filesystems with quotas enabled, it is an error for a file to change * size and not to have a dquot structure associated with it. */ static void chkdquot(ip) struct inode *ip; { struct ufsmount *ump = VFSTOUFS(ITOV(ip)->v_mount); int i; for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP || (ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING))) continue; if (ip->i_dquot[i] == NODQUOT) { vprint("chkdquot: missing dquot", ITOV(ip)); panic("chkdquot: missing dquot"); } } } #endif /* * Code to process quotactl commands. */ /* * Q_QUOTAON - set up a quota file for a particular filesystem. */ int quotaon(td, mp, type, fname) struct thread *td; struct mount *mp; int type; void *fname; { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *vp, **vpp; - struct vnode *nvp; + struct vnode *mvp; struct dquot *dq; int error, flags; struct nameidata nd; error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); vpp = &ump->um_quotas[type]; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fname, td); flags = FREAD | FWRITE; error = vn_open(&nd, &flags, 0, -1); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; VOP_UNLOCK(vp, 0, td); if (vp->v_type != VREG) { (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); return (EACCES); } if (*vpp != vp) quotaoff(td, mp, type); ump->um_qflags[type] |= QTF_OPENING; mp->mnt_flag |= MNT_QUOTA; ASSERT_VOP_LOCKED(vp, "quotaon"); vp->v_vflag |= VV_SYSTEM; *vpp = vp; /* * Save the credential of the process that turned on quotas. * Set up the time limits for this quota. */ ump->um_cred[type] = crhold(td->td_ucred); ump->um_btime[type] = MAX_DQ_TIME; ump->um_itime[type] = MAX_IQ_TIME; if (dqget(NULLVP, 0, ump, type, &dq) == 0) { if (dq->dq_btime > 0) ump->um_btime[type] = dq->dq_btime; if (dq->dq_itime > 0) ump->um_itime[type] = dq->dq_itime; dqrele(NULLVP, dq); } /* * Search vnodes associated with this mount point, * adding references to quota file being opened. * NB: only need to add dquot's for inodes being modified. */ MNT_ILOCK(mp); again: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto again; } if (vp->v_type == VNON || vp->v_writecount == 0) { VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); continue; } error = getinoquota(VTOI(vp)); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); - if (error) + if (error) { + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); break; + } } MNT_IUNLOCK(mp); ump->um_qflags[type] &= ~QTF_OPENING; if (error) quotaoff(td, mp, type); return (error); } /* * Q_QUOTAOFF - turn off disk quotas for a filesystem. 
*/ int quotaoff(td, mp, type) struct thread *td; struct mount *mp; int type; { struct vnode *vp; - struct vnode *qvp, *nvp; + struct vnode *qvp, *mvp; struct ufsmount *ump = VFSTOUFS(mp); struct dquot *dq; struct inode *ip; int error; error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); if ((qvp = ump->um_quotas[type]) == NULLVP) return (0); ump->um_qflags[type] |= QTF_CLOSING; /* * Search vnodes associated with this mount point, * deleting any references to quota file being closed. */ MNT_ILOCK(mp); again: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (vp->v_type == VNON) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_ILOCK(mp); + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto again; } ip = VTOI(vp); dq = ip->i_dquot[type]; ip->i_dquot[type] = NODQUOT; dqrele(vp, dq); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); dqflush(qvp); ASSERT_VOP_LOCKED(qvp, "quotaoff"); qvp->v_vflag &= ~VV_SYSTEM; error = vn_close(qvp, FREAD|FWRITE, td->td_ucred, td); ump->um_quotas[type] = NULLVP; crfree(ump->um_cred[type]); ump->um_cred[type] = NOCRED; ump->um_qflags[type] &= ~QTF_CLOSING; for (type = 0; type < MAXQUOTAS; type++) if (ump->um_quotas[type] != NULLVP) break; if (type == MAXQUOTAS) mp->mnt_flag &= ~MNT_QUOTA; return (error); } /* * Q_GETQUOTA - return current values in a dqblk structure. */ int getquota(td, mp, id, type, addr) struct thread *td; struct mount *mp; u_long id; int type; void *addr; { struct dquot *dq; int error; switch (type) { case USRQUOTA: if ((td->td_ucred->cr_uid != id) && !unprivileged_get_quota) { error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); } break; case GRPQUOTA: if (!groupmember(id, td->td_ucred) && !unprivileged_get_quota) { error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); } break; default: return (EINVAL); } error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq); if (error) return (error); error = copyout(&dq->dq_dqb, addr, sizeof (struct dqblk)); dqrele(NULLVP, dq); return (error); } /* * Q_SETQUOTA - assign an entire dqblk structure. */ int setquota(td, mp, id, type, addr) struct thread *td; struct mount *mp; u_long id; int type; void *addr; { struct dquot *dq; struct dquot *ndq; struct ufsmount *ump = VFSTOUFS(mp); struct dqblk newlim; int error; error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); error = copyin(addr, &newlim, sizeof (struct dqblk)); if (error) return (error); error = dqget(NULLVP, id, ump, type, &ndq); if (error) return (error); dq = ndq; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "setqta", 0); } /* * Copy all but the current values. * Reset time limit if previously had no soft limit or were * under it, but now have a soft limit and are over it. 
*/ newlim.dqb_curblocks = dq->dq_curblocks; newlim.dqb_curinodes = dq->dq_curinodes; if (dq->dq_id != 0) { newlim.dqb_btime = dq->dq_btime; newlim.dqb_itime = dq->dq_itime; } if (newlim.dqb_bsoftlimit && dq->dq_curblocks >= newlim.dqb_bsoftlimit && (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) newlim.dqb_btime = time_second + ump->um_btime[type]; if (newlim.dqb_isoftlimit && dq->dq_curinodes >= newlim.dqb_isoftlimit && (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) newlim.dqb_itime = time_second + ump->um_itime[type]; dq->dq_dqb = newlim; if (dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_INODS; if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; else dq->dq_flags &= ~DQ_FAKE; dq->dq_flags |= DQ_MOD; dqrele(NULLVP, dq); return (0); } /* * Q_SETUSE - set current inode and block usage. */ int setuse(td, mp, id, type, addr) struct thread *td; struct mount *mp; u_long id; int type; void *addr; { struct dquot *dq; struct ufsmount *ump = VFSTOUFS(mp); struct dquot *ndq; struct dqblk usage; int error; error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); error = copyin(addr, &usage, sizeof (struct dqblk)); if (error) return (error); error = dqget(NULLVP, id, ump, type, &ndq); if (error) return (error); dq = ndq; while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+1, "setuse", 0); } /* * Reset time limit if have a soft limit and were * previously under it, but are now over it. */ if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit && usage.dqb_curblocks >= dq->dq_bsoftlimit) dq->dq_btime = time_second + ump->um_btime[type]; if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && usage.dqb_curinodes >= dq->dq_isoftlimit) dq->dq_itime = time_second + ump->um_itime[type]; dq->dq_curblocks = usage.dqb_curblocks; dq->dq_curinodes = usage.dqb_curinodes; if (dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; dqrele(NULLVP, dq); return (0); } /* * Q_SYNC - sync quota files to disk. */ int qsync(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct thread *td = curthread; /* XXX */ - struct vnode *vp, *nvp; + struct vnode *vp, *mvp; struct dquot *dq; int i, error; /* * Check if the mount point has any quotas. * If not, simply return. */ for (i = 0; i < MAXQUOTAS; i++) if (ump->um_quotas[i] != NULLVP) break; if (i == MAXQUOTAS) return (0); /* * Search vnodes associated with this mount point, * synchronizing any modified dquot structures. */ MNT_ILOCK(mp); again: - MNT_VNODE_FOREACH(vp, mp, nvp) { + MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); MNT_IUNLOCK(mp); if (vp->v_type == VNON) { VI_UNLOCK(vp); MNT_ILOCK(mp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td); if (error) { MNT_ILOCK(mp); - if (error == ENOENT) + if (error == ENOENT) { + MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto again; + } continue; } for (i = 0; i < MAXQUOTAS; i++) { dq = VTOI(vp)->i_dquot[i]; if (dq != NODQUOT && (dq->dq_flags & DQ_MOD)) dqsync(vp, dq); } vput(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Code pertaining to management of the in-core dquot data structures. 
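[Editor's sketch, not part of the change] setquota() above keeps the kernel's current usage counters and restarts the grace timer only when the new soft limit puts a user over a limit they were previously under (either because there was no soft limit before, or because usage was below the old one). A small stand-alone statement of that rule; the struct and field names only loosely shadow struct dqblk.

	#include <stdio.h>
	#include <time.h>

	struct limits {
		long	bsoftlimit;
		long	curblocks;
		time_t	btime;		/* grace deadline, 0 if not running */
	};

	/*
	 * Apply a new soft limit, restarting the grace clock only when the
	 * change pushes previously-compliant usage over the (new) limit.
	 */
	static void
	apply_new_soft_limit(struct limits *lp, long newsoft, time_t now,
	    time_t grace)
	{
		if (newsoft != 0 && lp->curblocks >= newsoft &&
		    (lp->bsoftlimit == 0 || lp->curblocks < lp->bsoftlimit))
			lp->btime = now + grace;
		lp->bsoftlimit = newsoft;
	}

	int
	main(void)
	{
		struct limits l = { 0, 500, 0 };	/* no limit yet, 500 in use */
		time_t now = time(NULL);

		apply_new_soft_limit(&l, 400, now, 7 * 24 * 3600);
		printf("soft %ld, grace %s\n", l.bsoftlimit,
		    l.btime != 0 ? "running" : "not running");
		return (0);
	}

setuse() applies the mirror-image rule: the timer restarts when newly imported usage crosses an existing soft limit.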
*/ #define DQHASH(dqvp, id) \ (&dqhashtbl[((((intptr_t)(dqvp)) >> 8) + id) & dqhash]) static LIST_HEAD(dqhash, dquot) *dqhashtbl; static u_long dqhash; /* * Dquot free list. */ #define DQUOTINC 5 /* minimum free dquots desired */ static TAILQ_HEAD(dqfreelist, dquot) dqfreelist; static long numdquot, desireddquot = DQUOTINC; /* * Initialize the quota system. */ void dqinit() { dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash); TAILQ_INIT(&dqfreelist); } /* * Shut down the quota system. */ void dquninit() { struct dquot *dq; hashdestroy(dqhashtbl, M_DQUOT, dqhash); while ((dq = TAILQ_FIRST(&dqfreelist)) != NULL) { TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); free(dq, M_DQUOT); } } /* * Obtain a dquot structure for the specified identifier and quota file * reading the information from the file if necessary. */ static int dqget(vp, id, ump, type, dqp) struct vnode *vp; u_long id; struct ufsmount *ump; int type; struct dquot **dqp; { struct thread *td = curthread; /* XXX */ struct dquot *dq; struct dqhash *dqh; struct vnode *dqvp; struct iovec aiov; struct uio auio; int error; dqvp = ump->um_quotas[type]; if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) { *dqp = NODQUOT; return (EINVAL); } /* * Check the cache first. */ dqh = DQHASH(dqvp, id); LIST_FOREACH(dq, dqh, dq_hash) { if (dq->dq_id != id || dq->dq_ump->um_quotas[dq->dq_type] != dqvp) continue; /* * Cache hit with no references. Take * the structure off the free list. */ if (dq->dq_cnt == 0) TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); DQREF(dq); *dqp = dq; return (0); } /* * Not in cache, allocate a new one. */ if (TAILQ_FIRST(&dqfreelist) == NODQUOT && numdquot < MAXQUOTAS * desiredvnodes) desireddquot += DQUOTINC; if (numdquot < desireddquot) { dq = (struct dquot *)malloc(sizeof *dq, M_DQUOT, M_WAITOK | M_ZERO); numdquot++; } else { if ((dq = TAILQ_FIRST(&dqfreelist)) == NULL) { tablefull("dquot"); *dqp = NODQUOT; return (EUSERS); } if (dq->dq_cnt || (dq->dq_flags & DQ_MOD)) panic("dqget: free dquot isn't"); TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); if (dq->dq_ump != NULL) LIST_REMOVE(dq, dq_hash); } /* * Initialize the contents of the dquot structure. */ if (vp != dqvp) vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td); LIST_INSERT_HEAD(dqh, dq, dq_hash); DQREF(dq); dq->dq_flags = DQ_LOCK; dq->dq_id = id; dq->dq_ump = ump; dq->dq_type = type; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = &dq->dq_dqb; aiov.iov_len = sizeof (struct dqblk); auio.uio_resid = sizeof (struct dqblk); auio.uio_offset = (off_t)(id * sizeof (struct dqblk)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = (struct thread *)0; error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); if (auio.uio_resid == sizeof(struct dqblk) && error == 0) bzero(&dq->dq_dqb, sizeof(struct dqblk)); if (vp != dqvp) VOP_UNLOCK(dqvp, 0, td); if (dq->dq_flags & DQ_WANT) wakeup(dq); dq->dq_flags = 0; /* * I/O error in reading quota file, release * quota structure and reflect problem to caller. */ if (error) { LIST_REMOVE(dq, dq_hash); dqrele(vp, dq); *dqp = NODQUOT; return (error); } /* * Check for no limit to enforce. * Initialize time values if necessary. */ if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; if (dq->dq_id != 0) { if (dq->dq_btime == 0) dq->dq_btime = time_second + ump->um_btime[type]; if (dq->dq_itime == 0) dq->dq_itime = time_second + ump->um_itime[type]; } *dqp = dq; return (0); } #ifdef DIAGNOSTIC /* * Obtain a reference to a dquot. 
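[Editor's sketch, not part of the change] The in-core dquot cache hashes on the quota-file vnode pointer plus the uid/gid, masked into a power-of-two table whose size (and mask) come from hashinit(9). A user-space sketch of the same scheme using the LIST macros; TABLE_SIZE and hash_mask stand in for what hashinit() would return, and bucket_for() has the same shape as DQHASH().

	#include <sys/queue.h>
	#include <stdint.h>
	#include <stdio.h>

	#define TABLE_SIZE	64		/* must be a power of two */

	struct entry {
		LIST_ENTRY(entry)	hash_link;
		const void		*file;	/* quota-file identity (a pointer) */
		unsigned long		id;	/* uid or gid */
	};
	static LIST_HEAD(bucket, entry) table[TABLE_SIZE];
	static const unsigned long hash_mask = TABLE_SIZE - 1;

	/* Same shape as DQHASH(): fold the pointer and the id, mask into table. */
	static struct bucket *
	bucket_for(const void *file, unsigned long id)
	{
		return (&table[(((uintptr_t)file >> 8) + id) & hash_mask]);
	}

	int
	main(void)
	{
		static int quota_file;			/* any stable address */
		struct entry e = { .file = &quota_file, .id = 1001 };
		struct entry *found;

		LIST_INSERT_HEAD(bucket_for(e.file, e.id), &e, hash_link);
		LIST_FOREACH(found, bucket_for(&quota_file, 1001), hash_link)
			if (found->id == 1001 && found->file == &quota_file)
				printf("cache hit for id %lu\n", found->id);
		return (0);
	}

As in dqget(), a lookup that misses the cache is followed by allocating or recycling an entry and inserting it at the head of the same bucket.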
*/ static void dqref(dq) struct dquot *dq; { dq->dq_cnt++; } #endif /* * Release a reference to a dquot. */ void dqrele(vp, dq) struct vnode *vp; struct dquot *dq; { if (dq == NODQUOT) return; if (dq->dq_cnt > 1) { dq->dq_cnt--; return; } if (dq->dq_flags & DQ_MOD) (void) dqsync(vp, dq); if (--dq->dq_cnt > 0) return; TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist); } /* * Update the disk quota in the quota file. */ static int dqsync(vp, dq) struct vnode *vp; struct dquot *dq; { struct thread *td = curthread; /* XXX */ struct vnode *dqvp; struct iovec aiov; struct uio auio; int error; if (dq == NODQUOT) panic("dqsync: dquot"); if ((dq->dq_flags & DQ_MOD) == 0) return (0); if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP) panic("dqsync: file"); (void) vn_write_suspend_wait(dqvp, NULL, V_WAIT); if (vp != dqvp) vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td); while (dq->dq_flags & DQ_LOCK) { dq->dq_flags |= DQ_WANT; (void) tsleep(dq, PINOD+2, "dqsync", 0); if ((dq->dq_flags & DQ_MOD) == 0) { if (vp != dqvp) VOP_UNLOCK(dqvp, 0, td); return (0); } } dq->dq_flags |= DQ_LOCK; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = &dq->dq_dqb; aiov.iov_len = sizeof (struct dqblk); auio.uio_resid = sizeof (struct dqblk); auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = (struct thread *)0; error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); if (auio.uio_resid && error == 0) error = EIO; if (dq->dq_flags & DQ_WANT) wakeup(dq); dq->dq_flags &= ~(DQ_MOD|DQ_LOCK|DQ_WANT); if (vp != dqvp) VOP_UNLOCK(dqvp, 0, td); return (error); } /* * Flush all entries from the cache for a particular vnode. */ static void dqflush(vp) struct vnode *vp; { struct dquot *dq, *nextdq; struct dqhash *dqh; /* * Move all dquot's that used to refer to this quota * file off their hash chains (they will eventually * fall off the head of the free list and be re-used). */ for (dqh = &dqhashtbl[dqhash]; dqh >= dqhashtbl; dqh--) { for (dq = LIST_FIRST(dqh); dq; dq = nextdq) { nextdq = LIST_NEXT(dq, dq_hash); if (dq->dq_ump->um_quotas[dq->dq_type] != vp) continue; if (dq->dq_cnt) panic("dqflush: stray dquot"); LIST_REMOVE(dq, dq_hash); dq->dq_ump = (struct ufsmount *)0; } } } Index: head/usr.bin/fstat/fstat.c =================================================================== --- head/usr.bin/fstat/fstat.c (revision 154151) +++ head/usr.bin/fstat/fstat.c (revision 154152) @@ -1,946 +1,947 @@ /*- * Copyright (c) 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1988, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint #if 0 static char sccsid[] = "@(#)fstat.c 8.3 (Berkeley) 5/2/95"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define _KERNEL #include #include #include #include #include #include #include #include #undef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fstat.h" #define TEXT -1 #define CDIR -2 #define RDIR -3 #define TRACE -4 #define MMAP -5 #define JDIR -6 DEVS *devs; #ifdef notdef struct nlist nl[] = { { "" }, }; #endif int fsflg, /* show files on same filesystem as file(s) argument */ pflg, /* show files open by a particular pid */ uflg; /* show files open by a particular (effective) user */ int checkfile; /* true if restricting to particular files or filesystems */ int nflg; /* (numerical) display f.s. and rdev as dev_t */ int vflg; /* display errors in locating kernel data objects etc... 
*/ int mflg; /* include memory-mapped files */ struct file **ofiles; /* buffer of pointers to file structures */ int maxfiles; #define ALLOC_OFILES(d) \ if ((d) > maxfiles) { \ free(ofiles); \ ofiles = malloc((d) * sizeof(struct file *)); \ if (ofiles == NULL) { \ err(1, NULL); \ } \ maxfiles = (d); \ } char *memf, *nlistf; kvm_t *kd; static void fstat_kvm(int, int); static void fstat_sysctl(int, int); void dofiles(struct kinfo_proc *kp); void dommap(struct kinfo_proc *kp); void vtrans(struct vnode *vp, int i, int flag); int ufs_filestat(struct vnode *vp, struct filestat *fsp); int nfs_filestat(struct vnode *vp, struct filestat *fsp); int devfs_filestat(struct vnode *vp, struct filestat *fsp); char *getmnton(struct mount *m); void pipetrans(struct pipe *pi, int i, int flag); void socktrans(struct socket *sock, int i); void getinetproto(int number); int getfname(const char *filename); void usage(void); char *kdevtoname(struct cdev *dev); int main(int argc, char **argv) { struct passwd *passwd; int arg, ch, what; arg = 0; what = KERN_PROC_ALL; nlistf = memf = NULL; while ((ch = getopt(argc, argv, "fmnp:u:vN:M:")) != -1) switch((char)ch) { case 'f': fsflg = 1; break; case 'M': memf = optarg; break; case 'N': nlistf = optarg; break; case 'm': mflg = 1; break; case 'n': nflg = 1; break; case 'p': if (pflg++) usage(); if (!isdigit(*optarg)) { warnx("-p requires a process id"); usage(); } what = KERN_PROC_PID; arg = atoi(optarg); break; case 'u': if (uflg++) usage(); if (!(passwd = getpwnam(optarg))) errx(1, "%s: unknown uid", optarg); what = KERN_PROC_UID; arg = passwd->pw_uid; break; case 'v': vflg = 1; break; case '?': default: usage(); } if (*(argv += optind)) { for (; *argv; ++argv) { if (getfname(*argv)) checkfile = 1; } if (!checkfile) /* file(s) specified, but none accessable */ exit(1); } if (fsflg && !checkfile) { /* -f with no files means use wd */ if (getfname(".") == 0) exit(1); checkfile = 1; } if (memf != NULL) fstat_kvm(what, arg); else fstat_sysctl(what, arg); exit(0); } static void print_header(void) { if (nflg) printf("%s", "USER CMD PID FD DEV INUM MODE SZ|DV R/W"); else printf("%s", "USER CMD PID FD MOUNT INUM MODE SZ|DV R/W"); if (checkfile && fsflg == 0) printf(" NAME\n"); else putchar('\n'); } static void fstat_kvm(int what, int arg) { struct kinfo_proc *p, *plast; char buf[_POSIX2_LINE_MAX]; int cnt; ALLOC_OFILES(256); /* reserve space for file pointers */ /* * Discard setgid privileges if not the running kernel so that bad * guys can't print interesting stuff from kernel memory. 
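[Editor's sketch, not part of the change] fstat_kvm() drops its setgid privilege as soon as a non-default kernel or core file is requested, then uses libkvm to walk the process table. A minimal stand-alone example of the same libkvm calls; it assumes the default running kernel (NULL name and core file), lists only pid and command name, and skips fstat's zombie and file-descriptor handling entirely.

	#include <sys/param.h>
	#include <sys/sysctl.h>
	#include <sys/user.h>
	#include <err.h>
	#include <fcntl.h>
	#include <kvm.h>
	#include <limits.h>
	#include <stdio.h>

	int
	main(void)
	{
		char errbuf[_POSIX2_LINE_MAX];
		struct kinfo_proc *kp;
		kvm_t *kd;
		int i, cnt;

		/* NULLs mean "the running kernel"; O_RDONLY is all we need. */
		kd = kvm_openfiles(NULL, NULL, NULL, O_RDONLY, errbuf);
		if (kd == NULL)
			errx(1, "%s", errbuf);
		kp = kvm_getprocs(kd, KERN_PROC_ALL, 0, &cnt);
		if (kp == NULL)
			errx(1, "%s", kvm_geterr(kd));
		for (i = 0; i < cnt; i++)
			printf("%5d %s\n", kp[i].ki_pid, kp[i].ki_comm);
		kvm_close(kd);
		return (0);
	}

When -M or -N is given, fstat instead reads kernel memory directly, which is why it discards the setgid kmem privilege before opening arbitrary files.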
*/ if (nlistf != NULL || memf != NULL) setgid(getgid()); if ((kd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, buf)) == NULL) errx(1, "%s", buf); setgid(getgid()); #ifdef notdef if (kvm_nlist(kd, nl) != 0) errx(1, "no namelist: %s", kvm_geterr(kd)); #endif if ((p = kvm_getprocs(kd, what, arg, &cnt)) == NULL) errx(1, "%s", kvm_geterr(kd)); print_header(); for (plast = &p[cnt]; p < plast; ++p) { if (p->ki_stat == SZOMB) continue; dofiles(p); if (mflg) dommap(p); } } static void fstat_sysctl(int what, int arg) { /* not yet implemented */ fstat_kvm(what, arg); } const char *Uname, *Comm; int Pid; #define PREFIX(i) printf("%-8.8s %-10s %5d", Uname, Comm, Pid); \ switch(i) { \ case TEXT: \ printf(" text"); \ break; \ case CDIR: \ printf(" wd"); \ break; \ case RDIR: \ printf(" root"); \ break; \ case TRACE: \ printf(" tr"); \ break; \ case MMAP: \ printf(" mmap"); \ break; \ case JDIR: \ printf(" jail"); \ break; \ default: \ printf(" %4d", i); \ break; \ } /* * print open files attributed to this process */ void dofiles(struct kinfo_proc *kp) { int i; struct file file; struct filedesc filed; Uname = user_from_uid(kp->ki_uid, 0); Pid = kp->ki_pid; Comm = kp->ki_comm; if (kp->ki_fd == NULL) return; if (!KVM_READ(kp->ki_fd, &filed, sizeof (filed))) { dprintf(stderr, "can't read filedesc at %p for pid %d\n", (void *)kp->ki_fd, Pid); return; } /* * root directory vnode, if one */ if (filed.fd_rdir) vtrans(filed.fd_rdir, RDIR, FREAD); /* * current working directory vnode */ vtrans(filed.fd_cdir, CDIR, FREAD); /* * jail root, if any. */ if (filed.fd_jdir) vtrans(filed.fd_jdir, JDIR, FREAD); /* * ktrace vnode, if one */ if (kp->ki_tracep) vtrans(kp->ki_tracep, TRACE, FREAD|FWRITE); /* * text vnode, if one */ if (kp->ki_textvp) vtrans(kp->ki_textvp, TEXT, FREAD); /* * open files */ #define FPSIZE (sizeof (struct file *)) #define MAX_LASTFILE (0x1000000) /* Sanity check on filed.fd_lastfile */ if (filed.fd_lastfile <= -1 || filed.fd_lastfile > MAX_LASTFILE) return; ALLOC_OFILES(filed.fd_lastfile+1); if (!KVM_READ(filed.fd_ofiles, ofiles, (filed.fd_lastfile+1) * FPSIZE)) { dprintf(stderr, "can't read file structures at %p for pid %d\n", (void *)filed.fd_ofiles, Pid); return; } for (i = 0; i <= filed.fd_lastfile; i++) { if (ofiles[i] == NULL) continue; if (!KVM_READ(ofiles[i], &file, sizeof (struct file))) { dprintf(stderr, "can't read file %d at %p for pid %d\n", i, (void *)ofiles[i], Pid); continue; } if (file.f_type == DTYPE_VNODE) vtrans(file.f_vnode, i, file.f_flag); else if (file.f_type == DTYPE_SOCKET) { if (checkfile == 0) socktrans(file.f_data, i); } #ifdef DTYPE_PIPE else if (file.f_type == DTYPE_PIPE) { if (checkfile == 0) pipetrans(file.f_data, i, file.f_flag); } #endif #ifdef DTYPE_FIFO else if (file.f_type == DTYPE_FIFO) { if (checkfile == 0) vtrans(file.f_vnode, i, file.f_flag); } #endif else { dprintf(stderr, "unknown file type %d for file %d of pid %d\n", file.f_type, i, Pid); } } } void dommap(struct kinfo_proc *kp) { vm_map_t map; struct vmspace vmspace; struct vm_map_entry entry; vm_map_entry_t entryp; struct vm_object object; vm_object_t objp; int prot, fflags; if (!KVM_READ(kp->ki_vmspace, &vmspace, sizeof(vmspace))) { dprintf(stderr, "can't read vmspace at %p for pid %d\n", (void *)kp->ki_vmspace, Pid); return; } map = &vmspace.vm_map; for (entryp = map->header.next; entryp != &kp->ki_vmspace->vm_map.header; entryp = entry.next) { if (!KVM_READ(entryp, &entry, sizeof(entry))) { dprintf(stderr, "can't read vm_map_entry at %p for pid %d\n", (void *)entryp, Pid); return; } if 
(entry.eflags & MAP_ENTRY_IS_SUB_MAP) continue; if ((objp = entry.object.vm_object) == NULL) continue; for (; objp; objp = object.backing_object) { if (!KVM_READ(objp, &object, sizeof(object))) { dprintf(stderr, "can't read vm_object at %p for pid %d\n", (void *)objp, Pid); return; } } prot = entry.protection; fflags = (prot & VM_PROT_READ ? FREAD : 0) | (prot & VM_PROT_WRITE ? FWRITE : 0); switch (object.type) { case OBJT_VNODE: vtrans((struct vnode *)object.handle, MMAP, fflags); break; default: break; } } } char * kdevtoname(struct cdev *dev) { struct cdev si; if (!KVM_READ(dev, &si, sizeof si)) return (NULL); return (strdup(si.__si_namebuf)); } void vtrans(struct vnode *vp, int i, int flag) { struct vnode vn; struct filestat fst; char rw[3], mode[15], tagstr[12], *tagptr; const char *badtype, *filename; filename = badtype = NULL; if (!KVM_READ(vp, &vn, sizeof (struct vnode))) { dprintf(stderr, "can't read vnode at %p for pid %d\n", (void *)vp, Pid); return; } if (!KVM_READ(&vp->v_tag, &tagptr, sizeof tagptr) || !KVM_READ(tagptr, tagstr, sizeof tagstr)) { dprintf(stderr, "can't read v_tag at %p for pid %d\n", (void *)vp, Pid); return; } tagstr[sizeof(tagstr) - 1] = '\0'; if (vn.v_type == VNON) badtype = "none"; else if (vn.v_type == VBAD) badtype = "bad"; else { if (!strcmp("ufs", tagstr)) { if (!ufs_filestat(&vn, &fst)) badtype = "error"; } else if (!strcmp("devfs", tagstr)) { if (!devfs_filestat(&vn, &fst)) badtype = "error"; } else if (!strcmp("nfs", tagstr)) { if (!nfs_filestat(&vn, &fst)) badtype = "error"; } else if (!strcmp("msdosfs", tagstr)) { if (!msdosfs_filestat(&vn, &fst)) badtype = "error"; } else if (!strcmp("isofs", tagstr)) { if (!isofs_filestat(&vn, &fst)) badtype = "error"; } else { static char unknown[32]; snprintf(unknown, sizeof unknown, "?(%s)", tagstr); badtype = unknown; } } if (checkfile) { int fsmatch = 0; DEVS *d; if (badtype) return; for (d = devs; d != NULL; d = d->next) if (d->fsid == fst.fsid) { fsmatch = 1; if (d->ino == fst.fileid) { filename = d->name; break; } } if (fsmatch == 0 || (filename == NULL && fsflg == 0)) return; } PREFIX(i); if (badtype) { (void)printf(" - - %10s -\n", badtype); return; } if (nflg) (void)printf(" %2d,%-2d", major(fst.fsid), minor(fst.fsid)); else (void)printf(" %-8s", getmnton(vn.v_mount)); if (nflg) (void)sprintf(mode, "%o", fst.mode); else strmode(fst.mode, mode); (void)printf(" %6ld %10s", fst.fileid, mode); switch (vn.v_type) { case VBLK: case VCHR: { char *name; name = kdevtoname(vn.v_rdev); if (nflg || !name) printf(" %2d,%-2d", major(fst.rdev), minor(fst.rdev)); else { printf(" %6s", name); free(name); } break; } default: printf(" %6lu", fst.size); } rw[0] = '\0'; if (flag & FREAD) strcat(rw, "r"); if (flag & FWRITE) strcat(rw, "w"); printf(" %2s", rw); if (filename && !fsflg) printf(" %s", filename); putchar('\n'); } int ufs_filestat(struct vnode *vp, struct filestat *fsp) { struct inode inode; if (!KVM_READ(VTOI(vp), &inode, sizeof (inode))) { dprintf(stderr, "can't read inode at %p for pid %d\n", (void *)VTOI(vp), Pid); return 0; } /* * The st_dev from stat(2) is a dev_t. These kernel structures * contain cdev pointers. 
We need to convert to dev_t to make * comparisons */ fsp->fsid = dev2udev(inode.i_dev); fsp->fileid = (long)inode.i_number; fsp->mode = (mode_t)inode.i_mode; fsp->size = (u_long)inode.i_size; #if should_be_but_is_hard /* XXX - need to load i_ump and i_din[12] from kernel memory */ if (inode.i_ump->um_fstype == UFS1) fsp->rdev = inode.i_din1->di_rdev; else fsp->rdev = inode.i_din2->di_rdev; #else fsp->rdev = 0; #endif return 1; } int devfs_filestat(struct vnode *vp, struct filestat *fsp) { struct devfs_dirent devfs_dirent; struct mount mount; struct vnode vnode; if (!KVM_READ(vp->v_data, &devfs_dirent, sizeof (devfs_dirent))) { dprintf(stderr, "can't read devfs_dirent at %p for pid %d\n", (void *)vp->v_data, Pid); return 0; } if (!KVM_READ(vp->v_mount, &mount, sizeof (mount))) { dprintf(stderr, "can't read mount at %p for pid %d\n", (void *)vp->v_mount, Pid); return 0; } if (!KVM_READ(devfs_dirent.de_vnode, &vnode, sizeof (vnode))) { dprintf(stderr, "can't read vnode at %p for pid %d\n", (void *)devfs_dirent.de_vnode, Pid); return 0; } fsp->fsid = (long)mount.mnt_stat.f_fsid.val[0]; fsp->fileid = devfs_dirent.de_inode; fsp->mode = (devfs_dirent.de_mode & ~S_IFMT) | S_IFCHR; fsp->size = 0; fsp->rdev = dev2udev(vnode.v_rdev); return 1; } int nfs_filestat(struct vnode *vp, struct filestat *fsp) { struct nfsnode nfsnode; mode_t mode; if (!KVM_READ(VTONFS(vp), &nfsnode, sizeof (nfsnode))) { dprintf(stderr, "can't read nfsnode at %p for pid %d\n", (void *)VTONFS(vp), Pid); return 0; } fsp->fsid = nfsnode.n_vattr.va_fsid; fsp->fileid = nfsnode.n_vattr.va_fileid; fsp->size = nfsnode.n_size; fsp->rdev = nfsnode.n_vattr.va_rdev; mode = (mode_t)nfsnode.n_vattr.va_mode; switch (vp->v_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; case VNON: case VBAD: + case VMARKER: return 0; }; fsp->mode = mode; return 1; } char * getmnton(struct mount *m) { static struct mount mount; static struct mtab { struct mtab *next; struct mount *m; char mntonname[MNAMELEN]; } *mhead = NULL; struct mtab *mt; for (mt = mhead; mt != NULL; mt = mt->next) if (m == mt->m) return (mt->mntonname); if (!KVM_READ(m, &mount, sizeof(struct mount))) { warnx("can't read mount table at %p", (void *)m); return (NULL); } if ((mt = malloc(sizeof (struct mtab))) == NULL) err(1, NULL); mt->m = m; bcopy(&mount.mnt_stat.f_mntonname[0], &mt->mntonname[0], MNAMELEN); mt->next = mhead; mhead = mt; return (mt->mntonname); } void pipetrans(struct pipe *pi, int i, int flag) { struct pipe pip; char rw[3]; PREFIX(i); /* fill in socket */ if (!KVM_READ(pi, &pip, sizeof(struct pipe))) { dprintf(stderr, "can't read pipe at %p\n", (void *)pi); goto bad; } printf("* pipe %8lx <-> %8lx", (u_long)pi, (u_long)pip.pipe_peer); printf(" %6d", (int)pip.pipe_buffer.cnt); rw[0] = '\0'; if (flag & FREAD) strcat(rw, "r"); if (flag & FWRITE) strcat(rw, "w"); printf(" %2s", rw); putchar('\n'); return; bad: printf("* error\n"); } void socktrans(struct socket *sock, int i) { static const char *stypename[] = { "unused", /* 0 */ "stream", /* 1 */ "dgram", /* 2 */ "raw", /* 3 */ "rdm", /* 4 */ "seqpak" /* 5 */ }; #define STYPEMAX 5 struct socket so; struct protosw proto; struct domain dom; struct inpcb inpcb; struct unpcb unpcb; int len; char dname[32]; PREFIX(i); /* fill in socket */ if (!KVM_READ(sock, &so, sizeof(struct socket))) { dprintf(stderr, "can't read sock at 
%p\n", (void *)sock); goto bad; } /* fill in protosw entry */ if (!KVM_READ(so.so_proto, &proto, sizeof(struct protosw))) { dprintf(stderr, "can't read protosw at %p", (void *)so.so_proto); goto bad; } /* fill in domain */ if (!KVM_READ(proto.pr_domain, &dom, sizeof(struct domain))) { dprintf(stderr, "can't read domain at %p\n", (void *)proto.pr_domain); goto bad; } if ((len = kvm_read(kd, (u_long)dom.dom_name, dname, sizeof(dname) - 1)) < 0) { dprintf(stderr, "can't read domain name at %p\n", (void *)dom.dom_name); dname[0] = '\0'; } else dname[len] = '\0'; if ((u_short)so.so_type > STYPEMAX) printf("* %s ?%d", dname, so.so_type); else printf("* %s %s", dname, stypename[so.so_type]); /* * protocol specific formatting * * Try to find interesting things to print. For tcp, the interesting * thing is the address of the tcpcb, for udp and others, just the * inpcb (socket pcb). For unix domain, its the address of the socket * pcb and the address of the connected pcb (if connected). Otherwise * just print the protocol number and address of the socket itself. * The idea is not to duplicate netstat, but to make available enough * information for further analysis. */ switch(dom.dom_family) { case AF_INET: case AF_INET6: getinetproto(proto.pr_protocol); if (proto.pr_protocol == IPPROTO_TCP ) { if (so.so_pcb) { if (kvm_read(kd, (u_long)so.so_pcb, (char *)&inpcb, sizeof(struct inpcb)) != sizeof(struct inpcb)) { dprintf(stderr, "can't read inpcb at %p\n", (void *)so.so_pcb); goto bad; } printf(" %lx", (u_long)inpcb.inp_ppcb); } } else if (so.so_pcb) printf(" %lx", (u_long)so.so_pcb); break; case AF_UNIX: /* print address of pcb and connected pcb */ if (so.so_pcb) { printf(" %lx", (u_long)so.so_pcb); if (kvm_read(kd, (u_long)so.so_pcb, (char *)&unpcb, sizeof(struct unpcb)) != sizeof(struct unpcb)){ dprintf(stderr, "can't read unpcb at %p\n", (void *)so.so_pcb); goto bad; } if (unpcb.unp_conn) { char shoconn[4], *cp; cp = shoconn; if (!(so.so_rcv.sb_state & SBS_CANTRCVMORE)) *cp++ = '<'; *cp++ = '-'; if (!(so.so_snd.sb_state & SBS_CANTSENDMORE)) *cp++ = '>'; *cp = '\0'; printf(" %s %lx", shoconn, (u_long)unpcb.unp_conn); } } break; default: /* print protocol number and socket address */ printf(" %d %lx", proto.pr_protocol, (u_long)sock); } printf("\n"); return; bad: printf("* error\n"); } /* * Read the cdev structure in the kernel in order to work out the * associated dev_t */ dev_t dev2udev(struct cdev *dev) { struct cdev_priv priv; struct cdev si; if (KVM_READ(dev, &si, sizeof si) && KVM_READ(si.si_priv, &priv, sizeof priv)) { return ((dev_t)priv.cdp_inode); } else { dprintf(stderr, "can't convert cdev *%p to a dev_t\n", dev); return -1; } } /* * getinetproto -- * print name of protocol number */ void getinetproto(int number) { static int isopen; struct protoent *pe; if (!isopen) setprotoent(++isopen); if ((pe = getprotobynumber(number)) != NULL) printf(" %s", pe->p_name); else printf(" %d", number); } int getfname(const char *filename) { struct stat statbuf; DEVS *cur; if (stat(filename, &statbuf)) { warn("%s", filename); return(0); } if ((cur = malloc(sizeof(DEVS))) == NULL) err(1, NULL); cur->next = devs; devs = cur; cur->ino = statbuf.st_ino; cur->fsid = statbuf.st_dev; cur->name = filename; return(1); } void usage(void) { (void)fprintf(stderr, "usage: fstat [-fmnv] [-M core] [-N system] [-p pid] [-u user] [file ...]\n"); exit(1); }