Index: head/sys/kern/vfs_export.c =================================================================== --- head/sys/kern/vfs_export.c (revision 72955) +++ head/sys/kern/vfs_export.c (revision 72956) @@ -1,3145 +1,3150 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 * $FreeBSD$ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static void addalias __P((struct vnode *vp, dev_t nvp_rdev)); static void insmntque __P((struct vnode *vp, struct mount *mp)); static void vclean __P((struct vnode *vp, int flags, struct proc *p)); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, never decreased. */ static unsigned long numvnodes; SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Minimum number of free vnodes. If there are fewer than this free vnodes, * getnewvnode() will return a newly allocated vnode. */ static u_long wantfreevnodes = 25; SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes = 0; SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); static int reassignbufloops; SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); static int reassignbufsortgood; SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); static int reassignbufsortbad; SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); /* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */ static int reassignbufmethod = 1; SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); #ifdef ENABLE_VFS_IOOPT /* See NOTES for a description of this setting. */ int vfs_ioopt = 0; SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); #endif /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; /* For any iteration/modification of mnt_vnodelist */ struct mtx mntvnode_mtx; /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. */ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* For any iteration/modification of vnode_free_list */ static struct mtx vnode_free_list_mtx; /* * For any iteration/modification of dev->si_hlist (linked through * v_specnext) */ static struct mtx spechash_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static vm_zone_t vnode_zone; /* Set to 1 to print out reclaim of active vnodes */ int prtactive = 0; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno = 0; static long syncer_mask; LIST_HEAD(synclist, vnode); static struct synclist *syncer_workitem_pending; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ time_t syncdelay = 30; /* max time to delay syncing data */ time_t filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); time_t dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); time_t metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ static void vntblinit(void *dummy __unused) { desiredvnodes = maxproc + cnt.v_page_count / 4; mtx_init(&mountlist_mtx, "mountlist", MTX_DEF); mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF); mtx_init(&mntid_mtx, "mntid", MTX_DEF); mtx_init(&spechash_mtx, "spechash", MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF); vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. */ int vfs_busy(mp, flags, interlkp, p) struct mount *mp; int flags; struct mtx *interlkp; struct proc *p; { int lkflags; if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) return (ENOENT); mp->mnt_kern_flag |= MNTK_MWAIT; /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0); return (ENOENT); } lkflags = LK_SHARED | LK_NOPAUSE; if (interlkp) lkflags |= LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(mp, p) struct mount *mp; struct proc *p; { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); } /* * Lookup a filesystem type, and if found allocate and initialize * a mount structure for it. * * Devname is usually updated by mount(8) after booting. */ int vfs_rootmountalloc(fstypename, devname, mpp) char *fstypename; char *devname; struct mount **mpp; { struct proc *p = curproc; /* XXX */ struct vfsconf *vfsp; struct mount *mp; if (fstypename == NULL) return (ENODEV); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstypename)) break; if (vfsp == NULL) return (ENODEV); mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); (void)vfs_busy(mp, LK_NOWAIT, 0, p); LIST_INIT(&mp->mnt_vnodelist); mp->mnt_vfc = vfsp; mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_flag = MNT_RDONLY; mp->mnt_vnodecovered = NULLVP; vfsp->vfc_refcount++; mp->mnt_iosize_max = DFLTPHYS; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_stat.f_mntonname[0] = '/'; mp->mnt_stat.f_mntonname[1] = 0; (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); *mpp = mp; return (0); } /* * Find an appropriate filesystem to use for the root. If a filesystem * has not been preselected, walk through the list of known filesystems * trying those that have mountroot routines, and try them until one * works or we have tried them all. */ #ifdef notdef /* XXX JH */ int lite2_vfs_mountroot() { struct vfsconf *vfsp; extern int (*lite2_mountroot) __P((void)); int error; if (lite2_mountroot != NULL) return ((*lite2_mountroot)()); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { if (vfsp->vfc_mountroot == NULL) continue; if ((error = (*vfsp->vfc_mountroot)()) == 0) return (0); printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); } return (ENODEV); } #endif /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid) fsid_t *fsid; { register struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(mp) struct mount *mp; { static u_int16_t mntid_base; fsid_t tfsid; int mtype; mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makeudev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if (vfs_getvfs(&tfsid) == NULL) break; } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, ""); /* * Get a current timestamp. */ void vfs_timestamp(tsp) struct timespec *tsp; { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(vap) register struct vattr *vap; { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Routines having to do with the management of the vnode table. */ /* * Return the next vnode from the free list. */ int getnewvnode(tag, mp, vops, vpp) enum vtagtype tag; struct mount *mp; vop_t **vops; struct vnode **vpp; { int s, count; struct proc *p = curproc; /* XXX */ struct vnode *vp = NULL; struct mount *vnmp; vm_object_t object; /* * We take the least recently used vnode from the freelist * if we can get it and it has no cached pages, and no * namecache entries are relative to it. * Otherwise we allocate a new vnode */ s = splbio(); mtx_lock(&vnode_free_list_mtx); if (wantfreevnodes && freevnodes < wantfreevnodes) { vp = NULL; } else if (!wantfreevnodes && freevnodes <= desiredvnodes) { /* * XXX: this is only here to be backwards compatible */ vp = NULL; } else for (count = 0; count < freevnodes; count++) { vp = TAILQ_FIRST(&vnode_free_list); if (vp == NULL || vp->v_usecount) panic("getnewvnode: free vnode isn't"); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); /* * Don't recycle if active in the namecache or * if it still has cached pages or we cannot get * its interlock. */ if (LIST_FIRST(&vp->v_cache_src) != NULL || (VOP_GETVOBJECT(vp, &object) == 0 && (object->resident_page_count || object->ref_count)) || !mtx_trylock(&vp->v_interlock)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); vp = NULL; continue; } /* * Skip over it if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0) break; mtx_unlock(&vp->v_interlock); TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); vp = NULL; } if (vp) { vp->v_flag |= VDOOMED; vp->v_flag &= ~VFREE; freevnodes--; mtx_unlock(&vnode_free_list_mtx); cache_purge(vp); vp->v_lease = NULL; if (vp->v_type != VBAD) { vgonel(vp, p); } else { mtx_unlock(&vp->v_interlock); } vn_finished_write(vnmp); #ifdef INVARIANTS { int s; if (vp->v_data) panic("cleaned vnode isn't"); s = splbio(); if (vp->v_numoutput) panic("Clean vnode has pending I/O's"); splx(s); if (vp->v_writecount != 0) panic("Non-zero write count"); } #endif vp->v_flag = 0; vp->v_lastw = 0; vp->v_lasta = 0; vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; } else { mtx_unlock(&vnode_free_list_mtx); vp = (struct vnode *) zalloc(vnode_zone); bzero((char *) vp, sizeof *vp); mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF); vp->v_dd = vp; mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF); cache_purge(vp); LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); numvnodes++; } TAILQ_INIT(&vp->v_cleanblkhd); TAILQ_INIT(&vp->v_dirtyblkhd); vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE); insmntque(vp, mp); *vpp = vp; vp->v_usecount = 1; vp->v_data = 0; splx(s); vfs_object_create(vp, p, p->p_ucred); return (0); } /* * Move a vnode from one mount queue to another. */ static void insmntque(vp, mp) register struct vnode *vp; register struct mount *mp; { mtx_lock(&mntvnode_mtx); /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) LIST_REMOVE(vp, v_mntvnodes); /* * Insert into list of vnodes for the new mount point, if available. */ if ((vp->v_mount = mp) == NULL) { mtx_unlock(&mntvnode_mtx); return; } LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); mtx_unlock(&mntvnode_mtx); } /* * Update outstanding I/O count and do wakeup if requested. */ void vwakeup(bp) register struct buf *bp; { register struct vnode *vp; bp->b_flags &= ~B_WRITEINPROG; if ((vp = bp->b_vp)) { vp->v_numoutput--; if (vp->v_numoutput < 0) panic("vwakeup: neg numoutput"); if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t) &vp->v_numoutput); } } } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) register struct vnode *vp; int flags; struct ucred *cred; struct proc *p; int slpflag, slptimeo; { register struct buf *bp; struct buf *nbp, *blist; int s, error; vm_object_t object; if (flags & V_SAVE) { s = splbio(); while (vp->v_numoutput) { vp->v_flag |= VBWAIT; error = tsleep((caddr_t)&vp->v_numoutput, slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); if (error) { splx(s); return (error); } } if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { splx(s); if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0) return (error); s = splbio(); if (vp->v_numoutput > 0 || !TAILQ_EMPTY(&vp->v_dirtyblkhd)) panic("vinvalbuf: dirty bufs"); } splx(s); } s = splbio(); for (;;) { blist = TAILQ_FIRST(&vp->v_cleanblkhd); if (!blist) blist = TAILQ_FIRST(&vp->v_dirtyblkhd); if (!blist) break; for (bp = blist; bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "vinvalbuf", slpflag, slptimeo); if (error == ENOLCK) break; splx(s); return (error); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. Note that vfs_bio_awrite expects * buffers to reside on a queue, while VOP_BWRITE and * brelse do not. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { if (bp->b_vp == vp) { if (bp->b_flags & B_CLUSTEROK) { BUF_UNLOCK(bp); vfs_bio_awrite(bp); } else { bremfree(bp); bp->b_flags |= B_ASYNC; BUF_WRITE(bp); } } else { bremfree(bp); (void) BUF_WRITE(bp); } break; } bremfree(bp); bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); } } while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); } splx(s); /* * Destroy the copy in the VM cache, too. */ mtx_lock(&vp->v_interlock); if (VOP_GETVOBJECT(vp, &object) == 0) { vm_object_page_remove(object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); } mtx_unlock(&vp->v_interlock); if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) panic("vinvalbuf: flush failed"); return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(vp, cred, p, length, blksize) register struct vnode *vp; struct ucred *cred; struct proc *p; off_t length; int blksize; { register struct buf *bp; struct buf *nbp; int s, anyfreed; int trunclbn; /* * Round up to the *next* lbn. */ trunclbn = (length + blksize - 1) / blksize; s = splbio(); restart: anyfreed = 1; for (;anyfreed;) { anyfreed = 0; for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bp->b_lblkno >= trunclbn) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; } if (nbp && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { goto restart; } } } for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bp->b_lblkno >= trunclbn) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; } if (nbp && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { goto restart; } } } } if (length > 0) { restartsync: for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); if (bp->b_vp == vp) { bp->b_flags |= B_ASYNC; } else { bp->b_flags &= ~B_ASYNC; } BUF_WRITE(bp); } goto restartsync; } } } while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); } splx(s); vnode_pager_setsize(vp, length); return (0); } /* * Associate a buffer with a vnode. */ void bgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { int s; KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); vhold(vp); bp->b_vp = vp; bp->b_dev = vn_todev(vp); /* * Insert onto list for new vnode. */ s = splbio(); bp->b_xflags |= BX_VNCLEAN; bp->b_xflags &= ~BX_VNDIRTY; TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); splx(s); } /* * Disassociate a buffer from a vnode. */ void brelvp(bp) register struct buf *bp; { struct vnode *vp; struct buflists *listheadp; int s; KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; s = splbio(); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { if (bp->b_xflags & BX_VNDIRTY) listheadp = &vp->v_dirtyblkhd; else listheadp = &vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { vp->v_flag &= ~VONWORKLST; LIST_REMOVE(vp, v_synclist); } splx(s); bp->b_vp = (struct vnode *) 0; vdrop(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct vnode *vp, int delay) { int s, slot; s = splbio(); if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); vp->v_flag |= VONWORKLST; splx(s); } struct proc *updateproc; static void sched_sync __P((void)); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) /* * System filesystem synchronizer daemon. */ void sched_sync(void) { struct synclist *slp; struct vnode *vp; struct mount *mp; long starttime; int s; struct proc *p = updateproc; mtx_lock(&Giant); EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_LAST); for (;;) { kthread_suspend_check(p); starttime = time_second; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. */ s = splbio(); slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { if (VOP_ISLOCKED(vp, NULL) == 0 && vn_start_write(vp, &mp, V_NOWAIT) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); VOP_UNLOCK(vp, 0, p); vn_finished_write(mp); } s = splbio(); if (LIST_FIRST(slp) == vp) { /* * Note: v_tag VT_VFS vps can remain on the * worklist too with no dirty blocks, but * since sync_fsync() moves it to a different * slot we are safe. */ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && !vn_isdisk(vp, NULL)) panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(vp, syncdelay); } splx(s); } /* * Do soft update processing. */ #ifdef SOFTUPDATES softdep_process_worklist(NULL); #endif /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (time_second == starttime) tsleep(&lbolt, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer() { mtx_lock_spin(&sched_lock); if (updateproc->p_wchan == &lbolt) setrunnable(updateproc); mtx_unlock_spin(&sched_lock); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; return (1); } return(0); } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; bp->b_flags |= B_PAGING; bp->b_dev = vn_todev(vp); } /* * Disassociate a p-buffer from a vnode. */ void pbrelvp(bp) register struct buf *bp; { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); /* XXX REMOVE ME */ if (TAILQ_NEXT(bp, b_vnbufs) != NULL) { panic( "relpbuf(): b_vp was probably reassignbuf()d %p %x", bp, (int)bp->b_flags ); } bp->b_vp = (struct vnode *) 0; bp->b_flags &= ~B_PAGING; } /* * Change the vnode a pager buffer is associated with. */ void pbreassignbuf(bp, newvp) struct buf *bp; struct vnode *newvp; { KASSERT(bp->b_flags & B_PAGING, ("pbreassignbuf() on non phys bp %p", bp)); bp->b_vp = newvp; } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { struct buflists *listheadp; int delay; int s; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } ++reassignbufcalls; /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); s = splbio(); /* * Delete from old vnode list, if on one. */ if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { if (bp->b_xflags & BX_VNDIRTY) listheadp = &bp->b_vp->v_dirtyblkhd; else listheadp = &bp->b_vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); if (bp->b_vp != newvp) { vdrop(bp->b_vp); bp->b_vp = NULL; /* for clarification */ } } /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { struct buf *tbp; listheadp = &newvp->v_dirtyblkhd; if ((newvp->v_flag & VONWORKLST) == 0) { switch (newvp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: if (newvp->v_rdev->si_mountpoint != NULL) { delay = metadelay; break; } /* fall through */ default: delay = filedelay; } vn_syncer_add_to_worklist(newvp, delay); } bp->b_xflags |= BX_VNDIRTY; tbp = TAILQ_FIRST(listheadp); if (tbp == NULL || bp->b_lblkno == 0 || (bp->b_lblkno > 0 && tbp->b_lblkno < 0) || (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); ++reassignbufsortgood; } else if (bp->b_lblkno < 0) { TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); ++reassignbufsortgood; } else if (reassignbufmethod == 1) { /* * New sorting algorithm, only handle sequential case, * otherwise append to end (but before metadata) */ if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && (tbp->b_xflags & BX_VNDIRTY)) { /* * Found the best place to insert the buffer */ TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); ++reassignbufsortgood; } else { /* * Missed, append to end, but before meta-data. * We know that the head buffer in the list is * not meta-data due to prior conditionals. * * Indirect effects: NFS second stage write * tends to wind up here, giving maximum * distance between the unstable write and the * commit rpc. */ tbp = TAILQ_LAST(listheadp, buflists); while (tbp && tbp->b_lblkno < 0) tbp = TAILQ_PREV(tbp, buflists, b_vnbufs); TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); ++reassignbufsortbad; } } else { /* * Old sorting algorithm, scan queue and insert */ struct buf *ttbp; while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && (ttbp->b_lblkno < bp->b_lblkno)) { ++reassignbufloops; tbp = ttbp; } TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); } } else { bp->b_xflags |= BX_VNCLEAN; TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); if ((newvp->v_flag & VONWORKLST) && TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { newvp->v_flag &= ~VONWORKLST; LIST_REMOVE(newvp, v_synclist); } } if (bp->b_vp != newvp) { bp->b_vp = newvp; vhold(bp->b_vp); } splx(s); } /* * Create a vnode for a device. * Used for mounting the root file system. */ int bdevvp(dev, vpp) dev_t dev; struct vnode **vpp; { register struct vnode *vp; struct vnode *nvp; int error; if (dev == NODEV) { *vpp = NULLVP; return (ENXIO); } if (vfinddev(dev, VCHR, vpp)) return (0); error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); if (error) { *vpp = NULLVP; return (error); } vp = nvp; vp->v_type = VCHR; addalias(vp, dev); *vpp = vp; return (0); } /* * Add vnode to the alias list hung off the dev_t. * * The reason for this gunk is that multiple vnodes can reference * the same physical device, so checking vp->v_usecount to see * how many users there are is inadequate; the v_usecount for * the vnodes need to be accumulated. vcount() does that. */ struct vnode * addaliasu(nvp, nvp_rdev) struct vnode *nvp; udev_t nvp_rdev; { struct vnode *ovp; vop_t **ops; dev_t dev; if (nvp->v_type == VBLK) return (nvp); if (nvp->v_type != VCHR) panic("addaliasu on non-special vnode"); dev = udev2dev(nvp_rdev, 0); /* * Check to see if we have a bdevvp vnode with no associated * filesystem. If so, we want to associate the filesystem of * the new newly instigated vnode with the bdevvp vnode and * discard the newly created vnode rather than leaving the * bdevvp vnode lying around with no associated filesystem. */ if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) { addalias(nvp, dev); return (nvp); } /* * Discard unneeded vnode, but save its node specific data. * Note that if there is a lock, it is carried over in the * node specific data to the replacement vnode. */ vref(ovp); ovp->v_data = nvp->v_data; ovp->v_tag = nvp->v_tag; nvp->v_data = NULL; lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg, nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK); if (nvp->v_vnlock) ovp->v_vnlock = &ovp->v_lock; ops = ovp->v_op; ovp->v_op = nvp->v_op; if (VOP_ISLOCKED(nvp, curproc)) { VOP_UNLOCK(nvp, 0, curproc); vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curproc); } nvp->v_op = ops; insmntque(ovp, nvp->v_mount); vrele(nvp); vgone(nvp); return (ovp); } /* This is a local helper function that do the same as addaliasu, but for a * dev_t instead of an udev_t. */ static void addalias(nvp, dev) struct vnode *nvp; dev_t dev; { KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode")); nvp->v_rdev = dev; mtx_lock(&spechash_mtx); SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext); mtx_unlock(&spechash_mtx); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set if the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new file system type). */ int vget(vp, flags, p) register struct vnode *vp; int flags; struct proc *p; { int error; /* * If the vnode is in the process of being cleaned out for * another use, we wait for the cleaning to finish and then * return failure. Cleaning is determined by checking that * the VXLOCK flag is set. */ if ((flags & LK_INTERLOCK) == 0) mtx_lock(&vp->v_interlock); if (vp->v_flag & VXLOCK) { if (vp->v_vxproc == curproc) { printf("VXLOCK interlock avoided\n"); } else { vp->v_flag |= VXWANT; msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, "vget", 0); return (ENOENT); } } vp->v_usecount++; if (VSHOULDBUSY(vp)) vbusy(vp); if (flags & LK_TYPE_MASK) { if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) { /* * must expand vrele here because we do not want * to call VOP_INACTIVE if the reference count * drops back to zero since it was never really * active. We must remove it from the free list * before sleeping so that multiple processes do * not try to recycle it. */ mtx_lock(&vp->v_interlock); vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); mtx_unlock(&vp->v_interlock); } return (error); } mtx_unlock(&vp->v_interlock); return (0); } /* * Increase the reference count of a vnode. */ void vref(struct vnode *vp) { mtx_lock(&vp->v_interlock); vp->v_usecount++; mtx_unlock(&vp->v_interlock); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(vp) struct vnode *vp; { struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); mtx_lock(&vp->v_interlock); KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close")); if (vp->v_usecount > 1) { vp->v_usecount--; mtx_unlock(&vp->v_interlock); return; } if (vp->v_usecount == 1) { vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); /* * If we are doing a vput, the node is already locked, and we must * call VOP_INACTIVE with the node locked. So, in the case of * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. */ if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { VOP_INACTIVE(vp, p); } } else { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); mtx_unlock(&vp->v_interlock); #endif panic("vrele: negative ref cnt"); } } /* * Release an already locked vnode. This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() aquires the lock internally.) */ void vput(vp) struct vnode *vp; { struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vput: null vp")); mtx_lock(&vp->v_interlock); KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close")); if (vp->v_usecount > 1) { vp->v_usecount--; VOP_UNLOCK(vp, LK_INTERLOCK, p); return; } if (vp->v_usecount == 1) { vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); /* * If we are doing a vput, the node is already locked, and we must * call VOP_INACTIVE with the node locked. So, in the case of * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. */ mtx_unlock(&vp->v_interlock); VOP_INACTIVE(vp, p); } else { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } } /* * Somebody doesn't want the vnode recycled. */ void vhold(vp) register struct vnode *vp; { int s; s = splbio(); vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); splx(s); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). */ void vdrop(vp) register struct vnode *vp; { int s; s = splbio(); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt"); vp->v_holdcnt--; if (VSHOULDFREE(vp)) vfree(vp); splx(s); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If MNT_NOFORCE is specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If MNT_FORCE is specified, detach any active vnodes * that are found. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush(mp, skipvp, flags) struct mount *mp; struct vnode *skipvp; int flags; { struct proc *p = curproc; /* XXX */ struct vnode *vp, *nvp; int busy = 0; mtx_lock(&mntvnode_mtx); loop: for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { /* * Make sure this vnode wasn't reclaimed in getnewvnode(). * Start over if it has (it won't be on the list anymore). */ if (vp->v_mount != mp) goto loop; nvp = LIST_NEXT(vp, v_mntvnodes); /* * Skip over a selected vnode. */ if (vp == skipvp) continue; mtx_lock(&vp->v_interlock); /* * Skip over a vnodes marked VSYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { mtx_unlock(&vp->v_interlock); continue; } /* * If WRITECLOSE is set, only flush out regular file vnodes * open for writing. */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) { mtx_unlock(&vp->v_interlock); continue; } /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. */ if (vp->v_usecount == 0) { mtx_unlock(&mntvnode_mtx); vgonel(vp, p); mtx_lock(&mntvnode_mtx); continue; } /* * If FORCECLOSE is set, forcibly close the vnode. For block * or character devices, revert to an anonymous device. For * all other files, just kill them. */ if (flags & FORCECLOSE) { mtx_unlock(&mntvnode_mtx); if (vp->v_type != VCHR) { vgonel(vp, p); } else { vclean(vp, 0, p); vp->v_op = spec_vnodeop_p; insmntque(vp, (struct mount *) 0); } mtx_lock(&mntvnode_mtx); continue; } #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif mtx_unlock(&vp->v_interlock); busy++; } mtx_unlock(&mntvnode_mtx); if (busy) return (EBUSY); return (0); } /* * Disassociate the underlying file system from a vnode. */ static void vclean(vp, flags, p) struct vnode *vp; int flags; struct proc *p; { int active; /* * Check to see if the vnode is in use. If so we have to reference it * before we clean it out so that its count cannot fall to zero and * generate a race against ourselves to recycle it. */ if ((active = vp->v_usecount)) vp->v_usecount++; /* * Prevent the vnode from being recycled or brought into use while we * clean it out. */ if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; vp->v_vxproc = curproc; /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. The VOP_LOCK * ensures that the VOP_INACTIVE routine is done with its work. * For active vnodes, it ensures that no other activity can * occur while the underlying object is being cleaned out. */ VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ if (flags & DOCLOSE) { if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL) (void) vn_write_suspend_wait(vp, NULL, V_WAIT); if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0) vinvalbuf(vp, 0, NOCRED, p, 0, 0); } VOP_DESTROYVOBJECT(vp); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. Note that the * VOP_INACTIVE will unlock the vnode. */ if (active) { if (flags & DOCLOSE) VOP_CLOSE(vp, FNONBLOCK, NOCRED, p); VOP_INACTIVE(vp, p); } else { /* * Any other processes trying to obtain this lock must first * wait for VXLOCK to clear, then call the new lock operation. */ VOP_UNLOCK(vp, 0, p); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, p)) panic("vclean: cannot reclaim"); if (active) { /* * Inline copy of vrele() since VOP_INACTIVE * has already been called. */ mtx_lock(&vp->v_interlock); if (--vp->v_usecount <= 0) { #ifdef DIAGNOSTIC if (vp->v_usecount < 0 || vp->v_writecount != 0) { vprint("vclean: bad ref count", vp); panic("vclean: ref cnt"); } #endif vfree(vp); } mtx_unlock(&vp->v_interlock); } cache_purge(vp); vp->v_vnlock = NULL; lockdestroy(&vp->v_lock); if (VSHOULDFREE(vp)) vfree(vp); /* * Done with purge, notify sleepers of the grim news. */ vp->v_op = dead_vnodeop_p; vn_pollgone(vp); vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; vp->v_vxproc = NULL; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); } } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. */ int vop_revoke(ap) struct vop_revoke_args /* { struct vnode *a_vp; int a_flags; } */ *ap; { struct vnode *vp, *vq; dev_t dev; KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); vp = ap->a_vp; /* * If a vgone (or vclean) is already in progress, * wait until it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, "vop_revokeall", 0); return (0); } dev = vp->v_rdev; for (;;) { mtx_lock(&spechash_mtx); vq = SLIST_FIRST(&dev->si_hlist); mtx_unlock(&spechash_mtx); if (!vq) break; vgone(vq); } return (0); } /* * Recycle an unused vnode to the front of the free list. * Release the passed interlock if the vnode will be recycled. */ int vrecycle(vp, inter_lkp, p) struct vnode *vp; struct mtx *inter_lkp; struct proc *p; { mtx_lock(&vp->v_interlock); if (vp->v_usecount == 0) { if (inter_lkp) { mtx_unlock(inter_lkp); } vgonel(vp, p); return (1); } mtx_unlock(&vp->v_interlock); return (0); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(vp) register struct vnode *vp; { struct proc *p = curproc; /* XXX */ mtx_lock(&vp->v_interlock); vgonel(vp, p); } /* * vgone, with the vp interlock held. */ void vgonel(vp, p) struct vnode *vp; struct proc *p; { int s; /* * If a vgone (or vclean) is already in progress, * wait until it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, "vgone", 0); return; } /* * Clean out the filesystem specific data. */ vclean(vp, DOCLOSE, p); mtx_lock(&vp->v_interlock); /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) insmntque(vp, (struct mount *)0); /* * If special device, remove it from special device alias list * if it is on one. */ if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) { mtx_lock(&spechash_mtx); SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext); freedev(vp->v_rdev); mtx_unlock(&spechash_mtx); vp->v_rdev = NULL; } /* * If it is on the freelist and not already at the head, * move it to the head of the list. The test of the * VDOOMED flag and the reference count of zero is because * it will be removed from the free list by getnewvnode, * but will not have its reference count incremented until * after calling vgone. If the reference count were * incremented first, vgone would (incorrectly) try to * close the previous instance of the underlying object. */ if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { s = splbio(); mtx_lock(&vnode_free_list_mtx); if (vp->v_flag & VFREE) TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); else freevnodes++; vp->v_flag |= VFREE; TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); mtx_unlock(&vnode_free_list_mtx); splx(s); } vp->v_type = VBAD; mtx_unlock(&vp->v_interlock); } /* * Lookup a vnode by device number. */ int vfinddev(dev, type, vpp) dev_t dev; enum vtype type; struct vnode **vpp; { struct vnode *vp; mtx_lock(&spechash_mtx); SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { if (type == vp->v_type) { *vpp = vp; mtx_unlock(&spechash_mtx); return (1); } } mtx_unlock(&spechash_mtx); return (0); } /* * Calculate the total number of references to a special device. */ int vcount(vp) struct vnode *vp; { struct vnode *vq; int count; count = 0; mtx_lock(&spechash_mtx); SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext) count += vq->v_usecount; mtx_unlock(&spechash_mtx); return (count); } /* * Same as above, but using the dev_t as argument */ int count_dev(dev) dev_t dev; { struct vnode *vp; vp = SLIST_FIRST(&dev->si_hlist); if (vp == NULL) return (0); return(vcount(vp)); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; struct vnode *vp; { char buf[96]; if (label != NULL) printf("%s: %p: ", label, (void *)vp); else printf("%p: ", (void *)vp); printf("type %s, usecount %d, writecount %d, refcount %d,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VDOOMED) strcat(buf, "|VDOOMED"); if (vp->v_flag & VFREE) strcat(buf, "|VFREE"); if (vp->v_flag & VOBJBUF) strcat(buf, "|VOBJBUF"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB #include /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *vp; printf("Locked vnodes\n"); mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { if (VOP_ISLOCKED(vp, NULL)) vprint((char *)0, vp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, p); } mtx_unlock(&mountlist_mtx); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS)); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif /* XXX the below code does not compile; vfs_sysctl does not exist. */ #ifdef notyet /* all sysctl names at this level are at least name and field */ if (namelen < 2) return (ENOTDIR); /* overloaded */ if (name[0] != VFS_GENERIC) { for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[0]) break; if (vfsp == NULL) return (EOPNOTSUPP); return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, oldp, oldlenp, newp, newlen, p)); } #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); } return (EOPNOTSUPP); } SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #if COMPILING_LINT #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *nvp, *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } again: mtx_lock(&mntvnode_mtx); for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { /* * Check that the vp is still associated with * this filesystem. RACE: could have been * recycled onto the same filesystem. */ if (vp->v_mount != mp) { mtx_unlock(&mntvnode_mtx); goto again; } nvp = LIST_NEXT(vp, v_mntvnodes); mtx_unlock(&mntvnode_mtx); if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) return (error); mtx_lock(&mntvnode_mtx); } mtx_unlock(&mntvnode_mtx); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, p); } mtx_unlock(&mountlist_mtx); return (0); } /* * XXX * Exporting the vnode list on large systems causes them to crash. * Exporting the vnode list on medium systems causes sysctl to coredump. */ SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); #endif /* * Check to see if a filesystem is mounted on a block device. */ int vfs_mountedon(vp) struct vnode *vp; { if (vp->v_rdev->si_mountpoint != NULL) return (EBUSY); return (0); } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall() { struct mount *mp; struct proc *p; int error; if (curproc != NULL) p = curproc; else p = initproc; /* XXX XXX should this be proc0? */ /* * Since this only runs when rebooting, it is not interlocked. */ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, p); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } else { /* The unmount has removed mp from the mountlist */ } } } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. */ static int vfs_hang_addrlist(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { register struct netcred *np; register struct radix_node_head *rnh; register int i; struct radix_node *rn; struct sockaddr *saddr, *smask = 0; struct domain *dom; int error; if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) return (EPERM); np = &nep->ne_defexported; np->netc_exflags = argp->ex_flags; bzero(&np->netc_anon, sizeof(np->netc_anon)); np->netc_anon.cr_uid = argp->ex_anon.cr_uid; np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, sizeof(np->netc_anon.cr_groups)); np->netc_anon.cr_ref = 1; mp->mnt_flag |= MNT_DEFEXPORTED; return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); saddr = (struct sockaddr *) (np + 1); if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; } i = saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* * Seems silly to initialize every AF when most are not used, * do so on demand here */ for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { dom->dom_rtattach((void **) &nep->ne_rtable[i], dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { error = ENOBUFS; goto out; } } rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, np->netc_rnodes); if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ error = EPERM; goto out; } np->netc_exflags = argp->ex_flags; bzero(&np->netc_anon, sizeof(np->netc_anon)); np->netc_anon.cr_uid = argp->ex_anon.cr_uid; np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, sizeof(np->netc_anon.cr_groups)); np->netc_anon.cr_ref = 1; return (0); out: free(np, M_NETADDR); return (error); } /* Helper for vfs_free_addrlist. */ /* ARGSUSED */ static int vfs_free_netcred(rn, w) struct radix_node *rn; void *w; { register struct radix_node_head *rnh = (struct radix_node_head *) w; (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); free((caddr_t) rn, M_NETADDR); return (0); } /* * Free the net address hash lists that are hanging off the mount points. */ static void vfs_free_addrlist(nep) struct netexport *nep; { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) if ((rnh = nep->ne_rtable[i])) { (*rnh->rnh_walktree) (rnh, vfs_free_netcred, (caddr_t) rnh); free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } /* * High level function to manipulate export options on a mount point * and the passed in netexport. * Struct export_args *argp is the variable used to twiddle options, * the structure is described in sys/mount.h */ int vfs_export(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; if (argp->ex_flags & MNT_DELEXPORT) { if (mp->mnt_flag & MNT_EXPUBLIC) { vfs_setpublicfs(NULL, NULL, NULL); mp->mnt_flag &= ~MNT_EXPUBLIC; } vfs_free_addrlist(nep); mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { if (argp->ex_flags & MNT_EXPUBLIC) { if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) return (error); mp->mnt_flag |= MNT_EXPUBLIC; } if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } return (0); } /* * Set the publicly exported filesystem (WebNFS). Currently, only * one public filesystem is possible in the spec (RFC 2054 and 2055) */ int vfs_setpublicfs(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; struct vnode *rvp; char *cp; /* * mp == NULL -> invalidate the current info, the FS is * no longer exported. May be called from either vfs_export * or unmount, so check if it hasn't already been done. */ if (mp == NULL) { if (nfs_pub.np_valid) { nfs_pub.np_valid = 0; if (nfs_pub.np_index != NULL) { FREE(nfs_pub.np_index, M_TEMP); nfs_pub.np_index = NULL; } } return (0); } /* * Only one allowed at a time. */ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) return (EBUSY); /* * Get real filehandle for root of exported FS. */ bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; if ((error = VFS_ROOT(mp, &rvp))) return (error); if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) return (error); vput(rvp); /* * If an indexfile was specified, pull it in. */ if (argp->ex_indexfile != NULL) { MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, M_WAITOK); error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, MAXNAMLEN, (size_t *)0); if (!error) { /* * Check for illegal filenames. */ for (cp = nfs_pub.np_index; *cp; cp++) { if (*cp == '/') { error = EINVAL; break; } } } if (error) { FREE(nfs_pub.np_index, M_TEMP); return (error); } } nfs_pub.np_mount = mp; nfs_pub.np_valid = 1; return (0); } /* * Used by the filesystems to determine if a given network address * (passed in 'nam') is present in thier exports list, returns a pointer * to struct netcred so that the filesystem can examine it for * access rights (read/write/etc). */ struct netcred * vfs_export_lookup(mp, nep, nam) register struct mount *mp; struct netexport *nep; struct sockaddr *nam; { register struct netcred *np; register struct radix_node_head *rnh; struct sockaddr *saddr; np = NULL; if (mp->mnt_flag & MNT_EXPORTED) { /* * Lookup in the export list first. */ if (nam != NULL) { saddr = nam; rnh = nep->ne_rtable[saddr->sa_family]; if (rnh != NULL) { np = (struct netcred *) (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. */ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) np = &nep->ne_defexported; } return (np); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *nvp; struct vm_object *obj; int anyio, tries; tries = 5; loop: anyio = 0; for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { nvp = LIST_NEXT(vp, v_mntvnodes); if (vp->v_mount != mp) { goto loop; } if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ continue; if (flags != MNT_WAIT) { if (VOP_GETVOBJECT(vp, &obj) != 0 || (obj->flags & OBJ_MIGHTBEDIRTY) == 0) continue; if (VOP_ISLOCKED(vp, NULL)) continue; } mtx_lock(&vp->v_interlock); if (VOP_GETVOBJECT(vp, &obj) == 0 && (obj->flags & OBJ_MIGHTBEDIRTY)) { if (!vget(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { if (VOP_GETVOBJECT(vp, &obj) == 0) { vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); anyio = 1; } vput(vp); } } else { mtx_unlock(&vp->v_interlock); } } if (anyio && (--tries > 0)) goto loop; } /* * Create the VM object needed for VMIO and mmap support. This * is done for all VREG files in the system. Some filesystems might * afford the additional metadata buffering capability of the * VMIO code by making the device node be VMIO mode also. * * vp must be locked when vfs_object_create is called. */ int vfs_object_create(vp, p, cred) struct vnode *vp; struct proc *p; struct ucred *cred; { return (VOP_CREATEVOBJECT(vp, cred, p)); } /* * Mark a vnode as free, putting it up for recycling. */ void vfree(vp) struct vnode *vp; { int s; s = splbio(); mtx_lock(&vnode_free_list_mtx); KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; mtx_unlock(&vnode_free_list_mtx); vp->v_flag &= ~VAGE; vp->v_flag |= VFREE; splx(s); } /* * Opposite of vfree() - mark a vnode as in use. */ void vbusy(vp) struct vnode *vp; { int s; s = splbio(); mtx_lock(&vnode_free_list_mtx); KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; mtx_unlock(&vnode_free_list_mtx); vp->v_flag &= ~(VFREE|VAGE); splx(s); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(vp, p, events) struct vnode *vp; struct proc *p; short events; { mtx_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo.vpi_revents; vp->v_pollinfo.vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo.vpi_lock); return events; } vp->v_pollinfo.vpi_events |= events; selrecord(p, &vp->v_pollinfo.vpi_selinfo); mtx_unlock(&vp->v_pollinfo.vpi_lock); return 0; } /* * Note the occurrence of an event. If the VN_POLLEVENT macro is used, * it is possible for us to miss an event due to race conditions, but * that condition is expected to be rare, so for the moment it is the * preferred interface. */ void vn_pollevent(vp, events) struct vnode *vp; short events; { mtx_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_events & events) { /* * We clear vpi_events so that we don't * call selwakeup() twice if two events are * posted before the polling process(es) is * awakened. This also ensures that we take at * most one selwakeup() if the polling process * is no longer interested. However, it does * mean that only one event can be noticed at * a time. (Perhaps we should only clear those * event bits which we note?) XXX */ vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */ vp->v_pollinfo.vpi_revents |= events; selwakeup(&vp->v_pollinfo.vpi_selinfo); } mtx_unlock(&vp->v_pollinfo.vpi_lock); } +#define VN_KNOTE(vp, b) \ + KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b)) + /* * Wake up anyone polling on vp because it is being revoked. * This depends on dead_poll() returning POLLHUP for correct * behavior. */ void vn_pollgone(vp) struct vnode *vp; { mtx_lock(&vp->v_pollinfo.vpi_lock); + VN_KNOTE(vp, NOTE_REVOKE); if (vp->v_pollinfo.vpi_events) { vp->v_pollinfo.vpi_events = 0; selwakeup(&vp->v_pollinfo.vpi_selinfo); } mtx_unlock(&vp->v_pollinfo.vpi_lock); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*) __P((struct vop_close_args *)))nullop) static int sync_fsync __P((struct vop_fsync_args *)); static int sync_inactive __P((struct vop_inactive_args *)); static int sync_reclaim __P((struct vop_reclaim_args *)); #define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) #define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) static int sync_print __P((struct vop_print_args *)); #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) static vop_t **sync_vnodeop_p; static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_eopnotsupp }, { &vop_close_desc, (vop_t *) sync_close }, /* close */ { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ { &vop_print_desc, (vop_t *) sync_print }, /* print */ { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ { NULL, NULL } }; static struct vnodeopv_desc sync_vnodeop_opv_desc = { &sync_vnodeop_p, sync_vnodeop_entries }; VNODEOP_SET(sync_vnodeop_opv_desc); /* * Create a new filesystem syncer vnode for the specified mount point. */ int vfs_allocate_syncvnode(mp) struct mount *mp; { struct vnode *vp; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; struct proc *p = ap->a_p; int asyncflag; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ vn_syncer_add_to_worklist(syncvp, syncdelay); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp, p); return (0); } asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; vn_finished_write(mp); vfs_unbusy(mp, p); return (0); } /* * The syncer vnode is no referenced. */ static int sync_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected at splbio(). */ static int sync_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; int s; s = splbio(); vp->v_mount->mnt_syncer = NULL; if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); vp->v_flag &= ~VONWORKLST; } splx(s); return (0); } /* * Print out a syncer vnode. */ static int sync_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; printf("syncer vnode"); if (vp->v_vnlock != NULL) lockmgr_printinfo(vp->v_vnlock); printf("\n"); return (0); } /* * extract the dev_t from a VCHR */ dev_t vn_todev(vp) struct vnode *vp; { if (vp->v_type != VCHR) return (NODEV); return (vp->v_rdev); } /* * Check if vnode represents a disk device */ int vn_isdisk(vp, errp) struct vnode *vp; int *errp; { struct cdevsw *cdevsw; if (vp->v_type != VCHR) { if (errp != NULL) *errp = ENOTBLK; return (0); } if (vp->v_rdev == NULL) { if (errp != NULL) *errp = ENXIO; return (0); } cdevsw = devsw(vp->v_rdev); if (cdevsw == NULL) { if (errp != NULL) *errp = ENXIO; return (0); } if (!(cdevsw->d_flags & D_DISK)) { if (errp != NULL) *errp = ENOTBLK; return (0); } if (errp != NULL) *errp = 0; return (1); } /* * Free data allocated by namei(); see namei(9) for details. */ void NDFREE(ndp, flags) struct nameidata *ndp; const uint flags; { if (!(flags & NDF_NO_FREE_PNBUF) && (ndp->ni_cnd.cn_flags & HASBUF)) { zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); ndp->ni_cnd.cn_flags &= ~HASBUF; } if (!(flags & NDF_NO_DVP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKPARENT) && ndp->ni_dvp != ndp->ni_vp) VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc); if (!(flags & NDF_NO_DVP_RELE) && (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { vrele(ndp->ni_dvp); ndp->ni_dvp = NULL; } if (!(flags & NDF_NO_VP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc); if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) { vrele(ndp->ni_vp); ndp->ni_vp = NULL; } if (!(flags & NDF_NO_STARTDIR_RELE) && (ndp->ni_cnd.cn_flags & SAVESTART)) { vrele(ndp->ni_startdir); ndp->ni_startdir = NULL; } } /* * Common file system object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request. Returns 0 on success, or an errno on failure. */ int vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) enum vtype type; mode_t file_mode; uid_t file_uid; gid_t file_gid; mode_t acc_mode; struct ucred *cred; int *privused; { mode_t dac_granted; #ifdef CAPABILITIES mode_t cap_granted; #endif /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); privcheck: if (!suser_xxx(cred, NULL, PRISON_ROOT)) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } #ifdef CAPABILITIES /* * Build a capability mask to determine if the set of capabilities * satisfies the requirements when combined with the granted mask * from above. * For each capability, if the capability is required, bitwise * or the request type onto the cap_granted mask. */ cap_granted = 0; if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } #endif return ((acc_mode & VADMIN) ? EPERM : EACCES); } Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c (revision 72955) +++ head/sys/kern/vfs_subr.c (revision 72956) @@ -1,3145 +1,3150 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 * $FreeBSD$ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static void addalias __P((struct vnode *vp, dev_t nvp_rdev)); static void insmntque __P((struct vnode *vp, struct mount *mp)); static void vclean __P((struct vnode *vp, int flags, struct proc *p)); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, never decreased. */ static unsigned long numvnodes; SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Minimum number of free vnodes. If there are fewer than this free vnodes, * getnewvnode() will return a newly allocated vnode. */ static u_long wantfreevnodes = 25; SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes = 0; SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); static int reassignbufloops; SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); static int reassignbufsortgood; SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); static int reassignbufsortbad; SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); /* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */ static int reassignbufmethod = 1; SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); #ifdef ENABLE_VFS_IOOPT /* See NOTES for a description of this setting. */ int vfs_ioopt = 0; SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); #endif /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; /* For any iteration/modification of mnt_vnodelist */ struct mtx mntvnode_mtx; /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. */ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* For any iteration/modification of vnode_free_list */ static struct mtx vnode_free_list_mtx; /* * For any iteration/modification of dev->si_hlist (linked through * v_specnext) */ static struct mtx spechash_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static vm_zone_t vnode_zone; /* Set to 1 to print out reclaim of active vnodes */ int prtactive = 0; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno = 0; static long syncer_mask; LIST_HEAD(synclist, vnode); static struct synclist *syncer_workitem_pending; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ time_t syncdelay = 30; /* max time to delay syncing data */ time_t filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); time_t dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); time_t metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ static void vntblinit(void *dummy __unused) { desiredvnodes = maxproc + cnt.v_page_count / 4; mtx_init(&mountlist_mtx, "mountlist", MTX_DEF); mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF); mtx_init(&mntid_mtx, "mntid", MTX_DEF); mtx_init(&spechash_mtx, "spechash", MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF); vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. */ int vfs_busy(mp, flags, interlkp, p) struct mount *mp; int flags; struct mtx *interlkp; struct proc *p; { int lkflags; if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) return (ENOENT); mp->mnt_kern_flag |= MNTK_MWAIT; /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0); return (ENOENT); } lkflags = LK_SHARED | LK_NOPAUSE; if (interlkp) lkflags |= LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(mp, p) struct mount *mp; struct proc *p; { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); } /* * Lookup a filesystem type, and if found allocate and initialize * a mount structure for it. * * Devname is usually updated by mount(8) after booting. */ int vfs_rootmountalloc(fstypename, devname, mpp) char *fstypename; char *devname; struct mount **mpp; { struct proc *p = curproc; /* XXX */ struct vfsconf *vfsp; struct mount *mp; if (fstypename == NULL) return (ENODEV); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstypename)) break; if (vfsp == NULL) return (ENODEV); mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); (void)vfs_busy(mp, LK_NOWAIT, 0, p); LIST_INIT(&mp->mnt_vnodelist); mp->mnt_vfc = vfsp; mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_flag = MNT_RDONLY; mp->mnt_vnodecovered = NULLVP; vfsp->vfc_refcount++; mp->mnt_iosize_max = DFLTPHYS; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_stat.f_mntonname[0] = '/'; mp->mnt_stat.f_mntonname[1] = 0; (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); *mpp = mp; return (0); } /* * Find an appropriate filesystem to use for the root. If a filesystem * has not been preselected, walk through the list of known filesystems * trying those that have mountroot routines, and try them until one * works or we have tried them all. */ #ifdef notdef /* XXX JH */ int lite2_vfs_mountroot() { struct vfsconf *vfsp; extern int (*lite2_mountroot) __P((void)); int error; if (lite2_mountroot != NULL) return ((*lite2_mountroot)()); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { if (vfsp->vfc_mountroot == NULL) continue; if ((error = (*vfsp->vfc_mountroot)()) == 0) return (0); printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); } return (ENODEV); } #endif /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid) fsid_t *fsid; { register struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(mp) struct mount *mp; { static u_int16_t mntid_base; fsid_t tfsid; int mtype; mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makeudev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if (vfs_getvfs(&tfsid) == NULL) break; } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, ""); /* * Get a current timestamp. */ void vfs_timestamp(tsp) struct timespec *tsp; { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(vap) register struct vattr *vap; { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Routines having to do with the management of the vnode table. */ /* * Return the next vnode from the free list. */ int getnewvnode(tag, mp, vops, vpp) enum vtagtype tag; struct mount *mp; vop_t **vops; struct vnode **vpp; { int s, count; struct proc *p = curproc; /* XXX */ struct vnode *vp = NULL; struct mount *vnmp; vm_object_t object; /* * We take the least recently used vnode from the freelist * if we can get it and it has no cached pages, and no * namecache entries are relative to it. * Otherwise we allocate a new vnode */ s = splbio(); mtx_lock(&vnode_free_list_mtx); if (wantfreevnodes && freevnodes < wantfreevnodes) { vp = NULL; } else if (!wantfreevnodes && freevnodes <= desiredvnodes) { /* * XXX: this is only here to be backwards compatible */ vp = NULL; } else for (count = 0; count < freevnodes; count++) { vp = TAILQ_FIRST(&vnode_free_list); if (vp == NULL || vp->v_usecount) panic("getnewvnode: free vnode isn't"); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); /* * Don't recycle if active in the namecache or * if it still has cached pages or we cannot get * its interlock. */ if (LIST_FIRST(&vp->v_cache_src) != NULL || (VOP_GETVOBJECT(vp, &object) == 0 && (object->resident_page_count || object->ref_count)) || !mtx_trylock(&vp->v_interlock)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); vp = NULL; continue; } /* * Skip over it if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0) break; mtx_unlock(&vp->v_interlock); TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); vp = NULL; } if (vp) { vp->v_flag |= VDOOMED; vp->v_flag &= ~VFREE; freevnodes--; mtx_unlock(&vnode_free_list_mtx); cache_purge(vp); vp->v_lease = NULL; if (vp->v_type != VBAD) { vgonel(vp, p); } else { mtx_unlock(&vp->v_interlock); } vn_finished_write(vnmp); #ifdef INVARIANTS { int s; if (vp->v_data) panic("cleaned vnode isn't"); s = splbio(); if (vp->v_numoutput) panic("Clean vnode has pending I/O's"); splx(s); if (vp->v_writecount != 0) panic("Non-zero write count"); } #endif vp->v_flag = 0; vp->v_lastw = 0; vp->v_lasta = 0; vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; } else { mtx_unlock(&vnode_free_list_mtx); vp = (struct vnode *) zalloc(vnode_zone); bzero((char *) vp, sizeof *vp); mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF); vp->v_dd = vp; mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF); cache_purge(vp); LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); numvnodes++; } TAILQ_INIT(&vp->v_cleanblkhd); TAILQ_INIT(&vp->v_dirtyblkhd); vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE); insmntque(vp, mp); *vpp = vp; vp->v_usecount = 1; vp->v_data = 0; splx(s); vfs_object_create(vp, p, p->p_ucred); return (0); } /* * Move a vnode from one mount queue to another. */ static void insmntque(vp, mp) register struct vnode *vp; register struct mount *mp; { mtx_lock(&mntvnode_mtx); /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) LIST_REMOVE(vp, v_mntvnodes); /* * Insert into list of vnodes for the new mount point, if available. */ if ((vp->v_mount = mp) == NULL) { mtx_unlock(&mntvnode_mtx); return; } LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); mtx_unlock(&mntvnode_mtx); } /* * Update outstanding I/O count and do wakeup if requested. */ void vwakeup(bp) register struct buf *bp; { register struct vnode *vp; bp->b_flags &= ~B_WRITEINPROG; if ((vp = bp->b_vp)) { vp->v_numoutput--; if (vp->v_numoutput < 0) panic("vwakeup: neg numoutput"); if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t) &vp->v_numoutput); } } } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) register struct vnode *vp; int flags; struct ucred *cred; struct proc *p; int slpflag, slptimeo; { register struct buf *bp; struct buf *nbp, *blist; int s, error; vm_object_t object; if (flags & V_SAVE) { s = splbio(); while (vp->v_numoutput) { vp->v_flag |= VBWAIT; error = tsleep((caddr_t)&vp->v_numoutput, slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); if (error) { splx(s); return (error); } } if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { splx(s); if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0) return (error); s = splbio(); if (vp->v_numoutput > 0 || !TAILQ_EMPTY(&vp->v_dirtyblkhd)) panic("vinvalbuf: dirty bufs"); } splx(s); } s = splbio(); for (;;) { blist = TAILQ_FIRST(&vp->v_cleanblkhd); if (!blist) blist = TAILQ_FIRST(&vp->v_dirtyblkhd); if (!blist) break; for (bp = blist; bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "vinvalbuf", slpflag, slptimeo); if (error == ENOLCK) break; splx(s); return (error); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. Note that vfs_bio_awrite expects * buffers to reside on a queue, while VOP_BWRITE and * brelse do not. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { if (bp->b_vp == vp) { if (bp->b_flags & B_CLUSTEROK) { BUF_UNLOCK(bp); vfs_bio_awrite(bp); } else { bremfree(bp); bp->b_flags |= B_ASYNC; BUF_WRITE(bp); } } else { bremfree(bp); (void) BUF_WRITE(bp); } break; } bremfree(bp); bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); } } while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); } splx(s); /* * Destroy the copy in the VM cache, too. */ mtx_lock(&vp->v_interlock); if (VOP_GETVOBJECT(vp, &object) == 0) { vm_object_page_remove(object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); } mtx_unlock(&vp->v_interlock); if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) panic("vinvalbuf: flush failed"); return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(vp, cred, p, length, blksize) register struct vnode *vp; struct ucred *cred; struct proc *p; off_t length; int blksize; { register struct buf *bp; struct buf *nbp; int s, anyfreed; int trunclbn; /* * Round up to the *next* lbn. */ trunclbn = (length + blksize - 1) / blksize; s = splbio(); restart: anyfreed = 1; for (;anyfreed;) { anyfreed = 0; for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bp->b_lblkno >= trunclbn) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; } if (nbp && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { goto restart; } } } for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if (bp->b_lblkno >= trunclbn) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; } if (nbp && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { goto restart; } } } } if (length > 0) { restartsync: for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = TAILQ_NEXT(bp, b_vnbufs); if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); goto restart; } else { bremfree(bp); if (bp->b_vp == vp) { bp->b_flags |= B_ASYNC; } else { bp->b_flags &= ~B_ASYNC; } BUF_WRITE(bp); } goto restartsync; } } } while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); } splx(s); vnode_pager_setsize(vp, length); return (0); } /* * Associate a buffer with a vnode. */ void bgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { int s; KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); vhold(vp); bp->b_vp = vp; bp->b_dev = vn_todev(vp); /* * Insert onto list for new vnode. */ s = splbio(); bp->b_xflags |= BX_VNCLEAN; bp->b_xflags &= ~BX_VNDIRTY; TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); splx(s); } /* * Disassociate a buffer from a vnode. */ void brelvp(bp) register struct buf *bp; { struct vnode *vp; struct buflists *listheadp; int s; KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; s = splbio(); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { if (bp->b_xflags & BX_VNDIRTY) listheadp = &vp->v_dirtyblkhd; else listheadp = &vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { vp->v_flag &= ~VONWORKLST; LIST_REMOVE(vp, v_synclist); } splx(s); bp->b_vp = (struct vnode *) 0; vdrop(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct vnode *vp, int delay) { int s, slot; s = splbio(); if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); vp->v_flag |= VONWORKLST; splx(s); } struct proc *updateproc; static void sched_sync __P((void)); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) /* * System filesystem synchronizer daemon. */ void sched_sync(void) { struct synclist *slp; struct vnode *vp; struct mount *mp; long starttime; int s; struct proc *p = updateproc; mtx_lock(&Giant); EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_LAST); for (;;) { kthread_suspend_check(p); starttime = time_second; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. */ s = splbio(); slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { if (VOP_ISLOCKED(vp, NULL) == 0 && vn_start_write(vp, &mp, V_NOWAIT) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); VOP_UNLOCK(vp, 0, p); vn_finished_write(mp); } s = splbio(); if (LIST_FIRST(slp) == vp) { /* * Note: v_tag VT_VFS vps can remain on the * worklist too with no dirty blocks, but * since sync_fsync() moves it to a different * slot we are safe. */ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && !vn_isdisk(vp, NULL)) panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(vp, syncdelay); } splx(s); } /* * Do soft update processing. */ #ifdef SOFTUPDATES softdep_process_worklist(NULL); #endif /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (time_second == starttime) tsleep(&lbolt, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer() { mtx_lock_spin(&sched_lock); if (updateproc->p_wchan == &lbolt) setrunnable(updateproc); mtx_unlock_spin(&sched_lock); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; return (1); } return(0); } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; bp->b_flags |= B_PAGING; bp->b_dev = vn_todev(vp); } /* * Disassociate a p-buffer from a vnode. */ void pbrelvp(bp) register struct buf *bp; { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); /* XXX REMOVE ME */ if (TAILQ_NEXT(bp, b_vnbufs) != NULL) { panic( "relpbuf(): b_vp was probably reassignbuf()d %p %x", bp, (int)bp->b_flags ); } bp->b_vp = (struct vnode *) 0; bp->b_flags &= ~B_PAGING; } /* * Change the vnode a pager buffer is associated with. */ void pbreassignbuf(bp, newvp) struct buf *bp; struct vnode *newvp; { KASSERT(bp->b_flags & B_PAGING, ("pbreassignbuf() on non phys bp %p", bp)); bp->b_vp = newvp; } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { struct buflists *listheadp; int delay; int s; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } ++reassignbufcalls; /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); s = splbio(); /* * Delete from old vnode list, if on one. */ if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { if (bp->b_xflags & BX_VNDIRTY) listheadp = &bp->b_vp->v_dirtyblkhd; else listheadp = &bp->b_vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); if (bp->b_vp != newvp) { vdrop(bp->b_vp); bp->b_vp = NULL; /* for clarification */ } } /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { struct buf *tbp; listheadp = &newvp->v_dirtyblkhd; if ((newvp->v_flag & VONWORKLST) == 0) { switch (newvp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: if (newvp->v_rdev->si_mountpoint != NULL) { delay = metadelay; break; } /* fall through */ default: delay = filedelay; } vn_syncer_add_to_worklist(newvp, delay); } bp->b_xflags |= BX_VNDIRTY; tbp = TAILQ_FIRST(listheadp); if (tbp == NULL || bp->b_lblkno == 0 || (bp->b_lblkno > 0 && tbp->b_lblkno < 0) || (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); ++reassignbufsortgood; } else if (bp->b_lblkno < 0) { TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); ++reassignbufsortgood; } else if (reassignbufmethod == 1) { /* * New sorting algorithm, only handle sequential case, * otherwise append to end (but before metadata) */ if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && (tbp->b_xflags & BX_VNDIRTY)) { /* * Found the best place to insert the buffer */ TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); ++reassignbufsortgood; } else { /* * Missed, append to end, but before meta-data. * We know that the head buffer in the list is * not meta-data due to prior conditionals. * * Indirect effects: NFS second stage write * tends to wind up here, giving maximum * distance between the unstable write and the * commit rpc. */ tbp = TAILQ_LAST(listheadp, buflists); while (tbp && tbp->b_lblkno < 0) tbp = TAILQ_PREV(tbp, buflists, b_vnbufs); TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); ++reassignbufsortbad; } } else { /* * Old sorting algorithm, scan queue and insert */ struct buf *ttbp; while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && (ttbp->b_lblkno < bp->b_lblkno)) { ++reassignbufloops; tbp = ttbp; } TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); } } else { bp->b_xflags |= BX_VNCLEAN; TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); if ((newvp->v_flag & VONWORKLST) && TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { newvp->v_flag &= ~VONWORKLST; LIST_REMOVE(newvp, v_synclist); } } if (bp->b_vp != newvp) { bp->b_vp = newvp; vhold(bp->b_vp); } splx(s); } /* * Create a vnode for a device. * Used for mounting the root file system. */ int bdevvp(dev, vpp) dev_t dev; struct vnode **vpp; { register struct vnode *vp; struct vnode *nvp; int error; if (dev == NODEV) { *vpp = NULLVP; return (ENXIO); } if (vfinddev(dev, VCHR, vpp)) return (0); error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); if (error) { *vpp = NULLVP; return (error); } vp = nvp; vp->v_type = VCHR; addalias(vp, dev); *vpp = vp; return (0); } /* * Add vnode to the alias list hung off the dev_t. * * The reason for this gunk is that multiple vnodes can reference * the same physical device, so checking vp->v_usecount to see * how many users there are is inadequate; the v_usecount for * the vnodes need to be accumulated. vcount() does that. */ struct vnode * addaliasu(nvp, nvp_rdev) struct vnode *nvp; udev_t nvp_rdev; { struct vnode *ovp; vop_t **ops; dev_t dev; if (nvp->v_type == VBLK) return (nvp); if (nvp->v_type != VCHR) panic("addaliasu on non-special vnode"); dev = udev2dev(nvp_rdev, 0); /* * Check to see if we have a bdevvp vnode with no associated * filesystem. If so, we want to associate the filesystem of * the new newly instigated vnode with the bdevvp vnode and * discard the newly created vnode rather than leaving the * bdevvp vnode lying around with no associated filesystem. */ if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) { addalias(nvp, dev); return (nvp); } /* * Discard unneeded vnode, but save its node specific data. * Note that if there is a lock, it is carried over in the * node specific data to the replacement vnode. */ vref(ovp); ovp->v_data = nvp->v_data; ovp->v_tag = nvp->v_tag; nvp->v_data = NULL; lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg, nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK); if (nvp->v_vnlock) ovp->v_vnlock = &ovp->v_lock; ops = ovp->v_op; ovp->v_op = nvp->v_op; if (VOP_ISLOCKED(nvp, curproc)) { VOP_UNLOCK(nvp, 0, curproc); vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curproc); } nvp->v_op = ops; insmntque(ovp, nvp->v_mount); vrele(nvp); vgone(nvp); return (ovp); } /* This is a local helper function that do the same as addaliasu, but for a * dev_t instead of an udev_t. */ static void addalias(nvp, dev) struct vnode *nvp; dev_t dev; { KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode")); nvp->v_rdev = dev; mtx_lock(&spechash_mtx); SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext); mtx_unlock(&spechash_mtx); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set if the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new file system type). */ int vget(vp, flags, p) register struct vnode *vp; int flags; struct proc *p; { int error; /* * If the vnode is in the process of being cleaned out for * another use, we wait for the cleaning to finish and then * return failure. Cleaning is determined by checking that * the VXLOCK flag is set. */ if ((flags & LK_INTERLOCK) == 0) mtx_lock(&vp->v_interlock); if (vp->v_flag & VXLOCK) { if (vp->v_vxproc == curproc) { printf("VXLOCK interlock avoided\n"); } else { vp->v_flag |= VXWANT; msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, "vget", 0); return (ENOENT); } } vp->v_usecount++; if (VSHOULDBUSY(vp)) vbusy(vp); if (flags & LK_TYPE_MASK) { if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) { /* * must expand vrele here because we do not want * to call VOP_INACTIVE if the reference count * drops back to zero since it was never really * active. We must remove it from the free list * before sleeping so that multiple processes do * not try to recycle it. */ mtx_lock(&vp->v_interlock); vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); mtx_unlock(&vp->v_interlock); } return (error); } mtx_unlock(&vp->v_interlock); return (0); } /* * Increase the reference count of a vnode. */ void vref(struct vnode *vp) { mtx_lock(&vp->v_interlock); vp->v_usecount++; mtx_unlock(&vp->v_interlock); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(vp) struct vnode *vp; { struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); mtx_lock(&vp->v_interlock); KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close")); if (vp->v_usecount > 1) { vp->v_usecount--; mtx_unlock(&vp->v_interlock); return; } if (vp->v_usecount == 1) { vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); /* * If we are doing a vput, the node is already locked, and we must * call VOP_INACTIVE with the node locked. So, in the case of * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. */ if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { VOP_INACTIVE(vp, p); } } else { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); mtx_unlock(&vp->v_interlock); #endif panic("vrele: negative ref cnt"); } } /* * Release an already locked vnode. This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() aquires the lock internally.) */ void vput(vp) struct vnode *vp; { struct proc *p = curproc; /* XXX */ KASSERT(vp != NULL, ("vput: null vp")); mtx_lock(&vp->v_interlock); KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close")); if (vp->v_usecount > 1) { vp->v_usecount--; VOP_UNLOCK(vp, LK_INTERLOCK, p); return; } if (vp->v_usecount == 1) { vp->v_usecount--; if (VSHOULDFREE(vp)) vfree(vp); /* * If we are doing a vput, the node is already locked, and we must * call VOP_INACTIVE with the node locked. So, in the case of * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. */ mtx_unlock(&vp->v_interlock); VOP_INACTIVE(vp, p); } else { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } } /* * Somebody doesn't want the vnode recycled. */ void vhold(vp) register struct vnode *vp; { int s; s = splbio(); vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); splx(s); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). */ void vdrop(vp) register struct vnode *vp; { int s; s = splbio(); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt"); vp->v_holdcnt--; if (VSHOULDFREE(vp)) vfree(vp); splx(s); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If MNT_NOFORCE is specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If MNT_FORCE is specified, detach any active vnodes * that are found. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush(mp, skipvp, flags) struct mount *mp; struct vnode *skipvp; int flags; { struct proc *p = curproc; /* XXX */ struct vnode *vp, *nvp; int busy = 0; mtx_lock(&mntvnode_mtx); loop: for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { /* * Make sure this vnode wasn't reclaimed in getnewvnode(). * Start over if it has (it won't be on the list anymore). */ if (vp->v_mount != mp) goto loop; nvp = LIST_NEXT(vp, v_mntvnodes); /* * Skip over a selected vnode. */ if (vp == skipvp) continue; mtx_lock(&vp->v_interlock); /* * Skip over a vnodes marked VSYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { mtx_unlock(&vp->v_interlock); continue; } /* * If WRITECLOSE is set, only flush out regular file vnodes * open for writing. */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) { mtx_unlock(&vp->v_interlock); continue; } /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. */ if (vp->v_usecount == 0) { mtx_unlock(&mntvnode_mtx); vgonel(vp, p); mtx_lock(&mntvnode_mtx); continue; } /* * If FORCECLOSE is set, forcibly close the vnode. For block * or character devices, revert to an anonymous device. For * all other files, just kill them. */ if (flags & FORCECLOSE) { mtx_unlock(&mntvnode_mtx); if (vp->v_type != VCHR) { vgonel(vp, p); } else { vclean(vp, 0, p); vp->v_op = spec_vnodeop_p; insmntque(vp, (struct mount *) 0); } mtx_lock(&mntvnode_mtx); continue; } #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif mtx_unlock(&vp->v_interlock); busy++; } mtx_unlock(&mntvnode_mtx); if (busy) return (EBUSY); return (0); } /* * Disassociate the underlying file system from a vnode. */ static void vclean(vp, flags, p) struct vnode *vp; int flags; struct proc *p; { int active; /* * Check to see if the vnode is in use. If so we have to reference it * before we clean it out so that its count cannot fall to zero and * generate a race against ourselves to recycle it. */ if ((active = vp->v_usecount)) vp->v_usecount++; /* * Prevent the vnode from being recycled or brought into use while we * clean it out. */ if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; vp->v_vxproc = curproc; /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. The VOP_LOCK * ensures that the VOP_INACTIVE routine is done with its work. * For active vnodes, it ensures that no other activity can * occur while the underlying object is being cleaned out. */ VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ if (flags & DOCLOSE) { if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL) (void) vn_write_suspend_wait(vp, NULL, V_WAIT); if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0) vinvalbuf(vp, 0, NOCRED, p, 0, 0); } VOP_DESTROYVOBJECT(vp); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. Note that the * VOP_INACTIVE will unlock the vnode. */ if (active) { if (flags & DOCLOSE) VOP_CLOSE(vp, FNONBLOCK, NOCRED, p); VOP_INACTIVE(vp, p); } else { /* * Any other processes trying to obtain this lock must first * wait for VXLOCK to clear, then call the new lock operation. */ VOP_UNLOCK(vp, 0, p); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, p)) panic("vclean: cannot reclaim"); if (active) { /* * Inline copy of vrele() since VOP_INACTIVE * has already been called. */ mtx_lock(&vp->v_interlock); if (--vp->v_usecount <= 0) { #ifdef DIAGNOSTIC if (vp->v_usecount < 0 || vp->v_writecount != 0) { vprint("vclean: bad ref count", vp); panic("vclean: ref cnt"); } #endif vfree(vp); } mtx_unlock(&vp->v_interlock); } cache_purge(vp); vp->v_vnlock = NULL; lockdestroy(&vp->v_lock); if (VSHOULDFREE(vp)) vfree(vp); /* * Done with purge, notify sleepers of the grim news. */ vp->v_op = dead_vnodeop_p; vn_pollgone(vp); vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; vp->v_vxproc = NULL; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); } } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. */ int vop_revoke(ap) struct vop_revoke_args /* { struct vnode *a_vp; int a_flags; } */ *ap; { struct vnode *vp, *vq; dev_t dev; KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); vp = ap->a_vp; /* * If a vgone (or vclean) is already in progress, * wait until it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, "vop_revokeall", 0); return (0); } dev = vp->v_rdev; for (;;) { mtx_lock(&spechash_mtx); vq = SLIST_FIRST(&dev->si_hlist); mtx_unlock(&spechash_mtx); if (!vq) break; vgone(vq); } return (0); } /* * Recycle an unused vnode to the front of the free list. * Release the passed interlock if the vnode will be recycled. */ int vrecycle(vp, inter_lkp, p) struct vnode *vp; struct mtx *inter_lkp; struct proc *p; { mtx_lock(&vp->v_interlock); if (vp->v_usecount == 0) { if (inter_lkp) { mtx_unlock(inter_lkp); } vgonel(vp, p); return (1); } mtx_unlock(&vp->v_interlock); return (0); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(vp) register struct vnode *vp; { struct proc *p = curproc; /* XXX */ mtx_lock(&vp->v_interlock); vgonel(vp, p); } /* * vgone, with the vp interlock held. */ void vgonel(vp, p) struct vnode *vp; struct proc *p; { int s; /* * If a vgone (or vclean) is already in progress, * wait until it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, "vgone", 0); return; } /* * Clean out the filesystem specific data. */ vclean(vp, DOCLOSE, p); mtx_lock(&vp->v_interlock); /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) insmntque(vp, (struct mount *)0); /* * If special device, remove it from special device alias list * if it is on one. */ if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) { mtx_lock(&spechash_mtx); SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext); freedev(vp->v_rdev); mtx_unlock(&spechash_mtx); vp->v_rdev = NULL; } /* * If it is on the freelist and not already at the head, * move it to the head of the list. The test of the * VDOOMED flag and the reference count of zero is because * it will be removed from the free list by getnewvnode, * but will not have its reference count incremented until * after calling vgone. If the reference count were * incremented first, vgone would (incorrectly) try to * close the previous instance of the underlying object. */ if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { s = splbio(); mtx_lock(&vnode_free_list_mtx); if (vp->v_flag & VFREE) TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); else freevnodes++; vp->v_flag |= VFREE; TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); mtx_unlock(&vnode_free_list_mtx); splx(s); } vp->v_type = VBAD; mtx_unlock(&vp->v_interlock); } /* * Lookup a vnode by device number. */ int vfinddev(dev, type, vpp) dev_t dev; enum vtype type; struct vnode **vpp; { struct vnode *vp; mtx_lock(&spechash_mtx); SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { if (type == vp->v_type) { *vpp = vp; mtx_unlock(&spechash_mtx); return (1); } } mtx_unlock(&spechash_mtx); return (0); } /* * Calculate the total number of references to a special device. */ int vcount(vp) struct vnode *vp; { struct vnode *vq; int count; count = 0; mtx_lock(&spechash_mtx); SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext) count += vq->v_usecount; mtx_unlock(&spechash_mtx); return (count); } /* * Same as above, but using the dev_t as argument */ int count_dev(dev) dev_t dev; { struct vnode *vp; vp = SLIST_FIRST(&dev->si_hlist); if (vp == NULL) return (0); return(vcount(vp)); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; struct vnode *vp; { char buf[96]; if (label != NULL) printf("%s: %p: ", label, (void *)vp); else printf("%p: ", (void *)vp); printf("type %s, usecount %d, writecount %d, refcount %d,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VDOOMED) strcat(buf, "|VDOOMED"); if (vp->v_flag & VFREE) strcat(buf, "|VFREE"); if (vp->v_flag & VOBJBUF) strcat(buf, "|VOBJBUF"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB #include /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *vp; printf("Locked vnodes\n"); mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { if (VOP_ISLOCKED(vp, NULL)) vprint((char *)0, vp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, p); } mtx_unlock(&mountlist_mtx); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS)); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif /* XXX the below code does not compile; vfs_sysctl does not exist. */ #ifdef notyet /* all sysctl names at this level are at least name and field */ if (namelen < 2) return (ENOTDIR); /* overloaded */ if (name[0] != VFS_GENERIC) { for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[0]) break; if (vfsp == NULL) return (EOPNOTSUPP); return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, oldp, oldlenp, newp, newlen, p)); } #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); } return (EOPNOTSUPP); } SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #if COMPILING_LINT #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *nvp, *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } again: mtx_lock(&mntvnode_mtx); for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { /* * Check that the vp is still associated with * this filesystem. RACE: could have been * recycled onto the same filesystem. */ if (vp->v_mount != mp) { mtx_unlock(&mntvnode_mtx); goto again; } nvp = LIST_NEXT(vp, v_mntvnodes); mtx_unlock(&mntvnode_mtx); if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) return (error); mtx_lock(&mntvnode_mtx); } mtx_unlock(&mntvnode_mtx); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, p); } mtx_unlock(&mountlist_mtx); return (0); } /* * XXX * Exporting the vnode list on large systems causes them to crash. * Exporting the vnode list on medium systems causes sysctl to coredump. */ SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); #endif /* * Check to see if a filesystem is mounted on a block device. */ int vfs_mountedon(vp) struct vnode *vp; { if (vp->v_rdev->si_mountpoint != NULL) return (EBUSY); return (0); } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall() { struct mount *mp; struct proc *p; int error; if (curproc != NULL) p = curproc; else p = initproc; /* XXX XXX should this be proc0? */ /* * Since this only runs when rebooting, it is not interlocked. */ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, p); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } else { /* The unmount has removed mp from the mountlist */ } } } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. */ static int vfs_hang_addrlist(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { register struct netcred *np; register struct radix_node_head *rnh; register int i; struct radix_node *rn; struct sockaddr *saddr, *smask = 0; struct domain *dom; int error; if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) return (EPERM); np = &nep->ne_defexported; np->netc_exflags = argp->ex_flags; bzero(&np->netc_anon, sizeof(np->netc_anon)); np->netc_anon.cr_uid = argp->ex_anon.cr_uid; np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, sizeof(np->netc_anon.cr_groups)); np->netc_anon.cr_ref = 1; mp->mnt_flag |= MNT_DEFEXPORTED; return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); saddr = (struct sockaddr *) (np + 1); if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; } i = saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* * Seems silly to initialize every AF when most are not used, * do so on demand here */ for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { dom->dom_rtattach((void **) &nep->ne_rtable[i], dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { error = ENOBUFS; goto out; } } rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, np->netc_rnodes); if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ error = EPERM; goto out; } np->netc_exflags = argp->ex_flags; bzero(&np->netc_anon, sizeof(np->netc_anon)); np->netc_anon.cr_uid = argp->ex_anon.cr_uid; np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, sizeof(np->netc_anon.cr_groups)); np->netc_anon.cr_ref = 1; return (0); out: free(np, M_NETADDR); return (error); } /* Helper for vfs_free_addrlist. */ /* ARGSUSED */ static int vfs_free_netcred(rn, w) struct radix_node *rn; void *w; { register struct radix_node_head *rnh = (struct radix_node_head *) w; (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); free((caddr_t) rn, M_NETADDR); return (0); } /* * Free the net address hash lists that are hanging off the mount points. */ static void vfs_free_addrlist(nep) struct netexport *nep; { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) if ((rnh = nep->ne_rtable[i])) { (*rnh->rnh_walktree) (rnh, vfs_free_netcred, (caddr_t) rnh); free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } /* * High level function to manipulate export options on a mount point * and the passed in netexport. * Struct export_args *argp is the variable used to twiddle options, * the structure is described in sys/mount.h */ int vfs_export(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; if (argp->ex_flags & MNT_DELEXPORT) { if (mp->mnt_flag & MNT_EXPUBLIC) { vfs_setpublicfs(NULL, NULL, NULL); mp->mnt_flag &= ~MNT_EXPUBLIC; } vfs_free_addrlist(nep); mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { if (argp->ex_flags & MNT_EXPUBLIC) { if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) return (error); mp->mnt_flag |= MNT_EXPUBLIC; } if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } return (0); } /* * Set the publicly exported filesystem (WebNFS). Currently, only * one public filesystem is possible in the spec (RFC 2054 and 2055) */ int vfs_setpublicfs(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; struct vnode *rvp; char *cp; /* * mp == NULL -> invalidate the current info, the FS is * no longer exported. May be called from either vfs_export * or unmount, so check if it hasn't already been done. */ if (mp == NULL) { if (nfs_pub.np_valid) { nfs_pub.np_valid = 0; if (nfs_pub.np_index != NULL) { FREE(nfs_pub.np_index, M_TEMP); nfs_pub.np_index = NULL; } } return (0); } /* * Only one allowed at a time. */ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) return (EBUSY); /* * Get real filehandle for root of exported FS. */ bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; if ((error = VFS_ROOT(mp, &rvp))) return (error); if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) return (error); vput(rvp); /* * If an indexfile was specified, pull it in. */ if (argp->ex_indexfile != NULL) { MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, M_WAITOK); error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, MAXNAMLEN, (size_t *)0); if (!error) { /* * Check for illegal filenames. */ for (cp = nfs_pub.np_index; *cp; cp++) { if (*cp == '/') { error = EINVAL; break; } } } if (error) { FREE(nfs_pub.np_index, M_TEMP); return (error); } } nfs_pub.np_mount = mp; nfs_pub.np_valid = 1; return (0); } /* * Used by the filesystems to determine if a given network address * (passed in 'nam') is present in thier exports list, returns a pointer * to struct netcred so that the filesystem can examine it for * access rights (read/write/etc). */ struct netcred * vfs_export_lookup(mp, nep, nam) register struct mount *mp; struct netexport *nep; struct sockaddr *nam; { register struct netcred *np; register struct radix_node_head *rnh; struct sockaddr *saddr; np = NULL; if (mp->mnt_flag & MNT_EXPORTED) { /* * Lookup in the export list first. */ if (nam != NULL) { saddr = nam; rnh = nep->ne_rtable[saddr->sa_family]; if (rnh != NULL) { np = (struct netcred *) (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. */ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) np = &nep->ne_defexported; } return (np); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *nvp; struct vm_object *obj; int anyio, tries; tries = 5; loop: anyio = 0; for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { nvp = LIST_NEXT(vp, v_mntvnodes); if (vp->v_mount != mp) { goto loop; } if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ continue; if (flags != MNT_WAIT) { if (VOP_GETVOBJECT(vp, &obj) != 0 || (obj->flags & OBJ_MIGHTBEDIRTY) == 0) continue; if (VOP_ISLOCKED(vp, NULL)) continue; } mtx_lock(&vp->v_interlock); if (VOP_GETVOBJECT(vp, &obj) == 0 && (obj->flags & OBJ_MIGHTBEDIRTY)) { if (!vget(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { if (VOP_GETVOBJECT(vp, &obj) == 0) { vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); anyio = 1; } vput(vp); } } else { mtx_unlock(&vp->v_interlock); } } if (anyio && (--tries > 0)) goto loop; } /* * Create the VM object needed for VMIO and mmap support. This * is done for all VREG files in the system. Some filesystems might * afford the additional metadata buffering capability of the * VMIO code by making the device node be VMIO mode also. * * vp must be locked when vfs_object_create is called. */ int vfs_object_create(vp, p, cred) struct vnode *vp; struct proc *p; struct ucred *cred; { return (VOP_CREATEVOBJECT(vp, cred, p)); } /* * Mark a vnode as free, putting it up for recycling. */ void vfree(vp) struct vnode *vp; { int s; s = splbio(); mtx_lock(&vnode_free_list_mtx); KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; mtx_unlock(&vnode_free_list_mtx); vp->v_flag &= ~VAGE; vp->v_flag |= VFREE; splx(s); } /* * Opposite of vfree() - mark a vnode as in use. */ void vbusy(vp) struct vnode *vp; { int s; s = splbio(); mtx_lock(&vnode_free_list_mtx); KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; mtx_unlock(&vnode_free_list_mtx); vp->v_flag &= ~(VFREE|VAGE); splx(s); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(vp, p, events) struct vnode *vp; struct proc *p; short events; { mtx_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo.vpi_revents; vp->v_pollinfo.vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo.vpi_lock); return events; } vp->v_pollinfo.vpi_events |= events; selrecord(p, &vp->v_pollinfo.vpi_selinfo); mtx_unlock(&vp->v_pollinfo.vpi_lock); return 0; } /* * Note the occurrence of an event. If the VN_POLLEVENT macro is used, * it is possible for us to miss an event due to race conditions, but * that condition is expected to be rare, so for the moment it is the * preferred interface. */ void vn_pollevent(vp, events) struct vnode *vp; short events; { mtx_lock(&vp->v_pollinfo.vpi_lock); if (vp->v_pollinfo.vpi_events & events) { /* * We clear vpi_events so that we don't * call selwakeup() twice if two events are * posted before the polling process(es) is * awakened. This also ensures that we take at * most one selwakeup() if the polling process * is no longer interested. However, it does * mean that only one event can be noticed at * a time. (Perhaps we should only clear those * event bits which we note?) XXX */ vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */ vp->v_pollinfo.vpi_revents |= events; selwakeup(&vp->v_pollinfo.vpi_selinfo); } mtx_unlock(&vp->v_pollinfo.vpi_lock); } +#define VN_KNOTE(vp, b) \ + KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b)) + /* * Wake up anyone polling on vp because it is being revoked. * This depends on dead_poll() returning POLLHUP for correct * behavior. */ void vn_pollgone(vp) struct vnode *vp; { mtx_lock(&vp->v_pollinfo.vpi_lock); + VN_KNOTE(vp, NOTE_REVOKE); if (vp->v_pollinfo.vpi_events) { vp->v_pollinfo.vpi_events = 0; selwakeup(&vp->v_pollinfo.vpi_selinfo); } mtx_unlock(&vp->v_pollinfo.vpi_lock); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*) __P((struct vop_close_args *)))nullop) static int sync_fsync __P((struct vop_fsync_args *)); static int sync_inactive __P((struct vop_inactive_args *)); static int sync_reclaim __P((struct vop_reclaim_args *)); #define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) #define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) static int sync_print __P((struct vop_print_args *)); #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) static vop_t **sync_vnodeop_p; static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_eopnotsupp }, { &vop_close_desc, (vop_t *) sync_close }, /* close */ { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ { &vop_print_desc, (vop_t *) sync_print }, /* print */ { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ { NULL, NULL } }; static struct vnodeopv_desc sync_vnodeop_opv_desc = { &sync_vnodeop_p, sync_vnodeop_entries }; VNODEOP_SET(sync_vnodeop_opv_desc); /* * Create a new filesystem syncer vnode for the specified mount point. */ int vfs_allocate_syncvnode(mp) struct mount *mp; { struct vnode *vp; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; struct ucred *a_cred; int a_waitfor; struct proc *a_p; } */ *ap; { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; struct proc *p = ap->a_p; int asyncflag; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ vn_syncer_add_to_worklist(syncvp, syncdelay); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp, p); return (0); } asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; vn_finished_write(mp); vfs_unbusy(mp, p); return (0); } /* * The syncer vnode is no referenced. */ static int sync_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct proc *a_p; } */ *ap; { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected at splbio(). */ static int sync_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; int s; s = splbio(); vp->v_mount->mnt_syncer = NULL; if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); vp->v_flag &= ~VONWORKLST; } splx(s); return (0); } /* * Print out a syncer vnode. */ static int sync_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; printf("syncer vnode"); if (vp->v_vnlock != NULL) lockmgr_printinfo(vp->v_vnlock); printf("\n"); return (0); } /* * extract the dev_t from a VCHR */ dev_t vn_todev(vp) struct vnode *vp; { if (vp->v_type != VCHR) return (NODEV); return (vp->v_rdev); } /* * Check if vnode represents a disk device */ int vn_isdisk(vp, errp) struct vnode *vp; int *errp; { struct cdevsw *cdevsw; if (vp->v_type != VCHR) { if (errp != NULL) *errp = ENOTBLK; return (0); } if (vp->v_rdev == NULL) { if (errp != NULL) *errp = ENXIO; return (0); } cdevsw = devsw(vp->v_rdev); if (cdevsw == NULL) { if (errp != NULL) *errp = ENXIO; return (0); } if (!(cdevsw->d_flags & D_DISK)) { if (errp != NULL) *errp = ENOTBLK; return (0); } if (errp != NULL) *errp = 0; return (1); } /* * Free data allocated by namei(); see namei(9) for details. */ void NDFREE(ndp, flags) struct nameidata *ndp; const uint flags; { if (!(flags & NDF_NO_FREE_PNBUF) && (ndp->ni_cnd.cn_flags & HASBUF)) { zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); ndp->ni_cnd.cn_flags &= ~HASBUF; } if (!(flags & NDF_NO_DVP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKPARENT) && ndp->ni_dvp != ndp->ni_vp) VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc); if (!(flags & NDF_NO_DVP_RELE) && (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { vrele(ndp->ni_dvp); ndp->ni_dvp = NULL; } if (!(flags & NDF_NO_VP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc); if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) { vrele(ndp->ni_vp); ndp->ni_vp = NULL; } if (!(flags & NDF_NO_STARTDIR_RELE) && (ndp->ni_cnd.cn_flags & SAVESTART)) { vrele(ndp->ni_startdir); ndp->ni_startdir = NULL; } } /* * Common file system object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request. Returns 0 on success, or an errno on failure. */ int vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) enum vtype type; mode_t file_mode; uid_t file_uid; gid_t file_gid; mode_t acc_mode; struct ucred *cred; int *privused; { mode_t dac_granted; #ifdef CAPABILITIES mode_t cap_granted; #endif /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); privcheck: if (!suser_xxx(cred, NULL, PRISON_ROOT)) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } #ifdef CAPABILITIES /* * Build a capability mask to determine if the set of capabilities * satisfies the requirements when combined with the granted mask * from above. * For each capability, if the capability is required, bitwise * or the request type onto the cap_granted mask. */ cap_granted = 0; if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } #endif return ((acc_mode & VADMIN) ? EPERM : EACCES); } Index: head/sys/sys/event.h =================================================================== --- head/sys/sys/event.h (revision 72955) +++ head/sys/sys/event.h (revision 72956) @@ -1,166 +1,167 @@ /*- * Copyright (c) 1999,2000 Jonathan Lemon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_EVENT_H_ #define _SYS_EVENT_H_ #define EVFILT_READ (-1) #define EVFILT_WRITE (-2) #define EVFILT_AIO (-3) /* attached to aio requests */ #define EVFILT_VNODE (-4) /* attached to vnodes */ #define EVFILT_PROC (-5) /* attached to struct proc */ #define EVFILT_SIGNAL (-6) /* attached to struct proc */ #define EVFILT_SYSCOUNT 6 struct kevent { uintptr_t ident; /* identifier for this event */ short filter; /* filter for event */ u_short flags; u_int fflags; intptr_t data; void *udata; /* opaque user data identifier */ }; /* actions */ #define EV_ADD 0x0001 /* add event to kq (implies enable) */ #define EV_DELETE 0x0002 /* delete event from kq */ #define EV_ENABLE 0x0004 /* enable event */ #define EV_DISABLE 0x0008 /* disable event (not reported) */ /* flags */ #define EV_ONESHOT 0x0010 /* only report one occurrence */ #define EV_CLEAR 0x0020 /* clear event state after reporting */ #define EV_SYSFLAGS 0xF000 /* reserved by system */ #define EV_FLAG1 0x2000 /* filter-specific flag */ /* returned values */ #define EV_EOF 0x8000 /* EOF detected */ #define EV_ERROR 0x4000 /* error, data contains errno */ /* * data/hint flags for EVFILT_VNODE, shared with userspace */ #define NOTE_DELETE 0x0001 /* vnode was removed */ #define NOTE_WRITE 0x0002 /* data contents changed */ #define NOTE_EXTEND 0x0004 /* size increased */ #define NOTE_ATTRIB 0x0008 /* attributes changed */ #define NOTE_LINK 0x0010 /* link count changed */ #define NOTE_RENAME 0x0020 /* vnode was renamed */ +#define NOTE_REVOKE 0x0040 /* vnode access was revoked */ /* * data/hint flags for EVFILT_PROC, shared with userspace */ #define NOTE_EXIT 0x80000000 /* process exited */ #define NOTE_FORK 0x40000000 /* process forked */ #define NOTE_EXEC 0x20000000 /* process exec'd */ #define NOTE_PCTRLMASK 0xf0000000 /* mask for hint bits */ #define NOTE_PDATAMASK 0x000fffff /* mask for pid */ /* additional flags for EVFILT_PROC */ #define NOTE_TRACK 0x00000001 /* follow across forks */ #define NOTE_TRACKERR 0x00000002 /* could not track child */ #define NOTE_CHILD 0x00000004 /* am a child process */ /* * This is currently visible to userland to work around broken * programs which pull in or . */ #include struct knote; SLIST_HEAD(klist, knote); #ifdef _KERNEL #define KNOTE(list, hint) if ((list) != NULL) knote(list, hint) /* * Flag indicating hint is a signal. Used by EVFILT_SIGNAL, and also * shared by EVFILT_PROC (all knotes attached to p->p_klist) */ #define NOTE_SIGNAL 0x08000000 struct filterops { int f_isfd; /* true if ident == filedescriptor */ int (*f_attach) __P((struct knote *kn)); void (*f_detach) __P((struct knote *kn)); int (*f_event) __P((struct knote *kn, long hint)); }; struct knote { SLIST_ENTRY(knote) kn_link; /* for fd */ SLIST_ENTRY(knote) kn_selnext; /* for struct selinfo */ TAILQ_ENTRY(knote) kn_tqe; struct kqueue *kn_kq; /* which queue we are on */ struct kevent kn_kevent; int kn_status; int kn_sfflags; /* saved filter flags */ intptr_t kn_sdata; /* saved data field */ union { struct file *p_fp; /* file data pointer */ struct proc *p_proc; /* proc pointer */ } kn_ptr; struct filterops *kn_fop; caddr_t kn_hook; #define KN_ACTIVE 0x01 /* event has been triggered */ #define KN_QUEUED 0x02 /* event is on queue */ #define KN_DISABLED 0x04 /* event is disabled */ #define KN_DETACHED 0x08 /* knote is detached */ #define kn_id kn_kevent.ident #define kn_filter kn_kevent.filter #define kn_flags kn_kevent.flags #define kn_fflags kn_kevent.fflags #define kn_data kn_kevent.data #define kn_fp kn_ptr.p_fp }; struct proc; extern void knote(struct klist *list, long hint); extern void knote_remove(struct proc *p, struct klist *list); extern void knote_fdclose(struct proc *p, int fd); extern int kqueue_register(struct kqueue *kq, struct kevent *kev, struct proc *p); #else /* !_KERNEL */ #include struct timespec; __BEGIN_DECLS int kqueue __P((void)); int kevent __P((int kq, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout)); __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_EVENT_H_ */ Index: head/sys/ufs/ufs/ufs_vnops.c =================================================================== --- head/sys/ufs/ufs/ufs_vnops.c (revision 72955) +++ head/sys/ufs/ufs/ufs_vnops.c (revision 72956) @@ -1,2366 +1,2379 @@ /* * Copyright (c) 1982, 1986, 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 * $FreeBSD$ */ #include "opt_quota.h" #include "opt_suiddir.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include static int ufs_access __P((struct vop_access_args *)); static int ufs_advlock __P((struct vop_advlock_args *)); static int ufs_chmod __P((struct vnode *, int, struct ucred *, struct proc *)); static int ufs_chown __P((struct vnode *, uid_t, gid_t, struct ucred *, struct proc *)); static int ufs_close __P((struct vop_close_args *)); static int ufs_create __P((struct vop_create_args *)); static int ufs_getattr __P((struct vop_getattr_args *)); static int ufs_link __P((struct vop_link_args *)); static int ufs_makeinode __P((int mode, struct vnode *, struct vnode **, struct componentname *)); static int ufs_missingop __P((struct vop_generic_args *ap)); static int ufs_mkdir __P((struct vop_mkdir_args *)); static int ufs_mknod __P((struct vop_mknod_args *)); static int ufs_open __P((struct vop_open_args *)); static int ufs_pathconf __P((struct vop_pathconf_args *)); static int ufs_print __P((struct vop_print_args *)); static int ufs_readdir __P((struct vop_readdir_args *)); static int ufs_readlink __P((struct vop_readlink_args *)); static int ufs_remove __P((struct vop_remove_args *)); static int ufs_rename __P((struct vop_rename_args *)); static int ufs_rmdir __P((struct vop_rmdir_args *)); static int ufs_setattr __P((struct vop_setattr_args *)); static int ufs_strategy __P((struct vop_strategy_args *)); static int ufs_symlink __P((struct vop_symlink_args *)); static int ufs_whiteout __P((struct vop_whiteout_args *)); static int ufsfifo_close __P((struct vop_close_args *)); static int ufsfifo_read __P((struct vop_read_args *)); static int ufsfifo_write __P((struct vop_write_args *)); static int ufsspec_close __P((struct vop_close_args *)); static int ufsspec_read __P((struct vop_read_args *)); static int ufsspec_write __P((struct vop_write_args *)); static int filt_ufsread __P((struct knote *kn, long hint)); static int filt_ufsvnode __P((struct knote *kn, long hint)); static void filt_ufsdetach __P((struct knote *kn)); static int ufs_kqfilter __P((struct vop_kqfilter_args *ap)); union _qcvt { int64_t qcvt; int32_t val[2]; }; #define SETHIGH(q, h) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_HIGHWORD] = (h); \ (q) = tmp.qcvt; \ } #define SETLOW(q, l) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_LOWWORD] = (l); \ (q) = tmp.qcvt; \ } #define VN_KNOTE(vp, b) \ KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, (b)) /* * A virgin directory (no blushing please). */ static struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, ".", 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." }; static struct odirtemplate omastertemplate = { 0, 12, 1, ".", 0, DIRBLKSIZ - 12, 2, ".." }; void ufs_itimes(vp) struct vnode *vp; { struct inode *ip; struct timespec ts; ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { vfs_timestamp(&ts); if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp)) ip->i_flag |= IN_LAZYMOD; else ip->i_flag |= IN_MODIFIED; if (ip->i_flag & IN_ACCESS) { ip->i_atime = ts.tv_sec; ip->i_atimensec = ts.tv_nsec; } if (ip->i_flag & IN_UPDATE) { ip->i_mtime = ts.tv_sec; ip->i_mtimensec = ts.tv_nsec; ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) { ip->i_ctime = ts.tv_sec; ip->i_ctimensec = ts.tv_nsec; } } ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } /* * Create a regular file */ int ufs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; error = ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); if (error) return (error); VN_KNOTE(ap->a_dvp, NOTE_WRITE); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ int ufs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); VN_KNOTE(ap->a_dvp, NOTE_WRITE); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ ip->i_rdev = vap->va_rdev; } /* * Remove inode, then reload it through VFS_VGET so it is * checked to see if it is an alias of an existing entry in * the inode cache. */ vput(*vpp); (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, vpp); if (error) { *vpp = NULL; return (error); } return (0); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ int ufs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ int ufs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; mtx_lock(&vp->v_interlock); if (vp->v_usecount > 1) ufs_itimes(vp); mtx_unlock(&vp->v_interlock); return (0); } int ufs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); mode_t mode = ap->a_mode; #ifdef QUOTA int error; #endif /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA if ((error = getinoquota(ip)) != 0) return (error); #endif break; default: break; } } /* If immutable bit set, nobody gets to write it. */ if ((mode & VWRITE) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT))) return (EPERM); return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_mode, ap->a_cred, NULL)); } /* ARGSUSED */ int ufs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); register struct vattr *vap = ap->a_vap; ufs_itimes(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ip->i_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = VFSTOUFS(vp->v_mount)->um_i_effnlink_valid ? ip->i_effnlink : ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = ip->i_rdev; vap->va_size = ip->i_din.di_size; vap->va_atime.tv_sec = ip->i_atime; vap->va_atime.tv_nsec = ip->i_atimensec; vap->va_mtime.tv_sec = ip->i_mtime; vap->va_mtime.tv_nsec = ip->i_mtimensec; vap->va_ctime.tv_sec = ip->i_ctime; vap->va_ctime.tv_nsec = ip->i_ctimensec; vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_bytes = dbtob((u_quad_t)ip->i_blocks); vap->va_type = IFTOVT(ip->i_mode); vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ int ufs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Unprivileged processes and privileged processes in * jail() are not permitted to set system flags. * Privileged processes not in jail() may only set system * flags if the securelevel <= 0. */ if (!suser_xxx(cred, NULL, 0)) { if ((ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) && securelevel > 0) return (EPERM); /* Snapshot flag cannot be set or cleared */ if (((vap->va_flags & SF_SNAPSHOT) != 0 && (ip->i_flags & SF_SNAPSHOT) == 0) || ((vap->va_flags & SF_SNAPSHOT) == 0 && (ip->i_flags & SF_SNAPSHOT) != 0)) return (EPERM); ip->i_flags = vap->va_flags; } else { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || (vap->va_flags & UF_SETTABLE) != vap->va_flags) return (EPERM); ip->i_flags &= SF_SETTABLE; ip->i_flags |= (vap->va_flags & UF_SETTABLE); } ip->i_flag |= IN_CHANGE; if (vap->va_flags & (IMMUTABLE | APPEND)) return (0); } if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, p)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); break; default: break; } if ((error = UFS_TRUNCATE(vp, vap->va_size, 0, cred, p)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); /* * From utimes(2): * If times is NULL, ... The caller must be the owner of * the file, have permission to write the file, or be the * super-user. * If times is non-NULL, ... The caller must be the owner of * the file or be the super-user. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred, p)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) ip->i_flag |= IN_ACCESS; if (vap->va_mtime.tv_sec != VNOVAL) ip->i_flag |= IN_CHANGE | IN_UPDATE; ufs_itimes(vp); if (vap->va_atime.tv_sec != VNOVAL) { ip->i_atime = vap->va_atime.tv_sec; ip->i_atimensec = vap->va_atime.tv_nsec; } if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_mtime = vap->va_mtime.tv_sec; ip->i_mtimensec = vap->va_mtime.tv_nsec; } error = UFS_UPDATE(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH))) return (EPERM); error = ufs_chmod(vp, (int)vap->va_mode, cred, p); } VN_KNOTE(vp, NOTE_ATTRIB); return (error); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(vp, mode, cred, p) register struct vnode *vp; register int mode; register struct ucred *cred; struct proc *p; { register struct inode *ip = VTOI(vp); int error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (suser_xxx(cred, NULL, PRISON_ROOT)) { if (vp->v_type != VDIR && (mode & S_ISTXT)) return (EFTYPE); if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) return (EPERM); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ufs_chown(vp, uid, gid, cred, p) register struct vnode *vp; uid_t uid; gid_t gid; struct ucred *cred; struct proc *p; { register struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; #ifdef QUOTA register int i; long change; #endif if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * To change the owner of a file, or change the group of a file * to a group of which we are not a member, the caller must * have privilege. */ if ((uid != ip->i_uid || (gid != ip->i_gid && !groupmember(gid, cred))) && (error = suser_xxx(cred, p, PRISON_ROOT))) return (error); ogid = ip->i_gid; ouid = ip->i_uid; #ifdef QUOTA if ((error = getinoquota(ip)) != 0) return (error); if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } change = ip->i_blocks; (void) chkdq(ip, -change, cred, CHOWN); (void) chkiq(ip, -1, cred, CHOWN); for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } #endif ip->i_gid = gid; ip->i_uid = uid; #ifdef QUOTA if ((error = getinoquota(ip)) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) goto good; else (void) chkdq(ip, -change, cred, CHOWN|FORCE); } for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } ip->i_gid = ogid; ip->i_uid = ouid; if (getinoquota(ip) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } (void) chkdq(ip, change, cred, FORCE|CHOWN); (void) chkiq(ip, 1, cred, FORCE|CHOWN); (void) getinoquota(ip); } return (error); good: if (getinoquota(ip)) panic("ufs_chown: lost quota"); #endif /* QUOTA */ ip->i_flag |= IN_CHANGE; if (suser_xxx(cred, NULL, PRISON_ROOT) && (ouid != uid || ogid != gid)) ip->i_mode &= ~(ISUID | ISGID); return (0); } int ufs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); VN_KNOTE(vp, NOTE_DELETE); VN_KNOTE(dvp, NOTE_WRITE); out: return (error); } /* * link vnode call */ int ufs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct proc *p = cnp->cn_proc; struct inode *ip; struct direct newdir; int error; #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_link: no name"); #endif if (tdvp->v_mount != vp->v_mount) { error = EXDEV; goto out2; } if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) { goto out2; } ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { error = EMLINK; goto out1; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out1; } ip->i_effnlink++; ip->i_nlink++; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp))); if (!error) { ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL); } if (error) { ip->i_effnlink--; ip->i_nlink--; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); } out1: if (tdvp != vp) VOP_UNLOCK(vp, 0, p); out2: VN_KNOTE(vp, NOTE_LINK); VN_KNOTE(tdvp, NOTE_WRITE); return (error); } /* * whiteout vnode call */ int ufs_whiteout(ap) struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct direct newdir; int error = 0; switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ if (dvp->v_mount->mnt_maxsymlinklen > 0) return (0); return (EOPNOTSUPP); case CREATE: /* create a new directory whiteout */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & SAVENAME) == 0) panic("ufs_whiteout: missing name"); if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif newdir.d_ino = WINO; newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL); break; case DELETE: /* remove an existing directory whiteout */ #ifdef DIAGNOSTIC if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif cnp->cn_flags &= ~DOWHITEOUT; error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0); break; default: panic("ufs_whiteout: unknown op"); } return (error); } /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ int ufs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tvp = ap->a_tvp; register struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct proc *p = fcnp->cn_proc; struct inode *ip, *xp, *dp; struct direct newdir; int doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0, ioflag; #ifdef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto abortit; } /* * Check if just deleting a link name or if we've lost a race. * If another process completes the same rename after we've looked * up the source and have blocked looking up the target, then the * source and target inodes may be identical now although the * names were never linked. */ if (fvp == tvp) { if (fvp->v_type == VDIR) { /* * Linked directories are impossible, so we must * have lost the race. Pretend that the rename * completed before the lookup. */ #ifdef UFS_RENAME_DEBUG printf("ufs_rename: fvp == tvp for directories\n"); #endif error = ENOENT; goto abortit; } /* Release destination completely. */ vput(tdvp); vput(tvp); /* * Delete source. There is another race now that everything * is unlocked, but this doesn't cause any new complications. * Relookup() may find a file that is unrelated to the * original one, or it may fail. Too bad. */ vrele(fdvp); vrele(fvp); fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost from startdir"); fcnp->cn_nameiop = DELETE; VREF(fdvp); error = relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp == NULL) { #ifdef UFS_RENAME_DEBUG printf("ufs_rename: from name disappeared\n"); #endif return (ENOENT); } error = VOP_REMOVE(fdvp, fvp, fcnp); if (fdvp == fvp) vrele(fdvp); else vput(fdvp); vput(fvp); return (error); } if ((error = vn_lock(fvp, LK_EXCLUSIVE, p)) != 0) goto abortit; dp = VTOI(fdvp); ip = VTOI(fvp); if (ip->i_nlink >= LINK_MAX) { VOP_UNLOCK(fvp, 0, p); error = EMLINK; goto abortit; } if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { VOP_UNLOCK(fvp, 0, p); error = EPERM; goto abortit; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || (ip->i_flag & IN_RENAME)) { VOP_UNLOCK(fvp, 0, p); error = EINVAL; goto abortit; } ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory = 1; } VN_KNOTE(fdvp, NOTE_WRITE); /* XXX right place? */ vrele(fdvp); /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ ip->i_effnlink++; ip->i_nlink++; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(fvp)) softdep_change_linkcnt(ip); if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp)))) != 0) { VOP_UNLOCK(fvp, 0, p); goto bad; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory heirarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to checkpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); VOP_UNLOCK(fvp, 0, p); if (oldparent != dp->i_number) newparent = dp->i_number; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); error = ufs_checkpath(ip, dp, tcnp->cn_cred); if (error) goto out; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); VREF(tdvp); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; vrele(tdvp); dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); } /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (xp == NULL) { if (dp->i_dev != ip->i_dev) panic("ufs_rename: EXDEV"); /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. */ if (doingdirectory && newparent) { if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } dp->i_effnlink++; dp->i_nlink++; dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(dp); error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | DOINGASYNC(tdvp))); if (error) goto bad; } ufs_makedirentry(ip, tcnp, &newdir); error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL); if (error) { if (doingdirectory && newparent) { dp->i_effnlink--; dp->i_nlink--; dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(dp); (void)UFS_UPDATE(tdvp, 1); } goto bad; } VN_KNOTE(tdvp, NOTE_WRITE); vput(tdvp); } else { if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (xp->i_number == ip->i_number) panic("ufs_rename: same file"); /* * If the parent directory is "sticky", then the caller * must possess VADMIN for the parent directory, or the * destination of the rename. This implements append-only * directories. */ if ((dp->i_mode & S_ISTXT) && VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, p) && VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, p)) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode&IFMT) == IFDIR) { if ((xp->i_effnlink > 2) || !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = ufs_dirrewrite(dp, xp, ip->i_number, IFTODT(ip->i_mode), (doingdirectory && newparent) ? newparent : doingdirectory); if (error) goto bad; if (doingdirectory) { if (!newparent) { dp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(dp); } xp->i_effnlink--; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(xp); } if (doingdirectory && !DOINGSOFTDEP(tvp)) { /* * Truncate inode. The only stuff left in the directory * is "." and "..". The "." reference is inconsequential * since we are quashing it. We have removed the "." * reference and the reference in the parent directory, * but there may be other hard links. The soft * dependency code will arrange to do these operations * after the parent directory entry has been deleted on * disk, so when running with that code we avoid doing * them now. */ if (!newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; } xp->i_nlink--; xp->i_flag |= IN_CHANGE; ioflag = DOINGASYNC(tvp) ? 0 : IO_SYNC; if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag, tcnp->cn_cred, tcnp->cn_proc)) != 0) goto bad; } VN_KNOTE(tdvp, NOTE_WRITE); vput(tdvp); VN_KNOTE(tvp, NOTE_DELETE); vput(tvp); xp = NULL; } /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost from startdir"); VREF(fdvp); error = relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); } else { /* * From name has disappeared. */ if (doingdirectory) panic("ufs_rename: lost dir entry"); vrele(ap->a_fvp); return (0); } /* * Ensure that the directory entry still exists and has not * changed while the new name has been entered. If the source is * a file then the entry may have been unlinked or renamed. In * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; the IN_RENAME * flag ensures that it cannot be moved by another rename or removed * by a rmdir. */ if (xp != ip) { if (doingdirectory) panic("ufs_rename: lost dir entry"); } else { /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { xp->i_offset = mastertemplate.dot_reclen; ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0); cache_purge(fdvp); } error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0); xp->i_flag &= ~IN_RENAME; } VN_KNOTE(fvp, NOTE_RENAME); if (dp) vput(fdvp); if (xp) vput(fvp); vrele(ap->a_fvp); return (error); bad: if (xp) vput(ITOV(xp)); vput(ITOV(dp)); out: if (doingdirectory) ip->i_flag &= ~IN_RENAME; if (vn_lock(fvp, LK_EXCLUSIVE, p) == 0) { ip->i_effnlink--; ip->i_nlink--; ip->i_flag |= IN_CHANGE; ip->i_flag &= ~IN_RENAME; if (DOINGSOFTDEP(fvp)) softdep_change_linkcnt(ip); vput(fvp); } else vrele(fvp); return (error); } /* * Mkdir system call */ int ufs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct inode *ip, *dp; struct vnode *tvp; struct buf *bp; struct dirtemplate dirtemplate, *dtp; struct direct newdir; int error, dmode; long blkoff; #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_mkdir: no name"); #endif dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); ip->i_gid = dp->i_gid; #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; ucp = cnp->cn_cred; #endif /* * If we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; #ifdef QUOTA if (dp->i_uid != cnp->cn_cred->cr_uid) { /* * Make sure the correct user gets charged * for the space. * Make a dummy credential for the victim. * XXX This seems to never be accessed out of * our context so a stack variable is ok. */ ucred.cr_ref = 1; ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups[0] = dp->i_gid; ucp = &ucred; } #endif } else ip->i_uid = cnp->cn_cred->cr_uid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 2; ip->i_nlink = 2; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(ip); if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; /* * Bump link count in parent directory to reflect work done below. * Should be done before reference is created so cleanup is * possible if we crash. */ dp->i_effnlink++; dp->i_nlink++; dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) softdep_change_linkcnt(dp); error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); if (error) goto bad; /* * Initialize directory with "." and ".." from static template. */ if (dvp->v_mount->mnt_maxsymlinklen > 0 ) dtp = &mastertemplate; else dtp = (struct dirtemplate *)&omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; if ((error = VOP_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred, B_CLRBUF, &bp)) != 0) goto bad; ip->i_size = DIRBLKSIZ; ip->i_flag |= IN_CHANGE | IN_UPDATE; vnode_pager_setsize(tvp, (u_long)ip->i_size); bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate); if (DOINGSOFTDEP(tvp)) { /* * Ensure that the entire newly allocated block is a * valid directory so that future growth within the * block does not have to ensure that the block is * written before the inode. */ blkoff = DIRBLKSIZ; while (blkoff < bp->b_bcount) { ((struct direct *) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } } if ((error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | DOINGASYNC(tvp)))) != 0) { (void)BUF_WRITE(bp); goto bad; } /* * Directory set up, now install its entry in the parent directory. * * If we are not doing soft dependencies, then we must write out the * buffer containing the new directory body before entering the new * name in the parent. If we are doing soft dependencies, then the * buffer containing the new directory body will be passed to and * released in the soft dependency code after the code has attached * an appropriate ordering dependency to the buffer which ensures that * the buffer is written before the new name is written in the parent. */ if (DOINGASYNC(dvp)) bdwrite(bp); else if (!DOINGSOFTDEP(dvp) && ((error = BUF_WRITE(bp)))) goto bad; ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, bp); bad: if (error == 0) { VN_KNOTE(dvp, NOTE_WRITE); *ap->a_vpp = tvp; } else { dp->i_effnlink--; dp->i_nlink--; dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) softdep_change_linkcnt(dp); /* * No need to do an explicit VOP_TRUNCATE here, vrele will * do this for us because we set the link count to 0. */ ip->i_effnlink = 0; ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(ip); vput(tvp); } out: return (error); } /* * Rmdir system call. */ int ufs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error, ioflag; ip = VTOI(vp); dp = VTOI(dvp); /* * Do not remove a directory that is in the process of being renamed. * Verify the directory is empty (and valid). Rmdir ".." will not be * valid since ".." will contain a reference to the current directory * and thus be non-empty. Do not allow the removal of mounted on * directories (this can happen when an NFS exported filesystem * tries to remove a locally mounted on directory). */ error = 0; if (ip->i_flag & IN_RENAME) { error = EINVAL; goto out; } if (ip->i_effnlink != 2 || !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } if (vp->v_mountedhere != 0) { error = EINVAL; goto out; } /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ dp->i_effnlink--; ip->i_effnlink--; if (DOINGSOFTDEP(vp)) { softdep_change_linkcnt(dp); softdep_change_linkcnt(ip); } error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); if (error) { dp->i_effnlink++; ip->i_effnlink++; if (DOINGSOFTDEP(vp)) { softdep_change_linkcnt(dp); softdep_change_linkcnt(ip); } goto out; } VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); cache_purge(dvp); /* * Truncate inode. The only stuff left in the directory is "." and * "..". The "." reference is inconsequential since we are quashing * it. The soft dependency code will arrange to do these operations * after the parent directory entry has been deleted on disk, so * when running with that code we avoid doing them now. */ if (!DOINGSOFTDEP(vp)) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; ip->i_nlink--; ip->i_flag |= IN_CHANGE; ioflag = DOINGASYNC(vp) ? 0 : IO_SYNC; error = UFS_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred, cnp->cn_proc); } cache_purge(vp); out: VN_KNOTE(vp, NOTE_DELETE); return (error); } /* * symlink -- make a symbolic link */ int ufs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { register struct vnode *vp, **vpp = ap->a_vpp; register struct inode *ip; int len, error; error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); VN_KNOTE(ap->a_dvp, NOTE_WRITE); vp = *vpp; len = strlen(ap->a_target); if (len < vp->v_mount->mnt_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, (char *)ip->i_shortlink, len); ip->i_size = len; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, (int *)0, (struct proc *)0); if (error) vput(vp); return (error); } /* * Vnode op for reading directories. * * The routine below assumes that the on-disk format of a directory * is the same as that defined by . If the on-disk * format changes, then it will be necessary to do a conversion * from the on-disk format that read returns to the format defined * by . */ int ufs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *ncookies; u_long **a_cookies; } */ *ap; { register struct uio *uio = ap->a_uio; int error; size_t count, lost; off_t off; if (ap->a_ncookies != NULL) /* * Ensure that the block is aligned. The caller can use * the cookies to determine where in the block to start. */ uio->uio_offset &= ~(DIRBLKSIZ - 1); off = uio->uio_offset; count = uio->uio_resid; /* Make sure we don't return partial entries. */ if (count <= ((uio->uio_offset + count) & (DIRBLKSIZ -1))) return (EINVAL); count -= (uio->uio_offset + count) & (DIRBLKSIZ -1); lost = uio->uio_resid - count; uio->uio_resid = count; uio->uio_iov->iov_len = count; # if (BYTE_ORDER == LITTLE_ENDIAN) if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) { error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); } else { struct dirent *dp, *edp; struct uio auio; struct iovec aiov; caddr_t dirbuf; int readcnt; u_char tmp; auio = *uio; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; aiov.iov_len = count; MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK); aiov.iov_base = dirbuf; error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); if (error == 0) { readcnt = count - auio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { tmp = dp->d_namlen; dp->d_namlen = dp->d_type; dp->d_type = tmp; if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, uio); } FREE(dirbuf, M_TEMP); } # else error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); # endif if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dpEnd; struct dirent* dp; int ncookies; u_long *cookies; u_long *cookiep; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("ufs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) (uio->uio_iov->iov_base - (uio->uio_offset - off)); dpEnd = (struct dirent *) uio->uio_iov->iov_base; for (dp = dpStart, ncookies = 0; dp < dpEnd; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) ncookies++; MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, M_WAITOK); for (dp = dpStart, cookiep = cookies; dp < dpEnd; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) { off += dp->d_reclen; *cookiep++ = (u_long) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } uio->uio_resid += lost; if (ap->a_eofflag) *ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset; return (error); } /* * Return target name of a symbolic link */ int ufs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); int isize; isize = ip->i_size; if ((isize < vp->v_mount->mnt_maxsymlinklen) || (ip->i_din.di_blocks == 0)) { /* XXX - for old fastlink support */ uiomove((char *)ip->i_shortlink, isize, ap->a_uio); return (0); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the VOP_BMAP operation may not * deadlock on memory. See ufs_bmap() for details. */ int ufs_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = ap->a_vp; register struct inode *ip; int error; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ufs_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } vp = ip->i_devvp; bp->b_dev = vp->v_rdev; VOP_STRATEGY(vp, bp); return (0); } /* * Print out the contents of an inode. */ int ufs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); printf("tag VT_UFS, ino %lu, on dev %s (%d, %d)", (u_long)ip->i_number, devtoname(ip->i_dev), major(ip->i_dev), minor(ip->i_dev)); if (vp->v_type == VFIFO) fifo_printinfo(vp); lockmgr_printinfo(&vp->v_lock); printf("\n"); return (0); } /* * Read wrapper for special devices. */ int ufsspec_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap); /* * The inode may have been revoked during the call, so it must not * be accessed blindly here or in the other wrapper functions. */ ip = VTOI(ap->a_vp); if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) ip->i_flag |= IN_ACCESS; return (error); } /* * Write wrapper for special devices. */ int ufsspec_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap); ip = VTOI(ap->a_vp); if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Close wrapper for special devices. * * Update the times on the inode then do device close. */ int ufsspec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; mtx_lock(&vp->v_interlock); if (vp->v_usecount > 1) ufs_itimes(vp); mtx_unlock(&vp->v_interlock); return (VOCALL(spec_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Read wrapper for fifos. */ int ufsfifo_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap); ip = VTOI(ap->a_vp); if ((ap->a_vp->v_mount->mnt_flag & MNT_NOATIME) == 0 && ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) VTOI(ap->a_vp)->i_flag |= IN_ACCESS; return (error); } /* * Write wrapper for fifos. */ int ufsfifo_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { int error, resid; struct inode *ip; struct uio *uio; uio = ap->a_uio; resid = uio->uio_resid; error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap); ip = VTOI(ap->a_vp); if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; return (error); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ int ufsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; mtx_lock(&vp->v_interlock); if (vp->v_usecount > 1) ufs_itimes(vp); mtx_unlock(&vp->v_interlock); return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Return POSIX pathconf information applicable to ufs filesystems. */ int ufs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Advisory record locking support */ int ufs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { register struct inode *ip = VTOI(ap->a_vp); return (lf_advlock(ap, &(ip->i_lockf), ip->i_size)); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ufs_vinit(mntp, specops, fifoops, vpp) struct mount *mntp; vop_t **specops; vop_t **fifoops; struct vnode **vpp; { struct inode *ip; struct vnode *vp; struct timeval tv; vp = *vpp; ip = VTOI(vp); switch(vp->v_type = IFTOVT(ip->i_mode)) { case VCHR: case VBLK: vp->v_op = specops; vp = addaliasu(vp, ip->i_rdev); ip->i_vnode = vp; break; case VFIFO: vp->v_op = fifoops; break; default: break; } if (ip->i_number == ROOTINO) vp->v_flag |= VROOT; /* * Initialize modrev times */ getmicrouptime(&tv); SETHIGH(ip->i_modrev, tv.tv_sec); SETLOW(ip->i_modrev, tv.tv_usec * 4294); *vpp = vp; return (0); } /* * Allocate a new inode. */ int ufs_makeinode(mode, dvp, vpp, cnp) int mode; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; { register struct inode *ip, *pdir; struct direct newdir; struct vnode *tvp; int error; pdir = VTOI(dvp); #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_makeinode: no name"); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error); ip = VTOI(tvp); ip->i_gid = pdir->i_gid; #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; ucp = cnp->cn_cred; #endif /* * If we are not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. * Note that this drops off the execute bits for security. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; mode &= ~07111; #ifdef QUOTA /* * Make sure the correct user gets charged * for the space. * Quickly knock up a dummy credential for the victim. * XXX This seems to never be accessed out of our * context so a stack variable is ok. */ ucred.cr_ref = 1; ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups[0] = pdir->i_gid; ucp = &ucred; #endif } else ip->i_uid = cnp->cn_cred->cr_uid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 1; ip->i_nlink = 1; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && suser_xxx(cnp->cn_cred, NULL, PRISON_ROOT)) ip->i_mode &= ~ISGID; if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; /* * Make sure inode goes to disk before directory entry. */ error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | DOINGASYNC(tvp))); if (error) goto bad; ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL); if (error) goto bad; *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_effnlink = 0; ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(ip); vput(tvp); return (error); } static int ufs_missingop(ap) struct vop_generic_args *ap; { panic("no vop function for %s in ufs child", ap->a_desc->vdesc_name); return (EOPNOTSUPP); } static struct filterops ufsread_filtops = { 1, NULL, filt_ufsdetach, filt_ufsread }; static struct filterops ufsvnode_filtops = { 1, NULL, filt_ufsdetach, filt_ufsvnode }; static int ufs_kqfilter(ap) struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap; { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &ufsread_filtops; break; case EVFILT_VNODE: kn->kn_fop = &ufsvnode_filtops; break; default: return (1); } kn->kn_hook = (caddr_t)vp; mtx_lock(&vp->v_pollinfo.vpi_lock); SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext); mtx_unlock(&vp->v_pollinfo.vpi_lock); return (0); } static void filt_ufsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; mtx_lock(&vp->v_pollinfo.vpi_lock); SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note, kn, knote, kn_selnext); mtx_unlock(&vp->v_pollinfo.vpi_lock); } /*ARGSUSED*/ static int filt_ufsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct inode *ip = VTOI(vp); + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. + */ + if (hint == NOTE_REVOKE) { + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + return (1); + } + kn->kn_data = ip->i_size - kn->kn_fp->f_offset; return (kn->kn_data != 0); } static int filt_ufsvnode(struct knote *kn, long hint) { if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; + if (hint == NOTE_REVOKE) { + kn->kn_flags |= EV_EOF; + return (1); + } return (kn->kn_fflags != 0); } /* Global vfs data structures for ufs. */ static vop_t **ufs_vnodeop_p; static struct vnodeopv_entry_desc ufs_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_fsync_desc, (vop_t *) ufs_missingop }, { &vop_read_desc, (vop_t *) ufs_missingop }, { &vop_reallocblks_desc, (vop_t *) ufs_missingop }, { &vop_write_desc, (vop_t *) ufs_missingop }, { &vop_access_desc, (vop_t *) ufs_access }, { &vop_advlock_desc, (vop_t *) ufs_advlock }, { &vop_bmap_desc, (vop_t *) ufs_bmap }, { &vop_cachedlookup_desc, (vop_t *) ufs_lookup }, { &vop_close_desc, (vop_t *) ufs_close }, { &vop_create_desc, (vop_t *) ufs_create }, { &vop_getattr_desc, (vop_t *) ufs_getattr }, { &vop_inactive_desc, (vop_t *) ufs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_link_desc, (vop_t *) ufs_link }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_lookup_desc, (vop_t *) vfs_cache_lookup }, { &vop_mkdir_desc, (vop_t *) ufs_mkdir }, { &vop_mknod_desc, (vop_t *) ufs_mknod }, { &vop_open_desc, (vop_t *) ufs_open }, { &vop_pathconf_desc, (vop_t *) ufs_pathconf }, { &vop_poll_desc, (vop_t *) vop_stdpoll }, { &vop_kqfilter_desc, (vop_t *) ufs_kqfilter }, { &vop_getwritemount_desc, (vop_t *) vop_stdgetwritemount }, { &vop_print_desc, (vop_t *) ufs_print }, { &vop_readdir_desc, (vop_t *) ufs_readdir }, { &vop_readlink_desc, (vop_t *) ufs_readlink }, { &vop_reclaim_desc, (vop_t *) ufs_reclaim }, { &vop_remove_desc, (vop_t *) ufs_remove }, { &vop_rename_desc, (vop_t *) ufs_rename }, { &vop_rmdir_desc, (vop_t *) ufs_rmdir }, { &vop_setattr_desc, (vop_t *) ufs_setattr }, { &vop_strategy_desc, (vop_t *) ufs_strategy }, { &vop_symlink_desc, (vop_t *) ufs_symlink }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_whiteout_desc, (vop_t *) ufs_whiteout }, { NULL, NULL } }; static struct vnodeopv_desc ufs_vnodeop_opv_desc = { &ufs_vnodeop_p, ufs_vnodeop_entries }; static vop_t **ufs_specop_p; static struct vnodeopv_entry_desc ufs_specop_entries[] = { { &vop_default_desc, (vop_t *) spec_vnoperate }, { &vop_fsync_desc, (vop_t *) ufs_missingop }, { &vop_access_desc, (vop_t *) ufs_access }, { &vop_close_desc, (vop_t *) ufsspec_close }, { &vop_getattr_desc, (vop_t *) ufs_getattr }, { &vop_inactive_desc, (vop_t *) ufs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_print_desc, (vop_t *) ufs_print }, { &vop_read_desc, (vop_t *) ufsspec_read }, { &vop_reclaim_desc, (vop_t *) ufs_reclaim }, { &vop_setattr_desc, (vop_t *) ufs_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) ufsspec_write }, { NULL, NULL } }; static struct vnodeopv_desc ufs_specop_opv_desc = { &ufs_specop_p, ufs_specop_entries }; static vop_t **ufs_fifoop_p; static struct vnodeopv_entry_desc ufs_fifoop_entries[] = { { &vop_default_desc, (vop_t *) fifo_vnoperate }, { &vop_fsync_desc, (vop_t *) ufs_missingop }, { &vop_access_desc, (vop_t *) ufs_access }, { &vop_close_desc, (vop_t *) ufsfifo_close }, { &vop_getattr_desc, (vop_t *) ufs_getattr }, { &vop_inactive_desc, (vop_t *) ufs_inactive }, { &vop_islocked_desc, (vop_t *) vop_stdislocked }, { &vop_lock_desc, (vop_t *) vop_stdlock }, { &vop_print_desc, (vop_t *) ufs_print }, { &vop_read_desc, (vop_t *) ufsfifo_read }, { &vop_reclaim_desc, (vop_t *) ufs_reclaim }, { &vop_setattr_desc, (vop_t *) ufs_setattr }, { &vop_unlock_desc, (vop_t *) vop_stdunlock }, { &vop_write_desc, (vop_t *) ufsfifo_write }, { NULL, NULL } }; static struct vnodeopv_desc ufs_fifoop_opv_desc = { &ufs_fifoop_p, ufs_fifoop_entries }; VNODEOP_SET(ufs_vnodeop_opv_desc); VNODEOP_SET(ufs_specop_opv_desc); VNODEOP_SET(ufs_fifoop_opv_desc); int ufs_vnoperate(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { return (VOCALL(ufs_vnodeop_p, ap->a_desc->vdesc_offset, ap)); } int ufs_vnoperatefifo(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { return (VOCALL(ufs_fifoop_p, ap->a_desc->vdesc_offset, ap)); } int ufs_vnoperatespec(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { return (VOCALL(ufs_specop_p, ap->a_desc->vdesc_offset, ap)); }