diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 32093c00f542..16373b129aac 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,4197 +1,4223 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #define WI_MPSAFEQ 0 #define WI_GIANTQ 1 static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure"); static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void vbusy(struct vnode *vp); static void vinactive(struct vnode *, struct thread *); static void v_incr_usecount(struct vnode *); static void v_decr_usecount(struct vnode *); static void v_decr_useonly(struct vnode *); static void v_upgrade_usecount(struct vnode *); static void vfree(struct vnode *); static void vnlru_free(int); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static int vfs_knllocked(void *arg); static void destroy_vpollinfo(struct vpollinfo *vi); /* * Enable Giant pushdown based on whether or not the vm is mpsafe in this * build. Without mpsafevm the buffer cache can not run Giant free. */ int mpsafe_vfs = 1; TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs); SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0, "MPSAFE VFS"); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased on vdestroy() called on VI_DOOMed * vnode. */ static unsigned long numvnodes; SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Free vnode target. Free vnodes may simply be files which have been stat'd * but not read. This is somewhat common, and a small cache of such files * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes; SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. */ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* Set to 1 to print out reclaim of active vnodes */ int prtactive; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending[2]; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* * Macros to control when a vnode is freed and recycled. All require * the vnode interlock. */ #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt) /* * Initialize the vnode management data structures. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX 100000 #endif static void vntblinit(void *dummy __unused) { /* * Desiredvnodes is a function of the physical memory size and * the kernel's heap size. Specifically, desiredvnodes scales * in proportion to the physical memory size until two fifths * of the kernel's heap size is consumed by vnodes and vm * objects. */ desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Initialize the filesystem syncer. */ syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. */ int vfs_busy(struct mount *mp, int flags) { MPASS((flags & ~MBF_MASK) == 0); MNT_ILOCK(mp); MNT_REF(mp); if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & MBF_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0); MNT_REL(mp); MNT_IUNLOCK(mp); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { MNT_ILOCK(mp); MNT_REL(mp); mp->mnt_lockref--; if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } +/* + * Lookup a mount point by filesystem identifier, busying it before + * returning. + */ +struct mount * +vfs_busyfs(fsid_t *fsid) +{ + struct mount *mp; + int error; + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + error = vfs_busy(mp, MBF_MNTLSTLOCK); + if (error) { + mtx_unlock(&mountlist_mtx); + return (NULL); + } + return (mp); + } + } + mtx_unlock(&mountlist_mtx); + return ((struct mount *) 0); +} + /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; /* * If the thread is jailed, but this is not a jail-friendly file * system, deny immediately. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred)) return (EPERM); /* * If the file system was mounted outside a jail and a jailed thread * tries to access it, deny immediately. */ if (!jailed(mp->mnt_cred) && jailed(td->td_ucred)) return (EPERM); /* * If the file system was mounted inside different jail that the jail of * the calling thread, deny immediately. */ if (jailed(mp->mnt_cred) && jailed(td->td_ucred) && mp->mnt_cred->cr_prison != td->td_ucred->cr_prison) { return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static u_int16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, ""); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desireable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int vlrureclaim(struct mount *mp) { struct vnode *vp; int done; int trigger; int usevnodes; int count; /* * Calculate the trigger point, don't allow user * screwups to blow us up. This prevents us from * recycling vnodes with lots of resident pages. We * aren't trying to free memory, we are trying to * free vnodes. */ usevnodes = desiredvnodes; if (usevnodes <= 0) usevnodes = 1; trigger = cnt.v_page_count * 2 / usevnodes; done = 0; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize / 10 + 1; while (count != 0) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); vgonel(vp); VOP_UNLOCK(vp, 0); vdropl(vp); done++; next_iter_mntunlocked: if ((count % 256) != 0) goto relock_mnt; goto yield; next_iter: if ((count % 256) != 0) continue; MNT_IUNLOCK(mp); yield: uio_yield(); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } /* * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) { struct vnode *vp; int vfslocked; mtx_assert(&vnode_free_list_mtx, MA_OWNED); for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (!vp) break; VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); /* * Don't recycle if we can't get the interlock. */ if (!VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); continue; } VNASSERT(VCANRECYCLE(vp), vp, ("vp inconsistent on freelist")); freevnodes--; vp->v_iflag &= ~VI_FREE; vholdl(vp); mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vtryrecycle(vp); VFS_UNLOCK_GIANT(vfslocked); /* * If the recycled succeeded this vdrop will actually free * the vnode. If not it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; int done; struct proc *p = vnlruproc; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); mtx_lock(&Giant); for (;;) { kproc_suspend_check(p); mtx_lock(&vnode_free_list_mtx); if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { int vfsunlocked; if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (!VFS_NEEDSGIANT(mp)) { mtx_unlock(&Giant); vfsunlocked = 1; } else vfsunlocked = 0; done += vlrureclaim(mp); if (vfsunlocked) mtx_lock(&Giant); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); if (done == 0) { EVENTHANDLER_INVOKE(vfs_lowvnodes, desiredvnodes / 10); #if 0 /* These messages are temporary debugging aids */ if (vnlru_nowhere < 5) printf("vnlru process getting nowhere..\n"); else if (vnlru_nowhere == 5) printf("vnlru process messages stopped.\n"); #endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else uio_yield(); } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ void vdestroy(struct vnode *vp) { struct bufobj *bo; CTR1(KTR_VFS, "vdestroy vp %p", vp); mtx_lock(&vnode_free_list_mtx); numvnodes--; mtx_unlock(&vnode_free_list_mtx); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) destroy_vpollinfo(vp->v_pollinfo); #ifdef INVARIANTS /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); mtx_destroy(BO_MTX(bo)); uma_zfree(vnode_zone, vp); } /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) return (EWOULDBLOCK); /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) vgonel(vp); VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp); return (0); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp = NULL; struct bufobj *bo; mtx_lock(&vnode_free_list_mtx); /* * Lend our context to reclaim vnodes if they've exceeded the max. */ if (freevnodes > wantfreevnodes) vnlru_free(1); /* * Wait for available vnodes. */ if (numvnodes > desiredvnodes) { if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) { /* * File system is beeing suspended, we cannot risk a * deadlock here, so allocate new vnode anyway. */ if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); goto alloc; } if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ if (numvnodes > desiredvnodes) { mtx_unlock(&vnode_free_list_mtx); return (ENFILE); } #endif } alloc: numvnodes++; mtx_unlock(&vnode_free_list_mtx); vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems * opt-in. */ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE); /* * Initialize bufobj. */ bo = &vp->v_bufobj; bo->__bo_vnode = vp; mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF); bo->bo_ops = &buf_ops_bio; bo->bo_private = vp; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Finalize various vnode identity bits. */ vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_incr_usecount(vp); vp->v_data = 0; #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); else if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode()\n"); #endif if (mp != NULL) { bo->bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp); *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); vp->v_mount = NULL; VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; /* XXX non mp-safe fs may still call insmntque with vnode unlocked */ if (!VOP_ISLOCKED(vp)) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { int locked; KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); #ifdef DEBUG_VFS_LOCKS if (!VFS_NEEDSGIANT(mp)) ASSERT_VOP_ELOCKED(vp, "insmntque: mp-safe fs and non-locked vp"); #endif MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) { locked = VOP_ISLOCKED(vp); if (!locked || (locked == LK_EXCLUSIVE && (vp->v_vflag & VV_FORCEINSMQ) == 0)) { MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_pip_wait(bo->bo_object, "bovlbx"); VM_OBJECT_UNLOCK(bo->bo_object); } BO_LOCK(bo); } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); VM_OBJECT_UNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_LOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { continue; } lblkno = 0; xflags = 0; if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if (bp->b_bufobj != bo) { /* XXX: necessary ? */ BUF_UNLOCK(bp); BO_LOCK(bo); return (EAGAIN); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp != NULL && (nbp->b_bufobj != bo || nbp->b_lblkno != lblkno || (nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags)) break; /* nbp invalid */ } return (retval); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize) { struct buf *bp, *nbp; int anyfreed; int trunclbn; struct bufobj *bo; CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length); /* * Round up to the *next* lbn. */ trunclbn = (length + blksize - 1) / blksize; ASSERT_VOP_LOCKED(vp, "vtruncbuf"); restart: bo = &vp->v_bufobj; BO_LOCK(bo); anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { goto restart; } BO_LOCK(bo); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { goto restart; } BO_LOCK(bo); } } if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo)) == ENOLCK) { goto restart; } VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * buf_splay() - splay tree core for the clean/dirty list of buffers in * a vnode. * * NOTE: We have to deal with the special case of a background bitmap * buffer, a situation where two buffers will have the same logical * block offset. We want (1) only the foreground buffer to be accessed * in a lookup and (2) must differentiate between the foreground and * background buffer in the splay tree algorithm because the splay * tree cannot normally handle multiple entities with the same 'index'. * We accomplish this by adding differentiating flags to the splay tree's * numerical domain. */ static struct buf * buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root) { struct buf dummy; struct buf *lefttreemax, *righttreemin, *y; if (root == NULL) return (NULL); lefttreemax = righttreemin = &dummy; for (;;) { if (lblkno < root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_left) == NULL) break; if (lblkno < y->b_lblkno) { /* Rotate right. */ root->b_left = y->b_right; y->b_right = root; root = y; if ((y = root->b_left) == NULL) break; } /* Link into the new root's right tree. */ righttreemin->b_left = root; righttreemin = root; } else if (lblkno > root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_right) == NULL) break; if (lblkno > y->b_lblkno) { /* Rotate left. */ root->b_right = y->b_left; y->b_left = root; root = y; if ((y = root->b_right) == NULL) break; } /* Link into the new root's left tree. */ lefttreemax->b_right = root; lefttreemax = root; } else { break; } root = y; } /* Assemble the new root. */ lefttreemax->b_right = root->b_left; righttreemin->b_left = root->b_right; root->b_left = dummy.b_right; root->b_right = dummy.b_left; return (root); } static void buf_vlist_remove(struct buf *bp) { struct buf *root; struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_LOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; if (bp != bv->bv_root) { root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); KASSERT(root == bp, ("splay lookup failed in remove")); } if (bp->b_left == NULL) { root = bp->b_right; } else { root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left); root->b_right = bp->b_right; } bv->bv_root = root; TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list using a * splay tree algorithm. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct buf *root; struct bufv *bv; ASSERT_BO_LOCKED(bo); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); if (root == NULL) { bp->b_left = NULL; bp->b_right = NULL; TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); } else if (bp->b_lblkno < root->b_lblkno || (bp->b_lblkno == root->b_lblkno && (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { bp->b_left = root->b_left; bp->b_right = root; root->b_left = NULL; TAILQ_INSERT_BEFORE(root, bp, b_bobufs); } else { bp->b_right = root->b_right; bp->b_left = root; root->b_right = NULL; TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs); } bv->bv_cnt++; bv->bv_root = bp; } /* * Lookup a buffer using the splay tree. Note that we specifically avoid * shadow buffers used in background bitmap writes. * * This code isn't quite efficient as it could be because we are maintaining * two sorted lists and do not know which list the block resides in. * * During a "make buildworld" the desired buffer is found at one of * the roots more than 60% of the time. Thus, checking both roots * before performing either splay eliminates unnecessary splays on the * first tree splayed. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); if ((bp = bo->bo_clean.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_dirty.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_clean.bv_root) != NULL) { bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } if ((bp = bo->bo_dirty.bv_root) != NULL) { bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } return (NULL); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_LOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT) bp->b_flags |= B_NEEDSGIANT; bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_flags &= ~B_NEEDSGIANT; bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int queue, slot; ASSERT_BO_LOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ : WI_MPSAFEQ; LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = (*bo)->__bo_vnode; /* XXX */ if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *gnext, *next; struct synclist *gslp, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int first_printf; int error; last_work_seen = 0; syncer_final_iter = 0; first_printf = 1; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining..."); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno]; gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno]; gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && LIST_EMPTY(gslp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } } if (!LIST_EMPTY(gslp)) { mtx_unlock(&sync_mtx); mtx_lock(&Giant); mtx_lock(&sync_mtx); while (!LIST_EMPTY(gslp)) { error = sync_vnode(gslp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(gnext, bo, bo_synclist); continue; } } mtx_unlock(&Giant); } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } /* * Increment the use and hold counts on the vnode, taking care to reference * the driver's usecount if this is a chardev. The vholdl() will remove * the vnode from the free list if it is presently free. Requires the * vnode interlock and returns with it held. */ static void v_incr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } vholdl(vp); } /* * Turn a holdcnt into a use+holdcnt such that only one call to * v_decr_usecount is needed. */ static void v_upgrade_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_upgrade_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Decrement the vnode use and hold count along with the driver's usecount * if this is a chardev. The vdropl() below releases the vnode interlock * as it may free the vnode. */ static void v_decr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_usecount: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } vdropl(vp); } /* * Decrement only the use count and driver use count. This is intended to * be paired with a follow on vdropl() to release the remaining hold count. * In this way we may vgone() a vnode with a 0 usecount without risk of * having it end up on a free list because the hold count is kept above 0. */ static void v_decr_useonly(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_useonly: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VI_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. */ int vget(struct vnode *vp, int flags, struct thread *td) { int error; error = 0; VFS_ASSERT_GIANT(vp->v_mount); VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vget: invalid lock operation")); if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); vholdl(vp); if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) { vdrop(vp); return (error); } if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); VI_LOCK(vp); /* Upgrade our holdcnt to a usecount. */ v_upgrade_usecount(vp); /* * We don't guarantee that any particular close will * trigger inactive processing so just make a best effort * here at preventing a reference to a removed file. If * we don't succeed no harm is done. */ if (vp->v_iflag & VI_OWEINACT) { if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && (flags & LK_NOWAIT) == 0) vinactive(vp, td); vp->v_iflag &= ~VI_OWEINACT; } VI_UNLOCK(vp); return (0); } /* * Increase the reference count of a vnode. */ void vref(struct vnode *vp) { VI_LOCK(vp); v_incr_usecount(vp); VI_UNLOCK(vp); } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism other * than the VI lock is used to stop other processes from gaining references * to the vnode. This may be the case if the caller holds the only reference. * This is also useful when stale data is acceptable as race conditions may * be accounted for by some other means. */ int vrefcnt(struct vnode *vp) { int usecnt; VI_LOCK(vp); usecnt = vp->v_usecount; VI_UNLOCK(vp); return (usecnt); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(struct vnode *vp) { struct thread *td = curthread; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); VFS_ASSERT_GIANT(vp->v_mount); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vrele: missed vn_close")); if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); #endif VI_UNLOCK(vp); panic("vrele: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. */ vp->v_iflag |= VI_OWEINACT; if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) { VI_LOCK(vp); if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0); } else { VI_LOCK(vp); if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; } vdropl(vp); } /* * Release an already locked vnode. This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() acquires the lock internally.) */ void vput(struct vnode *vp) { struct thread *td = curthread; /* XXX */ int error; KASSERT(vp != NULL, ("vput: null vp")); ASSERT_VOP_LOCKED(vp, "vput"); VFS_ASSERT_GIANT(vp->v_mount); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vput: missed vn_close")); error = 0; if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { VOP_UNLOCK(vp, 0); v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); vp->v_iflag |= VI_OWEINACT; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE|LK_INTERLOCK|LK_NOWAIT); VI_LOCK(vp); if (error) { if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; goto done; } } if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0); done: vdropl(vp); } /* * Somebody doesn't want the vnode recycled. */ void vhold(struct vnode *vp) { VI_LOCK(vp); vholdl(vp); VI_UNLOCK(vp); } void vholdl(struct vnode *vp) { vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). */ void vdrop(struct vnode *vp) { VI_LOCK(vp); vdropl(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we will free it if it has been vgone'd otherwise it is * placed on the free list. */ void vdropl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, "vdropl"); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); vp->v_holdcnt--; if (vp->v_holdcnt == 0) { if (vp->v_iflag & VI_DOOMED) { vdestroy(vp); return; } else vfree(vp); } VI_UNLOCK(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. */ static void vinactive(struct vnode *vp, struct thread *td) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush( struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR1(KTR_VFS, "vflush: mp %p", mp); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0) return (error); vput(rootvp); } MNT_ILOCK(mp); loop: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); vholdl(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_ILOCK(mp); MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0); vdrop(vp); MNT_ILOCK(mp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount == 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0); vdropl(vp); MNT_ILOCK(mp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { VNASSERT(vp->v_usecount == 0 || (vp->v_type != VCHR && vp->v_type != VBLK), vp, ("device VNODE %p is FORCECLOSED", vp)); vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif } VOP_UNLOCK(vp, 0); vdropl(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp, 0); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) return (EBUSY); for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp, struct thread *td) { int recycled; ASSERT_VOP_ELOCKED(vp, "vrecycle"); recycled = 0; VI_LOCK(vp); if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } VI_UNLOCK(vp); return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * vgone, with the vp interlock held. */ void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; struct mount *mp; CTR1(KTR_VFS, "vgonel: vp %p", vp); ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) vinvalbuf(vp, 0, 0, 0); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Clear the advisory locks and wake up waiting threads. */ lf_purgelocks(vp, &(vp->v_lockf)); /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(struct vnode *vp) { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(struct cdev *dev) { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_TEXT) strlcat(buf, "|VV_TEXT", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_AGE) strlcat(buf, "|VI_AGE", sizeof(buf)); if (vp->v_iflag & VI_DOOMED) strlcat(buf, "|VI_DOOMED", sizeof(buf)); if (vp->v_iflag & VI_FREE) strlcat(buf, "|VI_FREE", sizeof(buf)); if (vp->v_iflag & VI_OBJDIRTY) strlcat(buf, "|VI_OBJDIRTY", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE | VI_OBJDIRTY | VI_DOINGINACT | VI_OWEINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count); printf(" "); lockmgr_printinfo(vp->v_vnlock); printf("\n"); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp, *nmp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vprint("", vp); } nmp = TAILQ_NEXT(mp, mnt_list); } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct statfs *sp; struct vnode *vp; char buf[512]; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. */ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; flags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_NOINSMNTQ); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_MPSAFE); MNT_KERN_FLAG(MNTK_NOKNOTE); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (mp->mnt_cred->cr_prison != NULL) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d\n", mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); db_printf(" mnt_noasync = %u\n", mp->mnt_noasync); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf("\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static void vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp) { strcpy(xvfsp->vfc_name, vfsp->vfc_name); xvfsp->vfc_typenum = vfsp->vfc_typenum; xvfsp->vfc_refcount = vfsp->vfc_refcount; xvfsp->vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp->vfc_vfsops = NULL; xvfsp->vfc_next = NULL; } /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; struct xvfsconf xvfsp; int error; error = 0; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp); if (error) break; } return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; struct xvfsconf xvfsp; printf("WARNING: userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp; struct thread *td; int error; KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread")); td = curthread; /* * Since this only runs when rebooting, it is not interlocked. */ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, td); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); /* * XXX: Due to the way in which we mount the root * file system off of devfs, devfs will generate a * "busy" warning when we try to unmount it before * the root. Don't print a warning as a result in * order to avoid false positive errors that may * cause needless upset. */ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } else { /* The unmount has removed mp from the mountlist */ } } } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OBJDIRTY) && (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { MNT_IUNLOCK(mp); if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); MNT_ILOCK(mp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_UNLOCK(obj); } vput(vp); } MNT_ILOCK(mp); } else VI_UNLOCK(vp); } MNT_IUNLOCK(mp); } /* * Mark a vnode as free, putting it up for recycling. */ static void vfree(struct vnode *vp) { CTR1(KTR_VFS, "vfree vp %p", vp); ASSERT_VI_LOCKED(vp, "vfree"); mtx_lock(&vnode_free_list_mtx); VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't")); VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp, ("vfree: Freeing doomed vnode")); if (vp->v_iflag & VI_AGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); } /* * Opposite of vfree() - mark a vnode as in use. */ static void vbusy(struct vnode *vp) { CTR1(KTR_VFS, "vbusy vp %p", vp); ASSERT_VI_LOCKED(vp, "vbusy"); VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed.")); mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; vp->v_iflag &= ~(VI_FREE|VI_AGE); mtx_unlock(&vnode_free_list_mtx); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); } /* * Initalize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = uma_zalloc(vnodepoll_zone, M_WAITOK); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knllocked); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ int vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp, 0); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; mtx_unlock(&sync_mtx); BO_UNLOCK(bo); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } MNT_ILOCK(mp); mp->mnt_noasync++; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY, ap->a_td); MNT_ILOCK(mp); mp->mnt_noasync--; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; MNT_IUNLOCK(mp); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } BO_UNLOCK(bo); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; error = 0; dev_lock(); if (vp->v_type != VCHR) error = ENOTBLK; else if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. * * The ifdef'd CAPABILITIES version is here for reference, but is not * actually used. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused) { accmode_t dac_granted; accmode_t priv_granted; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to supress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, ""); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, ""); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, ""); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, ""); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == 0) vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER) vfs_badlock("is not exclusive locked by another thread", str, vp); } void assert_vop_slocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED) vfs_badlock("is not locked shared but should be", str, vp); } #endif /* 0 */ #endif /* DEBUG_VFS_LOCKS */ void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp != a->a_fdvp && a->a_tvp != a->a_fdvp) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp != a->a_fvp) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } void vop_strategy_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. */ if ((bp->b_flags & B_CLUSTER) != 0) return; if (!BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } #endif } void vop_lookup_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); #endif } void vop_lookup_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; struct vnode *vp; a = ap; dvp = a->a_dvp; vp = *(a->a_vpp); ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); if (!rc) ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)"); #endif } void vop_lock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_lock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_unlock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_unlock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; if (!rc) { VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init(&fs_knlist, NULL, NULL, NULL, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { 1, NULL, filt_vfsdetach, filt_vfsread }; static struct filterops vfswrite_filtops = { 1, NULL, filt_vfsdetach, filt_vfswrite }; static struct filterops vfsvnode_filtops = { 1, NULL, filt_vfsdetach, filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0); } static int vfs_knllocked(void *arg) { struct vnode *vp = arg; return (VOP_ISLOCKED(vp) == LK_EXCLUSIVE); } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); kn->kn_data = va.va_size - kn->kn_fp->f_offset; return (kn->kn_data != 0); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; return (1); } return (kn->kn_fflags != 0); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VA_MARK_ATIME. This functionality is used by execve * and mmap, so we want to avoid the synchronous I/O implied by * directly setting va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct ucred *cred) { struct vattr atimeattr; if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) { VATTR_NULL(&atimeattr); atimeattr.va_vaflags |= VA_MARK_ATIME; (void)VOP_SETATTR(vp, &atimeattr, cred); } } diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index ad71a89eee92..e4f4f8ba3ffc 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1,4650 +1,4650 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include SDT_PROVIDER_DEFINE(vfs); SDT_PROBE_DEFINE(vfs, , stat, mode); SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *"); SDT_PROBE_ARGTYPE(vfs, , stat, mode, 1, "int"); SDT_PROBE_DEFINE(vfs, , stat, reg); SDT_PROBE_ARGTYPE(vfs, , stat, reg, 0, "char *"); SDT_PROBE_ARGTYPE(vfs, , stat, reg, 1, "int"); static int chroot_refuse_vdir_fds(struct filedesc *fdp); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int setfown(struct thread *td, struct vnode *, uid_t, gid_t); static int setfmode(struct thread *td, struct vnode *, int); static int setfflags(struct thread *td, struct vnode *, int); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); /* * The module initialization routine for POSIX asynchronous I/O will * set this to the version of AIO that it implements. (Zero means * that it is not implemented.) This value is used here by pathconf() * and in kern_descrip.c by fpathconf(). */ int async_io_version; #ifdef DEBUG static int syncprt = 0; SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); #endif /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif /* ARGSUSED */ int sync(td, uap) struct thread *td; struct sync_args *uap; { struct mount *mp, *nmp; int vfslocked; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } vfslocked = VFS_LOCK_GIANT(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { MNT_ILOCK(mp); mp->mnt_noasync++; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT, td); MNT_ILOCK(mp); mp->mnt_noasync--; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; MNT_IUNLOCK(mp); vn_finished_write(mp); } VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); return (0); } /* XXX PRISON: could be per prison flag */ static int prison_quotas; #if 0 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); #endif /* * Change filesystem quotas. */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif int quotactl(td, uap) struct thread *td; register struct quotactl_args /* { char *path; int cmd; int uid; caddr_t arg; } */ *uap; { struct mount *mp; int vfslocked; int error; struct nameidata nd; AUDIT_ARG(cmd, uap->cmd); AUDIT_ARG(uid, uap->uid); if (jailed(td->td_ucred) && !prison_quotas) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); mp = nd.ni_vp->v_mount; if ((error = vfs_busy(mp, 0))) { vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } vrele(nd.ni_vp); error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, td); vfs_unbusy(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Used by statfs conversion routines to scale the block size up if * necessary so that all of the block counts are <= 'max_size'. Note * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero * value of 'n'. */ void statfs_scale_blocks(struct statfs *sf, long max_size) { uint64_t count; int shift; KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__)); /* * Attempt to scale the block counts to give a more accurate * overview to userland of the ratio of free space to used * space. To do this, find the largest block count and compute * a divisor that lets it fit into a signed integer <= max_size. */ if (sf->f_bavail < 0) count = -sf->f_bavail; else count = sf->f_bavail; count = MAX(sf->f_blocks, MAX(sf->f_bfree, count)); if (count <= max_size) return; count >>= flsl(max_size); shift = 0; while (count > 0) { shift++; count >>=1; } sf->f_bsize <<= shift; sf->f_blocks >>= shift; sf->f_bfree >>= shift; sf->f_bavail >>= shift; } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif int statfs(td, uap) struct thread *td; register struct statfs_args /* { char *path; struct statfs *buf; } */ *uap; { struct statfs sf; int error; error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf); if (error == 0) error = copyout(&sf, uap->buf, sizeof(sf)); return (error); } int kern_statfs(struct thread *td, char *path, enum uio_seg pathseg, struct statfs *buf) { struct mount *mp; struct statfs *sp, sb; int vfslocked; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); mp = nd.ni_vp->v_mount; vfs_ref(mp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) goto out; if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } *buf = *sp; out: vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); if (mtx_owned(&Giant)) printf("statfs(%d): %s: %d\n", vfslocked, path, error); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif int fstatfs(td, uap) struct thread *td; register struct fstatfs_args /* { int fd; struct statfs *buf; } */ *uap; { struct statfs sf; int error; error = kern_fstatfs(td, uap->fd, &sf); if (error == 0) error = copyout(&sf, uap->buf, sizeof(sf)); return (error); } int kern_fstatfs(struct thread *td, int fd, struct statfs *buf) { struct file *fp; struct mount *mp; struct statfs *sp, sb; int vfslocked; struct vnode *vp; int error; AUDIT_ARG(fd, fd); error = getvnode(td->td_proc->p_fd, fd, &fp); if (error) return (error); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_SHARED | LK_RETRY); #ifdef AUDIT AUDIT_ARG(vnode, vp, ARG_VNODE1); #endif mp = vp->v_mount; if (mp) vfs_ref(mp); VOP_UNLOCK(vp, 0); fdrop(fp, td); if (vp->v_iflag & VI_DOOMED) { error = EBADF; goto out; } #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) goto out; if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } *buf = *sp; out: if (mp) vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int flags; }; #endif int getfsstat(td, uap) struct thread *td; register struct getfsstat_args /* { struct statfs *buf; long bufsize; int flags; } */ *uap; { return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE, uap->flags)); } /* * If (bufsize > 0 && bufseg == UIO_SYSSPACE) * The caller is responsible for freeing memory which will be allocated * in '*buf'. */ int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, enum uio_seg bufseg, int flags) { struct mount *mp, *nmp; struct statfs *sfsp, *sp, sb; size_t count, maxcount; int vfslocked; int error; maxcount = bufsize / sizeof(struct statfs); if (bufsize == 0) sfsp = NULL; else if (bufseg == UIO_USERSPACE) sfsp = *buf; else /* if (bufseg == UIO_SYSSPACE) */ { count = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { count++; } mtx_unlock(&mountlist_mtx); if (maxcount > count) maxcount = count; sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP, M_WAITOK); } count = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } vfslocked = VFS_LOCK_GIANT(mp); if (sfsp && count < maxcount) { sp = &mp->mnt_stat; /* * Set these in case the underlying filesystem * fails to do so. */ sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY * overrides MNT_WAIT. */ if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 || (flags & MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) { VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); continue; } if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } if (bufseg == UIO_SYSSPACE) bcopy(sp, sfsp, sizeof(*sp)); else /* if (bufseg == UIO_USERSPACE) */ { error = copyout(sp, sfsp, sizeof(*sp)); if (error) { vfs_unbusy(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } } sfsp++; } VFS_UNLOCK_GIANT(vfslocked); count++; mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); if (sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } #ifdef COMPAT_FREEBSD4 /* * Get old format filesystem statistics. */ static void cvtstatfs(struct statfs *, struct ostatfs *); #ifndef _SYS_SYSPROTO_H_ struct freebsd4_statfs_args { char *path; struct ostatfs *buf; }; #endif int freebsd4_statfs(td, uap) struct thread *td; struct freebsd4_statfs_args /* { char *path; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; int error; error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fstatfs_args { int fd; struct ostatfs *buf; }; #endif int freebsd4_fstatfs(td, uap) struct thread *td; struct freebsd4_fstatfs_args /* { int fd; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; int error; error = kern_fstatfs(td, uap->fd, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_getfsstat_args { struct ostatfs *buf; long bufsize; int flags; }; #endif int freebsd4_getfsstat(td, uap) struct thread *td; register struct freebsd4_getfsstat_args /* { struct ostatfs *buf; long bufsize; int flags; } */ *uap; { struct statfs *buf, *sp; struct ostatfs osb; size_t count, size; int error; count = uap->bufsize / sizeof(struct ostatfs); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags); if (size > 0) { count = td->td_retval[0]; sp = buf; while (count > 0 && error == 0) { cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_TEMP); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fhstatfs_args { struct fhandle *u_fhp; struct ostatfs *buf; }; #endif int freebsd4_fhstatfs(td, uap) struct thread *td; struct freebsd4_fhstatfs_args /* { struct fhandle *u_fhp; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); error = kern_fhstatfs(td, fh, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void cvtstatfs(nsp, osp) struct statfs *nsp; struct ostatfs *osp; { statfs_scale_blocks(nsp, LONG_MAX); bzero(osp, sizeof(*osp)); osp->f_bsize = nsp->f_bsize; osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = MIN(nsp->f_files, LONG_MAX); osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); osp->f_owner = nsp->f_owner; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, OMFSNAMELEN)); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, OMNAMELEN)); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, OMNAMELEN)); osp->f_fsid = nsp->f_fsid; } #endif /* COMPAT_FREEBSD4 */ /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif int fchdir(td, uap) struct thread *td; struct fchdir_args /* { int fd; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; struct vnode *vp, *tdp, *vpold; struct mount *mp; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); if ((error = getvnode(fdp, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; VREF(vp); fdrop(fp, td); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG(vnode, vp, ARG_VNODE1); error = change_dir(vp, td); while (!error && (mp = vp->v_mountedhere) != NULL) { int tvfslocked; if (vfs_busy(mp, 0)) continue; tvfslocked = VFS_LOCK_GIANT(mp); error = VFS_ROOT(mp, LK_EXCLUSIVE, &tdp, td); vfs_unbusy(mp); if (error) { VFS_UNLOCK_GIANT(tvfslocked); break; } vput(vp); VFS_UNLOCK_GIANT(vfslocked); vp = tdp; vfslocked = tvfslocked; } if (error) { vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } VOP_UNLOCK(vp, 0); VFS_UNLOCK_GIANT(vfslocked); FILEDESC_XLOCK(fdp); vpold = fdp->fd_cdir; fdp->fd_cdir = vp; FILEDESC_XUNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(vpold->v_mount); vrele(vpold); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* * Change current working directory (``.''). */ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif int chdir(td, uap) struct thread *td; struct chdir_args /* { char *path; } */ *uap; { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, char *path, enum uio_seg pathseg) { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; struct vnode *vp; int vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } VOP_UNLOCK(nd.ni_vp, 0); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); FILEDESC_XLOCK(fdp); vp = fdp->fd_cdir; fdp->fd_cdir = nd.ni_vp; FILEDESC_XUNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(fdp) struct filedesc *fdp; { struct vnode *vp; struct file *fp; int fd; FILEDESC_LOCK_ASSERT(fdp); for (fd = 0; fd < fdp->fd_nfiles ; fd++) { fp = fget_locked(fdp, fd); if (fp == NULL) continue; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, ""); /* * Change notion of root (``/'') directory. */ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif int chroot(td, uap) struct thread *td; struct chroot_args /* { char *path; } */ *uap; { int error; struct nameidata nd; int vfslocked; error = priv_check(td, PRIV_VFS_CHROOT); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) goto error; vfslocked = NDHASGIANT(&nd); if ((error = change_dir(nd.ni_vp, td)) != 0) goto e_vunlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp))) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp, 0); error = change_root(nd.ni_vp, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); e_vunlock: vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); error: NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. */ int change_dir(vp, td) struct vnode *vp; struct thread *td; { int error; ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_vnode_check_chdir(td->td_ucred, vp); if (error) return (error); #endif error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); return (error); } /* * Common routine for kern_chroot() and jail_attach(). The caller is * responsible for invoking priv_check() and mac_vnode_check_chroot() to * authorize this operation. */ int change_root(vp, td) struct vnode *vp; struct thread *td; { struct filedesc *fdp; struct vnode *oldvp; int vfslocked; int error; VFS_ASSERT_GIANT(vp->v_mount); fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); if (error) { FILEDESC_XUNLOCK(fdp); return (error); } } oldvp = fdp->fd_rdir; fdp->fd_rdir = vp; VREF(fdp->fd_rdir); if (!fdp->fd_jdir) { fdp->fd_jdir = vp; VREF(fdp->fd_jdir); } FILEDESC_XUNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(oldvp->v_mount); vrele(oldvp); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* * Check permissions, allocate an open file structure, and call the device * open routine if any. */ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int open(td, uap) struct thread *td; register struct open_args /* { char *path; int flags; int mode; } */ *uap; { return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct openat_args { int fd; char *path; int flag; int mode; }; #endif int openat(struct thread *td, struct openat_args *uap) { return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->mode)); } int kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags, int mode) { return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode)); } int kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct vattr vat; struct mount *mp; int cmode; struct file *nfp; int type, indx, error; struct flock lf; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, flags); AUDIT_ARG(mode, mode); /* XXX: audit dirfd */ /* * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR may * be specified. */ if (flags & O_EXEC) { if (flags & O_ACCMODE) return (EINVAL); } else if ((flags & O_ACCMODE) == O_ACCMODE) return (EINVAL); else flags = FFLAGS(flags); error = falloc(td, &nfp, &indx); if (error) return (error); /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; /* Set the flags early so the finit in devfs can pick them up. */ fp->f_flag = flags & FMASK; cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; NDINIT_AT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, fd, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open(&nd, &flags, cmode, fp); if (error) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) { fdrop(fp, td); td->td_retval[0] = indx; return (0); } /* * handle special fdopen() case. bleh. dupfdopen() is * responsible for dropping the old contents of ofiles[indx] * if it succeeds. */ if ((error == ENODEV || error == ENXIO) && td->td_dupfd >= 0 && /* XXX from fdopen */ (error = dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { td->td_retval[0] = indx; fdrop(fp, td); return (0); } /* * Clean up the descriptor, but only if another thread hadn't * replaced or closed it. */ fdclose(fdp, fp, indx, td); fdrop(fp, td); if (error == ERESTART) error = EINTR; return (error); } td->td_dupfd = 0; vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; fp->f_vnode = vp; /* XXX Does devfs need this? */ /* * If the file wasn't claimed by devfs bind it to the normal * vnode operations here. */ if (fp->f_ops == &badfileops) { KASSERT(vp->v_type != VFIFO, ("Unexpected fifo.")); fp->f_seqcount = 1; finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops); } VOP_UNLOCK(vp, 0); if (flags & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (flags & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((flags & FNONBLOCK) == 0) type |= F_WAIT; if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) goto bad; atomic_set_int(&fp->f_flag, FHASLOCK); } if (flags & O_TRUNC) { if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto bad; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif error = VOP_SETATTR(vp, &vat, td->td_ucred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (error) goto bad; } VFS_UNLOCK_GIANT(vfslocked); /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: VFS_UNLOCK_GIANT(vfslocked); fdclose(fdp, fp, indx, td); fdrop(fp, td); return (error); } #ifdef COMPAT_43 /* * Create a file. */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(td, uap) struct thread *td; register struct ocreat_args /* { char *path; int mode; } */ *uap; { return (kern_open(td, uap->path, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknod_args { char *path; int mode; int dev; }; #endif int mknod(td, uap) struct thread *td; register struct mknod_args /* { char *path; int mode; int dev; } */ *uap; { return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #ifndef _SYS_SYSPROTO_H_ struct mknodat_args { int fd; char *path; mode_t mode; dev_t dev; }; #endif int mknodat(struct thread *td, struct mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode, int dev) { return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev)); } int kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int mode, int dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; int error; int whiteout = 0; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); AUDIT_ARG(dev, dev); switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = priv_check(td, PRIV_VFS_MKNOD_DEV); break; case S_IFMT: error = priv_check(td, PRIV_VFS_MKNOD_BAD); break; case S_IFWHT: error = priv_check(td, PRIV_VFS_MKNOD_WHT); break; case S_IFIFO: if (dev == 0) return (kern_mkfifoat(td, fd, path, pathseg, mode)); /* FALLTHROUGH */ default: error = EINVAL; break; } if (error) return (error); restart: bwillwrite(); NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } else { VATTR_NULL(&vattr); FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFMT: /* used by badsect to flag bad sectors */ vattr.va_type = VBAD; break; case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: panic("kern_mknod: invalid mode"); } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (!error) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif int mkfifo(td, uap) struct thread *td; register struct mkfifo_args /* { char *path; int mode; } */ *uap; { return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkfifoat_args { int fd; char *path; mode_t mode; }; #endif int mkfifoat(struct thread *td, struct mkfifoat_args *uap) { return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode) { return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode)); } int kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); restart: bwillwrite(); NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); #ifdef MAC out: #endif vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif int link(td, uap) struct thread *td; register struct link_args /* { char *path; char *link; } */ *uap; { return (kern_link(td, uap->path, uap->link, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct linkat_args { int fd1; char *path1; int fd2; char *path2; int flag; }; #endif int linkat(struct thread *td, struct linkat_args *uap) { int flag; flag = uap->flag; if (flag & ~AT_SYMLINK_FOLLOW) return (EINVAL); return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2, UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW)); } static int hardlink_check_uid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, &hardlink_check_uid, 0, "Unprivileged processes cannot create hard links to files owned by other " "users"); static int hardlink_check_gid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, &hardlink_check_gid, 0, "Unprivileged processes cannot create hard links to files owned by other " "groups"); static int can_hardlink(struct vnode *vp, struct ucred *cred) { struct vattr va; int error; if (!hardlink_check_uid && !hardlink_check_gid) return (0); error = VOP_GETATTR(vp, &va, cred); if (error != 0) return (error); if (hardlink_check_uid && cred->cr_uid != va.va_uid) { error = priv_check_cred(cred, PRIV_VFS_LINK, 0); if (error) return (error); } if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { error = priv_check_cred(cred, PRIV_VFS_LINK, 0); if (error) return (error); } return (0); } int kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg) { return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW)); } int kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2, enum uio_seg segflg, int follow) { struct vnode *vp; struct mount *mp; struct nameidata nd; int vfslocked; int lvfslocked; int error; bwillwrite(); NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, segflg, path1, fd1, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type == VDIR) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EPERM); /* POSIX */ } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, segflg, path2, fd2, td); if ((error = namei(&nd)) == 0) { lvfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULL) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); error = EEXIST; } else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) == 0) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = can_hardlink(vp, td->td_ucred); if (error == 0) #ifdef MAC error = mac_vnode_check_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error == 0) #endif error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_UNLOCK(vp, 0); vput(nd.ni_dvp); } NDFREE(&nd, NDF_ONLY_PNBUF); VFS_UNLOCK_GIANT(lvfslocked); } vrele(vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Make a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif int symlink(td, uap) struct thread *td; register struct symlink_args /* { char *path; char *link; } */ *uap; { return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct symlinkat_args { char *path; int fd; char *path2; }; #endif int symlinkat(struct thread *td, struct symlinkat_args *uap) { return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2, UIO_USERSPACE)); } int kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg) { return (kern_symlinkat(td, path, AT_FDCWD, link, segflg)); } int kern_symlinkat(struct thread *td, char *path1, int fd, char *path2, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; char *syspath; int error; struct nameidata nd; int vfslocked; if (segflg == UIO_SYSSPACE) { syspath = path1; } else { syspath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0) goto out; } AUDIT_ARG(text, syspath); restart: bwillwrite(); NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, segflg, path2, fd, td); if ((error = namei(&nd)) != 0) goto out; vfslocked = NDHASGIANT(&nd); if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; goto restart; } VATTR_NULL(&vattr); FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); #ifdef MAC vattr.va_type = VLNK; error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out2; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); if (error == 0) vput(nd.ni_vp); #ifdef MAC out2: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, syspath); return (error); } /* * Delete a whiteout from the filesystem. */ int undelete(td, uap) struct thread *td; register struct undelete_args /* { char *path; } */ *uap; { int error; struct mount *mp; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif int unlink(td, uap) struct thread *td; struct unlink_args /* { char *path; } */ *uap; { return (kern_unlink(td, uap->path, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct unlinkat_args { int fd; char *path; int flag; }; #endif int unlinkat(struct thread *td, struct unlinkat_args *uap) { int flag = uap->flag; int fd = uap->fd; char *path = uap->path; if (flag & ~AT_REMOVEDIR) return (EINVAL); if (flag & AT_REMOVEDIR) return (kern_rmdirat(td, fd, path, UIO_USERSPACE)); else return (kern_unlinkat(td, fd, path, UIO_USERSPACE)); } int kern_unlink(struct thread *td, char *path, enum uio_seg pathseg) { return (kern_unlinkat(td, AT_FDCWD, path, pathseg)); } int kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT_AT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error == EINVAL ? EPERM : error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? */ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int lseek(td, uap) struct thread *td; register struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ *uap; { struct ucred *cred = td->td_ucred; struct file *fp; struct vnode *vp; struct vattr vattr; off_t offset; int error, noneg; int vfslocked; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) { fdrop(fp, td); return (ESPIPE); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); noneg = (vp->v_type != VCHR); offset = uap->offset; switch (uap->whence) { case L_INCR: if (noneg && (fp->f_offset < 0 || (offset > 0 && fp->f_offset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += fp->f_offset; break; case L_XTND: vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, cred); VOP_UNLOCK(vp, 0); if (error) break; if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += vattr.va_size; break; case L_SET: break; case SEEK_DATA: error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); break; case SEEK_HOLE: error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) goto drop; fp->f_offset = offset; *(off_t *)(td->td_retval) = fp->f_offset; drop: fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } #if defined(COMPAT_43) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(td, uap) struct thread *td; register struct olseek_args /* { int fd; long offset; int whence; } */ *uap; { struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ nuap; nuap.fd = uap->fd; nuap.offset = uap->offset; nuap.whence = uap->whence; return (lseek(td, &nuap)); } #endif /* COMPAT_43 */ /* Version with the 'pad' argument */ int freebsd6_lseek(td, uap) struct thread *td; register struct freebsd6_lseek_args *uap; { struct lseek_args ouap; ouap.fd = uap->fd; ouap.offset = uap->offset; ouap.whence = uap->whence; return (lseek(td, &ouap)); } /* * Check access permissions using passed credentials. */ static int vn_access(vp, user_flags, cred, td) struct vnode *vp; int user_flags; struct ucred *cred; struct thread *td; { int error; accmode_t accmode; /* Flags == 0 means only check for existence. */ error = 0; if (user_flags) { accmode = 0; if (user_flags & R_OK) accmode |= VREAD; if (user_flags & W_OK) accmode |= VWRITE; if (user_flags & X_OK) accmode |= VEXEC; #ifdef MAC error = mac_vnode_check_access(cred, vp, accmode); if (error) return (error); #endif if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, accmode, cred, td); } return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int flags; }; #endif int access(td, uap) struct thread *td; register struct access_args /* { char *path; int flags; } */ *uap; { return (kern_access(td, uap->path, UIO_USERSPACE, uap->flags)); } #ifndef _SYS_SYSPROTO_H_ struct faccessat_args { int dirfd; char *path; int mode; int flag; } #endif int faccessat(struct thread *td, struct faccessat_args *uap) { if (uap->flag & ~AT_EACCESS) return (EINVAL); return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->mode)); } int kern_access(struct thread *td, char *path, enum uio_seg pathseg, int mode) { return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, mode)); } int kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int flags, int mode) { struct ucred *cred, *tmpcred; struct vnode *vp; struct nameidata nd; int vfslocked; int error; /* * Create and modify a temporary credential instead of one that * is potentially shared. This could also mess up socket * buffer accounting which can run in an interrupt context. */ if (!(flags & AT_EACCESS)) { cred = td->td_ucred; tmpcred = crdup(cred); tmpcred->cr_uid = cred->cr_ruid; tmpcred->cr_groups[0] = cred->cr_rgid; td->td_ucred = tmpcred; } else cred = tmpcred = td->td_ucred; NDINIT_AT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) goto out1; vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; error = vn_access(vp, mode, tmpcred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); out1: if (!(flags & AT_EACCESS)) { td->td_ucred = cred; crfree(tmpcred); } return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int flags; }; #endif int eaccess(td, uap) struct thread *td; register struct eaccess_args /* { char *path; int flags; } */ *uap; { return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->flags)); } int kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int flags) { return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, flags)); } #if defined(COMPAT_43) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif int ostat(td, uap) struct thread *td; register struct ostat_args /* { char *path; struct ostat *ub; } */ *uap; { struct stat sb; struct ostat osb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif int olstat(td, uap) struct thread *td; register struct olstat_args /* { char *path; struct ostat *ub; } */ *uap; { struct stat sb; struct ostat osb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Convert from an old to a new stat structure. */ void cvtstat(st, ost) struct stat *st; struct ostat *ost; { ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; ost->st_atime = st->st_atime; ost->st_mtime = st->st_mtime; ost->st_ctime = st->st_ctime; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 */ /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct stat_args { char *path; struct stat *ub; }; #endif int stat(td, uap) struct thread *td; register struct stat_args /* { char *path; struct stat *ub; } */ *uap; { struct stat sb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error == 0) error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fstatat_args { int fd; char *path; struct stat *buf; int flag; } #endif int fstatat(struct thread *td, struct fstatat_args *uap) { struct stat sb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb); if (error == 0) error = copyout(&sb, uap->buf, sizeof (sb)); return (error); } int kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp)); } int kern_statat(struct thread *td, int flag, int fd, char *path, enum uio_seg pathseg, struct stat *sbp) { struct nameidata nd; struct stat sb; int error, vfslocked; if (flag & ~AT_SYMLINK_NOFOLLOW) return (EINVAL); NDINIT_AT(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); if (!error) { SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0); if (S_ISREG(sb.st_mode)) SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); if (mtx_owned(&Giant)) printf("stat(%d): %s\n", vfslocked, path); if (error) return (error); *sbp = sb; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat(&sb); #endif return (0); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif int lstat(td, uap) struct thread *td; register struct lstat_args /* { char *path; struct stat *ub; } */ *uap; { struct stat sb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error == 0) error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } int kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg, sbp)); } /* * Implementation of the NetBSD [l]stat() functions. */ void cvtnstat(sb, nsb) struct stat *sb; struct nstat *nsb; { bzero(nsb, sizeof *nsb); nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atimespec = sb->st_atimespec; nsb->st_mtimespec = sb->st_mtimespec; nsb->st_ctimespec = sb->st_ctimespec; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_birthtimespec = sb->st_birthtimespec; } #ifndef _SYS_SYSPROTO_H_ struct nstat_args { char *path; struct nstat *ub; }; #endif int nstat(td, uap) struct thread *td; register struct nstat_args /* { char *path; struct nstat *ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif int nlstat(td, uap) struct thread *td; register struct nlstat_args /* { char *path; struct nstat *ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif int pathconf(td, uap) struct thread *td; register struct pathconf_args /* { char *path; int name; } */ *uap; { return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name)); } int kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name) { struct nameidata nd; int error, vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); /* If asynchronous I/O is available, it works for all files. */ if (name == _PC_ASYNC_IO) td->td_retval[0] = async_io_version; else error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval); vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Return target name of a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; size_t count; }; #endif int readlink(td, uap) struct thread *td; register struct readlink_args /* { char *path; char *buf; size_t count; } */ *uap; { return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } #ifndef _SYS_SYSPROTO_H_ struct readlinkat_args { int fd; char *path; char *buf; size_t bufsize; }; #endif int readlinkat(struct thread *td, struct readlinkat_args *uap) { return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->bufsize)); } int kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count) { return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg, count)); } int kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count) { struct vnode *vp; struct iovec aiov; struct uio auio; int error; struct nameidata nd; int vfslocked; NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; #ifdef MAC error = mac_vnode_check_readlink(td->td_ucred, vp); if (error) { vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #endif if (vp->v_type != VLNK) error = EINVAL; else { aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); } vput(vp); VFS_UNLOCK_GIANT(vfslocked); td->td_retval[0] = count - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(td, vp, flags) struct thread *td; struct vnode *vp; int flags; { int error; struct mount *mp; struct vattr vattr; /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); if (error) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_flags = flags; #ifdef MAC error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { char *path; int flags; }; #endif int chflags(td, uap) struct thread *td; register struct chflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, uap->flags); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Same as chflags() but doesn't follow symlinks. */ int lchflags(td, uap) struct thread *td; register struct lchflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, uap->flags); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; int flags; }; #endif int fchflags(td, uap) struct thread *td; register struct fchflags_args /* { int fd; int flags; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(fflags, uap->flags); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0); #endif error = setfflags(td, fp->f_vnode, uap->flags); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ static int setfmode(td, vp, mode) struct thread *td; struct vnode *vp; int mode; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_vnode_check_setmode(td->td_ucred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif int chmod(td, uap) struct thread *td; register struct chmod_args /* { char *path; int mode; } */ *uap; { return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct fchmodat_args { int dirfd; char *path; mode_t mode; int flag; } #endif int fchmodat(struct thread *td, struct fchmodat_args *uap) { int flag = uap->flag; int fd = uap->fd; char *path = uap->path; mode_t mode = uap->mode; if (flag & ~AT_SYMLINK_NOFOLLOW) return (EINVAL); return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag)); } int kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode) { return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0)); } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif int lchmod(td, uap) struct thread *td; register struct lchmod_args /* { char *path; int mode; } */ *uap; { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, AT_SYMLINK_NOFOLLOW)); } int kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg, mode_t mode, int flag) { int error; struct nameidata nd; int vfslocked; int follow; AUDIT_ARG(mode, mode); follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, mode); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Change mode of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif int fchmod(td, uap) struct thread *td; register struct fchmod_args /* { int fd; int mode; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(mode, uap->mode); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0); #endif error = setfmode(td, fp->f_vnode, uap->mode); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ static int setfown(td, vp, uid, gid) struct thread *td; struct vnode *vp; uid_t uid; gid_t gid; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_vnode_check_setowner(td->td_ucred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif int chown(td, uap) struct thread *td; register struct chown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } #ifndef _SYS_SYSPROTO_H_ struct fchownat_args { int fd; const char * path; uid_t uid; gid_t gid; int flag; }; #endif int fchownat(struct thread *td, struct fchownat_args *uap) { int flag; flag = uap->flag; if (flag & ~AT_SYMLINK_NOFOLLOW) return (EINVAL); return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid, uap->gid, uap->flag)); } int kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0)); } int kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int uid, int gid, int flag) { struct nameidata nd; int error, vfslocked, follow; AUDIT_ARG(owner, uid, gid); follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, uid, gid); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif int lchown(td, uap) struct thread *td; register struct lchown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } int kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, AT_SYMLINK_NOFOLLOW)); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif int fchown(td, uap) struct thread *td; register struct fchown_args /* { int fd; int uid; int gid; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(owner, uap->uid, uap->gid); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0); #endif error = setfown(td, fp->f_vnode, uap->uid, uap->gid); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(usrtvp, tvpseg, tsp) const struct timeval *usrtvp; enum uio_seg tvpseg; struct timespec *tsp; { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { microtime(&tv[0]); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) return (EINVAL); TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return (0); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int setutimes(td, vp, ts, numtimes, nullflag) struct thread *td; struct vnode *vp; const struct timespec *ts; int numtimes; int nullflag; { int error, setbirthtime; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); setbirthtime = 0; if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = 1; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif int utimes(td, uap) struct thread *td; register struct utimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct futimesat_args { int fd; const char * path; const struct timeval * times; }; #endif int futimesat(struct thread *td, struct futimesat_args *uap) { return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE)); } int kern_utimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg)); } int kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct nameidata nd; struct timespec ts[2]; int error, vfslocked; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT_AT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif int lutimes(td, uap) struct thread *td; register struct lutimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; int error; struct nameidata nd; int vfslocked; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif int futimes(td, uap) struct thread *td; register struct futimes_args /* { int fd; struct timeval *tptr; } */ *uap; { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, fd); if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0); #endif error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif int truncate(td, uap) struct thread *td; register struct truncate_args /* { char *path; int pad; off_t length; } */ *uap; { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; if (length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, td->td_ucred); } vput(vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #if defined(COMPAT_43) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif int otruncate(td, uap) struct thread *td; register struct otruncate_args /* { char *path; long length; } */ *uap; { struct truncate_args /* { char *path; int pad; off_t length; } */ nuap; nuap.path = uap->path; nuap.length = uap->length; return (truncate(td, &nuap)); } #endif /* COMPAT_43 */ /* Versions with the pad argument */ int freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap) { struct truncate_args ouap; ouap.path = uap->path; ouap.length = uap->length; return (truncate(td, &ouap)); } int freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap) { struct ftruncate_args ouap; ouap.fd = uap->fd; ouap.length = uap->length; return (ftruncate(td, &ouap)); } /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif int fsync(td, uap) struct thread *td; struct fsync_args /* { int fd; } */ *uap; { struct vnode *vp; struct mount *mp; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG(vnode, vp, ARG_VNODE1); if (vp->v_object != NULL) { VM_OBJECT_LOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_UNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); drop: VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Rename files. Source and destination must either both be directories, or * both not be directories. If target is a directory, it must be empty. */ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif int rename(td, uap) struct thread *td; register struct rename_args /* { char *from; char *to; } */ *uap; { return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct renameat_args { int oldfd; char *old; int newfd; char *new; }; #endif int renameat(struct thread *td, struct renameat_args *uap) { return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new, UIO_USERSPACE)); } int kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg) { return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg)); } int kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int tvfslocked; int fvfslocked; int error; bwillwrite(); #ifdef MAC NDINIT_AT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE | AUDITVNODE1, pathseg, old, oldfd, td); #else NDINIT_AT(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE | AUDITVNODE1, pathseg, old, oldfd, td); #endif if ((error = namei(&fromnd)) != 0) return (error); fvfslocked = NDHASGIANT(&fromnd); tvfslocked = 0; #ifdef MAC error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd); VOP_UNLOCK(fromnd.ni_dvp, 0); if (fromnd.ni_dvp != fromnd.ni_vp) VOP_UNLOCK(fromnd.ni_vp, 0); #endif fvp = fromnd.ni_vp; if (error == 0) error = vn_start_write(fvp, &mp, V_WAIT | PCATCH); if (error != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } NDINIT_AT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | MPSAFE | AUDITVNODE2, pathseg, new, newfd, td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); vn_finished_write(mp); goto out1; } tvfslocked = NDHASGIANT(&tond); tdvp = tond.ni_dvp; tvp = tond.ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } } if (fvp == tdvp) { error = EINVAL; goto out; } /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. */ if (fvp == tvp) error = -1; #ifdef MAC else error = mac_vnode_check_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (!error) { VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) { VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE); } if (tvp) { VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); VFS_UNLOCK_GIANT(fvfslocked); VFS_UNLOCK_GIANT(tvfslocked); if (error == -1) return (0); return (error); } /* * Make a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif int mkdir(td, uap) struct thread *td; register struct mkdir_args /* { char *path; int mode; } */ *uap; { return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkdirat_args { int fd; char *path; mode_t mode; }; #endif int mkdirat(struct thread *td, struct mkdirat_args *uap) { return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode) { return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode)); } int kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); restart: bwillwrite(); NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, segflg, path, fd, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); /* * XXX namei called with LOCKPARENT but not LOCKLEAF has * the strange behaviour of leaving the vnode unlocked * if the target is the same vnode as the parent. */ if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif int rmdir(td, uap) struct thread *td; struct rmdir_args /* { char *path; } */ *uap; { return (kern_rmdir(td, uap->path, UIO_USERSPACE)); } int kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg) { return (kern_rmdirat(td, AT_FDCWD, path, pathseg)); } int kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT_AT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #ifdef COMPAT_43 /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(td, uap) struct thread *td; register struct ogetdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio, kuio; struct iovec aiov, kiov; struct dirent *dp, *edp; caddr_t dirbuf; int error, eofflag, readcnt, vfslocked; long loff; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (EINVAL); } aiov.iov_base = uap->buf; aiov.iov_len = uap->count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = uap->count; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error) { VOP_UNLOCK(vp, 0); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } #endif # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif { kuio = auio; kuio.uio_iov = &kiov; kuio.uio_segflg = UIO_SYSSPACE; kiov.iov_len = uap->count; dirbuf = malloc(uap->count, M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = uap->count - kuio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { # if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of * dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen * is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; # else /* * The dp->d_type is the high byte * of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; # endif if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, &auio); } free(dirbuf, M_TEMP); } if (error) { VOP_UNLOCK(vp, 0); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } if (uap->count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); VFS_UNLOCK_GIANT(vfslocked); goto unionread; } VOP_UNLOCK(vp, 0); VFS_UNLOCK_GIANT(vfslocked); error = copyout(&loff, uap->basep, sizeof(long)); fdrop(fp, td); td->td_retval[0] = uap->count - auio.uio_resid; return (error); } #endif /* COMPAT_43 */ /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int getdirentries(td, uap) struct thread *td; register struct getdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { long base; int error; error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base); if (error) return (error); if (uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } int kern_getdirentries(struct thread *td, int fd, char *buf, u_int count, long *basep) { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; int vfslocked; long loff; int error, eofflag; AUDIT_ARG(fd, fd); if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); error = EINVAL; goto fail; } aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = count; /* vn_lock(vp, LK_SHARED | LK_RETRY); */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG(vnode, vp, ARG_VNODE1); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; if (error) { VOP_UNLOCK(vp, 0); VFS_UNLOCK_GIANT(vfslocked); goto fail; } if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); VFS_UNLOCK_GIANT(vfslocked); goto unionread; } VOP_UNLOCK(vp, 0); VFS_UNLOCK_GIANT(vfslocked); *basep = loff; td->td_retval[0] = count - auio.uio_resid; fail: fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getdents_args { int fd; char *buf; size_t count; }; #endif int getdents(td, uap) struct thread *td; register struct getdents_args /* { int fd; char *buf; u_int count; } */ *uap; { struct getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return (getdirentries(td, &ap)); } /* * Set the mode mask for creation of filesystem nodes. */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int umask(td, uap) struct thread *td; struct umask_args /* { int newmask; } */ *uap; { register struct filedesc *fdp; FILEDESC_XLOCK(td->td_proc->p_fd); fdp = td->td_proc->p_fd; td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = uap->newmask & ALLPERMS; FILEDESC_XUNLOCK(td->td_proc->p_fd); return (0); } /* * Void all references to file by ripping underlying filesystem away from * vnode. */ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif int revoke(td, uap) struct thread *td; register struct revoke_args /* { char *path; } */ *uap; { struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR) { error = EINVAL; goto out; } #ifdef MAC error = mac_vnode_check_revoke(td->td_ucred, vp); if (error) goto out; #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error) goto out; if (td->td_ucred->cr_uid != vattr.va_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error) goto out; } if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); out: vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Convert a user file descriptor to a kernel file entry. * A reference on the file entry is held upon returning. */ int getvnode(fdp, fd, fpp) struct filedesc *fdp; int fd; struct file **fpp; { int error; struct file *fp; fp = NULL; if (fdp == NULL) error = EBADF; else { FILEDESC_SLOCK(fdp); if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) error = EBADF; else if (fp->f_vnode == NULL) { fp = NULL; error = EINVAL; } else { fhold(fp); error = 0; } FILEDESC_SUNLOCK(fdp); } *fpp = fp; return (error); } /* * Get an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct lgetfh_args { char *fname; fhandle_t *fhp; }; #endif int lgetfh(td, uap) struct thread *td; register struct lgetfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int getfh(td, uap) struct thread *td; register struct getfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into an * open descriptor. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int fhopen(td, uap) struct thread *td; struct fhopen_args /* { const struct fhandle *u_fhp; int flags; } */ *uap; { struct proc *p = td->td_proc; struct mount *mp; struct vnode *vp; struct fhandle fhp; struct vattr vat; struct vattr *vap = &vat; struct flock lf; struct file *fp; register struct filedesc *fdp = p->p_fd; int fmode, error, type; accmode_t accmode; struct file *nfp; int vfslocked; int indx; error = priv_check(td, PRIV_VFS_FHOPEN); if (error) return (error); fmode = FFLAGS(uap->flags); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(uap->u_fhp, &fhp, sizeof(fhp)); if (error) return(error); /* find the mount point */ - mp = vfs_getvfs(&fhp.fh_fsid); + mp = vfs_busyfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); + vfs_unbusy(mp); if (error) goto out; /* * from now on we have to make sure not * to forget about the vnode * any error that causes an abort must vput(vp) * just set error = err and 'goto bad;'. */ /* * from vn_open */ if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } accmode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; accmode |= VWRITE; } if (fmode & FREAD) accmode |= VREAD; if (fmode & O_APPEND) accmode |= VAPPEND; #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, accmode); if (error) goto bad; #endif if (accmode) { error = VOP_ACCESS(vp, accmode, td->td_ucred, td); if (error) goto bad; } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0); /* XXX */ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); goto out; } VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX */ #ifdef MAC /* * We don't yet have fp->f_cred, so use td->td_ucred, which * should be right. */ error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp); if (error == 0) { #endif VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, td->td_ucred); #ifdef MAC } #endif vn_finished_write(mp); if (error) goto bad; } error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL); if (error) goto bad; if (fmode & FWRITE) vp->v_writecount++; /* * end of vn_open code */ if ((error = falloc(td, &nfp, &indx)) != 0) { if (fmode & FWRITE) vp->v_writecount--; goto bad; } /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; nfp->f_vnode = vp; finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops); if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; VOP_UNLOCK(vp, 0); if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) { /* * The lock request failed. Normally close the * descriptor but handle the case where someone might * have dup()d or close()d it when we weren't looking. */ fdclose(fdp, fp, indx, td); /* * release our private reference */ fdrop(fp, td); goto out; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); atomic_set_int(&fp->f_flag, FHASLOCK); } VOP_UNLOCK(vp, 0); fdrop(fp, td); vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); td->td_retval[0] = indx; return (0); bad: vput(vp); out: - vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int fhstat(td, uap) struct thread *td; register struct fhstat_args /* { struct fhandle *u_fhp; struct stat *sb; } */ *uap; { struct stat sb; fhandle_t fh; struct mount *mp; struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_FHSTAT); if (error) return (error); error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); - if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); - if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) { - vfs_rel(mp); + error = VFS_FHTOVP(mp, &fh.fh_fid, &vp); + vfs_unbusy(mp); + if (error) { VFS_UNLOCK_GIANT(vfslocked); return (error); } error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); vput(vp); - vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int fhstatfs(td, uap) struct thread *td; struct fhstatfs_args /* { struct fhandle *u_fhp; struct statfs *buf; } */ *uap; { struct statfs sf; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); error = kern_fhstatfs(td, fh, &sf); if (error) return (error); return (copyout(&sf, uap->buf, sizeof(sf))); } int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) { struct statfs *sp; struct mount *mp; struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_FHSTATFS); if (error) return (error); - if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) + if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); error = VFS_FHTOVP(mp, &fh.fh_fid, &vp); if (error) { + vfs_unbusy(mp); VFS_UNLOCK_GIANT(vfslocked); - vfs_rel(mp); return (error); } vput(vp); error = prison_canseemount(td->td_ucred, mp); if (error) goto out; #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error == 0) *buf = *sp; out: - vfs_rel(mp); + vfs_unbusy(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } diff --git a/sys/nfsserver/nfs_srvsubs.c b/sys/nfsserver/nfs_srvsubs.c index bc6202149b49..49c6c1608dc1 100644 --- a/sys/nfsserver/nfs_srvsubs.c +++ b/sys/nfsserver/nfs_srvsubs.c @@ -1,1529 +1,1532 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 */ #include __FBSDID("$FreeBSD$"); /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_int32_t nfsrv_nfs_xdrneg1; u_int32_t nfsrv_rpc_call, nfsrv_rpc_vers, nfsrv_rpc_reply, nfsrv_rpc_msgdenied, nfsrv_rpc_autherr, nfsrv_rpc_mismatch, nfsrv_rpc_auth_unix, nfsrv_rpc_msgaccepted; u_int32_t nfsrv_nfs_prog, nfsrv_nfs_true, nfsrv_nfs_false; /* And other global data */ static const nfstype nfsv2_type[9] = { NFNON, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, NFNON, NFCHR, NFNON }; #define vtonfsv2_type(a) txdr_unsigned(nfsv2_type[((int32_t)(a))]) #define vtonfsv3_mode(m) txdr_unsigned((m) & ALLPERMS) int nfsrv_ticks; #ifdef NFS_LEGACYRPC struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; #endif static int nfssvc_offset = SYS_nfssvc; static struct sysent nfssvc_prev_sysent; MAKE_SYSENT(nfssvc); struct mtx nfsd_mtx; /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. */ const int nfsrv_nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, }; /* * and the reverse mapping from generic to Version 2 procedure numbers */ const int nfsrvv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; /* * Maps errno values to nfs error numbers. * Use 0 (which gets converted to NFSERR_IO) as the catch all for ones not * specifically defined in RFC 1094. */ static const u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, 0, 0, 0, NFSERR_NXIO, 0, 0, 0, 0, 0, 0, NFSERR_ACCES, 0, 0, 0, NFSERR_EXIST, 0, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, 0, 0, 0, 0, 0, NFSERR_FBIG, NFSERR_NOSPC, 0, NFSERR_ROFS, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NFSERR_NAMETOL, 0, 0, NFSERR_NOTEMPTY, 0, 0, NFSERR_DQUOT, NFSERR_STALE, 0 }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. */ static const short nfsv3err_null[] = { 0, 0, }; static const short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static const short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_fsstat[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static const short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; /* * Called once to initialize data structures... */ static int nfsrv_modevent(module_t mod, int type, void *data) { static int registered; int error = 0; switch (type) { case MOD_LOAD: mtx_init(&nfsd_mtx, "nfsd_mtx", NULL, MTX_DEF); nfsrv_rpc_vers = txdr_unsigned(RPC_VER2); nfsrv_rpc_call = txdr_unsigned(RPC_CALL); nfsrv_rpc_reply = txdr_unsigned(RPC_REPLY); nfsrv_rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); nfsrv_rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); nfsrv_rpc_mismatch = txdr_unsigned(RPC_MISMATCH); nfsrv_rpc_autherr = txdr_unsigned(RPC_AUTHERR); nfsrv_rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); nfsrv_nfs_prog = txdr_unsigned(NFS_PROG); nfsrv_nfs_true = txdr_unsigned(TRUE); nfsrv_nfs_false = txdr_unsigned(FALSE); nfsrv_nfs_xdrneg1 = txdr_unsigned(-1); nfsrv_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfsrv_ticks < 1) nfsrv_ticks = 1; #ifdef NFS_LEGACYRPC nfsrv_initcache(); /* Init the server request cache */ NFSD_LOCK(); nfsrv_init(0); /* Init server data structures */ callout_init(&nfsrv_callout, CALLOUT_MPSAFE); NFSD_UNLOCK(); nfsrv_timer(0); #else NFSD_LOCK(); nfsrv_init(0); /* Init server data structures */ NFSD_UNLOCK(); #endif error = syscall_register(&nfssvc_offset, &nfssvc_sysent, &nfssvc_prev_sysent); if (error) break; registered = 1; break; case MOD_UNLOAD: if (nfsrv_numnfsd != 0) { error = EBUSY; break; } if (registered) syscall_deregister(&nfssvc_offset, &nfssvc_prev_sysent); callout_drain(&nfsrv_callout); #ifdef NFS_LEGACYRPC nfsrv_destroycache(); /* Free the server request cache */ #endif mtx_destroy(&nfsd_mtx); break; default: error = EOPNOTSUPP; break; } return error; } static moduledata_t nfsserver_mod = { "nfsserver", nfsrv_modevent, NULL, }; DECLARE_MODULE(nfsserver, nfsserver_mod, SI_SUB_VFS, SI_ORDER_ANY); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_VERSION(nfsserver, 1); #ifndef NFS_LEGACYRPC MODULE_DEPEND(nfsserver, krpc, 1, 1, 1); #endif /* * Set up nameidata for a lookup() call and do it. * * If pubflag is set, this call is done for a lookup operation on the * public filehandle. In that case we allow crossing mountpoints and * absolute pathnames. However, the caller is expected to check that * the lookup result is within the public fs, and deny access if * it is not. * * nfs_namei() clears out garbage fields that namei() might leave garbage. * This is mainly ni_vp and ni_dvp when an error occurs, and ni_dvp when no * error occurs but the parent was not requested. * * dirp may be set whether an error is returned or not, and must be * released by the caller. */ int nfs_namei(struct nameidata *ndp, struct nfsrv_descript *nfsd, fhandle_t *fhp, int len, struct nfssvc_sock *slp, struct sockaddr *nam, struct mbuf **mdp, caddr_t *dposp, struct vnode **retdirp, int v3, struct vattr *retdirattrp, int *retdirattr_retp, int pubflag) { int i, rem; struct mbuf *md; char *fromcp, *tocp, *cp; struct iovec aiov; struct uio auio; struct vnode *dp; int error, rdonly, linklen; struct componentname *cnp = &ndp->ni_cnd; int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0; int dvfslocked; int vfslocked; vfslocked = 0; dvfslocked = 0; *retdirp = NULL; cnp->cn_flags |= NOMACCHECK; cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || (!pubflag && *fromcp == '/')) { error = EACCES; goto out; } *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if ((error = nfs_adv(mdp, dposp, len, rem)) != 0) goto out; } /* * Extract and set starting directory. */ error = nfsrv_fhtovp(fhp, FALSE, &dp, &dvfslocked, nfsd, slp, nam, &rdonly, pubflag); if (error) goto out; vfslocked = VFS_LOCK_GIANT(dp->v_mount); if (dp->v_type != VDIR) { vrele(dp); error = ENOTDIR; goto out; } if (rdonly) cnp->cn_flags |= RDONLY; /* * Set return directory. Reference to dp is implicitly transfered * to the returned pointer */ *retdirp = dp; if (v3) { vn_lock(dp, LK_EXCLUSIVE | LK_RETRY); *retdirattr_retp = VOP_GETATTR(dp, retdirattrp, ndp->ni_cnd.cn_cred); VOP_UNLOCK(dp, 0); } if (pubflag) { /* * Oh joy. For WebNFS, handle those pesky '%' escapes, * and the 'native path' indicator. */ cp = uma_zalloc(namei_zone, M_WAITOK); fromcp = cnp->cn_pnbuf; tocp = cp; if ((unsigned char)*fromcp >= WEBNFS_SPECCHAR_START) { switch ((unsigned char)*fromcp) { case WEBNFS_NATIVE_CHAR: /* * 'Native' path for us is the same * as a path according to the NFS spec, * just skip the escape char. */ fromcp++; break; /* * More may be added in the future, range 0x80-0xff */ default: error = EIO; uma_zfree(namei_zone, cp); goto out; } } /* * Translate the '%' escapes, URL-style. */ while (*fromcp != '\0') { if (*fromcp == WEBNFS_ESC_CHAR) { if (fromcp[1] != '\0' && fromcp[2] != '\0') { fromcp++; *tocp++ = HEXSTRTOI(fromcp); fromcp += 2; continue; } else { error = ENOENT; uma_zfree(namei_zone, cp); goto out; } } else *tocp++ = *fromcp++; } *tocp = '\0'; uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } ndp->ni_pathlen = (tocp - cnp->cn_pnbuf) + 1; ndp->ni_segflg = UIO_SYSSPACE; if (pubflag) { ndp->ni_rootdir = rootvnode; ndp->ni_loopcnt = 0; if (cnp->cn_pnbuf[0] == '/') { int tvfslocked; tvfslocked = VFS_LOCK_GIANT(rootvnode->v_mount); VFS_UNLOCK_GIANT(vfslocked); dp = rootvnode; vfslocked = tvfslocked; } } else { cnp->cn_flags |= NOCROSSMOUNT; } /* * Initialize for scan, set ni_startdir and bump ref on dp again * because lookup() will dereference ni_startdir. */ cnp->cn_thread = curthread; VREF(dp); ndp->ni_startdir = dp; if (!lockleaf) cnp->cn_flags |= LOCKLEAF; for (;;) { cnp->cn_nameptr = cnp->cn_pnbuf; /* * Call lookup() to do the real work. If an error occurs, * ndp->ni_vp and ni_dvp are left uninitialized or NULL and * we do not have to dereference anything before returning. * In either case ni_startdir will be dereferenced and NULLed * out. */ if (vfslocked) ndp->ni_cnd.cn_flags |= GIANTHELD; error = lookup(ndp); vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0; ndp->ni_cnd.cn_flags &= ~GIANTHELD; if (error) break; /* * Check for encountering a symbolic link. Trivial * termination occurs if no symlink encountered. * Note: zfree is safe because error is 0, so we will * not zfree it again when we break. */ if ((cnp->cn_flags & ISSYMLINK) == 0) { if (cnp->cn_flags & (SAVENAME | SAVESTART)) cnp->cn_flags |= HASBUF; else uma_zfree(namei_zone, cnp->cn_pnbuf); if (ndp->ni_vp && !lockleaf) VOP_UNLOCK(ndp->ni_vp, 0); break; } /* * Validate symlink */ if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) VOP_UNLOCK(ndp->ni_dvp, 0); if (!pubflag) { error = EINVAL; goto badlink2; } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; goto badlink2; } if (ndp->ni_pathlen > 1) cp = uma_zalloc(namei_zone, M_WAITOK); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = NULL; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { badlink1: if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); badlink2: vput(ndp->ni_vp); vrele(ndp->ni_dvp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { error = ENOENT; goto badlink1; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { error = ENAMETOOLONG; goto badlink1; } /* * Adjust or replace path */ if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; /* * Cleanup refs for next loop and check if root directory * should replace current directory. Normally ni_dvp * becomes the new base directory and is cleaned up when * we loop. Explicitly null pointers after invalidation * to clarify operation. */ vput(ndp->ni_vp); ndp->ni_vp = NULL; if (cnp->cn_pnbuf[0] == '/') { vrele(ndp->ni_dvp); ndp->ni_dvp = ndp->ni_rootdir; VREF(ndp->ni_dvp); } ndp->ni_startdir = ndp->ni_dvp; ndp->ni_dvp = NULL; } if (!lockleaf) cnp->cn_flags &= ~LOCKLEAF; if (cnp->cn_flags & GIANTHELD) { mtx_unlock(&Giant); cnp->cn_flags &= ~GIANTHELD; } /* * nfs_namei() guarentees that fields will not contain garbage * whether an error occurs or not. This allows the caller to track * cleanup state trivially. */ out: if (error) { uma_zfree(namei_zone, cnp->cn_pnbuf); ndp->ni_vp = NULL; ndp->ni_dvp = NULL; ndp->ni_startdir = NULL; cnp->cn_flags &= ~HASBUF; VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; } else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) { ndp->ni_dvp = NULL; } /* * This differs from normal namei() in that even on failure we may * return with Giant held due to the dirp return. Make sure we only * have not recursed however. The calling code only expects to drop * one acquire. */ if (vfslocked || dvfslocked) ndp->ni_cnd.cn_flags |= GIANTHELD; if (vfslocked && dvfslocked) VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(struct mbuf *mp, int len, int nul) { struct mbuf *m; int count, i; char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == NULL) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } if (m->m_next != NULL) { m_freem(m->m_next); m->m_next = NULL; } break; } count -= m->m_len; } } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... */ void nfsm_srvwcc(struct nfsrv_descript *nfsd, int before_ret, struct vattr *before_vap, int after_ret, struct vattr *after_vap, struct mbuf **mbp, char **bposp) { struct mbuf *mb = *mbp; char *bpos = *bposp; u_int32_t *tl; if (before_ret) { tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); *tl = nfsrv_nfs_false; } else { tl = nfsm_build(u_int32_t *, 7 * NFSX_UNSIGNED); *tl++ = nfsrv_nfs_true; txdr_hyper(before_vap->va_size, tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(struct nfsrv_descript *nfsd, int after_ret, struct vattr *after_vap, struct mbuf **mbp, char **bposp) { struct mbuf *mb = *mbp; char *bpos = *bposp; u_int32_t *tl; struct nfs_fattr *fp; if (after_ret) { tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); *tl = nfsrv_nfs_false; } else { tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfsrv_nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(struct nfsrv_descript *nfsd, struct vattr *vap, struct nfs_fattr *fp) { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(vap->va_size, &fp->fa3_size); txdr_hyper(vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(umajor(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(uminor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhandle_t *fhp, int lockflag, struct vnode **vpp, int *vfslockedp, struct nfsrv_descript *nfsd, struct nfssvc_sock *slp, struct sockaddr *nam, int *rdonlyp, int pubflag) { struct mount *mp; int i; struct ucred *cred, *credanon; int error, exflags; #ifdef MNT_EXNORESPORT /* XXX needs mountd and /etc/exports help yet */ struct sockaddr_int *saddr; #endif int credflavor; int vfslocked; int numsecflavors, *secflavors; int authsys; int v3 = nfsd->nd_flag & ND_NFSV3; int mountreq; *vfslockedp = 0; *vpp = NULL; if (nfs_ispublicfh(fhp)) { if (!pubflag || !nfs_pub.np_valid) return (ESTALE); fhp = &nfs_pub.np_handle; } - mp = vfs_getvfs(&fhp->fh_fsid); + mp = vfs_busyfs(&fhp->fh_fsid); if (!mp) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); error = VFS_CHECKEXP(mp, nam, &exflags, &credanon, &numsecflavors, &secflavors); - if (error) + if (error) { + vfs_unbusy(mp); goto out; + } if (numsecflavors == 0) { /* * This can happen if the system is running with an * old mountd that doesn't pass in a secflavor list. */ numsecflavors = 1; authsys = RPCAUTH_UNIX; secflavors = &authsys; } credflavor = nfsd->nd_credflavor; for (i = 0; i < numsecflavors; i++) { if (secflavors[i] == credflavor) break; } if (i == numsecflavors) { /* * RFC 2623 section 2.3.2 - allow certain procedures * used at NFS client mount time even if they have * weak authentication. */ mountreq = FALSE; if (v3) { if (nfsd->nd_procnum == NFSPROC_FSINFO || nfsd->nd_procnum == NFSPROC_GETATTR) mountreq = TRUE; } else { if (nfsd->nd_procnum == NFSPROC_FSSTAT || nfsd->nd_procnum == NFSPROC_GETATTR) mountreq = TRUE; } if (!mountreq) { error = NFSERR_AUTHERR | AUTH_TOOWEAK; + vfs_unbusy(mp); goto out; } } error = VFS_FHTOVP(mp, &fhp->fh_fid, vpp); + vfs_unbusy(mp); if (error) goto out; #ifdef MNT_EXNORESPORT if (!(exflags & (MNT_EXNORESPORT|MNT_EXPUBLIC))) { saddr = (struct sockaddr_in *)nam; if ((saddr->sin_family == AF_INET || saddr->sin_family == AF_INET6) && /* same code for INET and INET6: sin*_port at same offet */ ntohs(saddr->sin_port) >= IPPORT_RESERVED) { vput(*vpp); *vpp = NULL; error = NFSERR_AUTHERR | AUTH_TOOWEAK; } } #endif /* * Check/setup credentials. */ cred = nfsd->nd_cr; if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; if (!lockflag) VOP_UNLOCK(*vpp, 0); out: - vfs_rel(mp); if (error) { VFS_UNLOCK_GIANT(vfslocked); } else *vfslockedp = vfslocked; return (error); } /* * WebNFS: check if a filehandle is a public filehandle. For v3, this * means a length of 0, for v2 it means all zeroes. nfsm_srvmtofh has * transformed this to all zeroes in both cases, so check for it. */ int nfs_ispublicfh(fhandle_t *fhp) { char *cp = (char *)fhp; int i; NFSD_LOCK_DONTCARE(); for (i = 0; i < NFSX_V3FH; i++) if (*cp++ != 0) return (FALSE); return (TRUE); } #ifdef NFS_LEGACYRPC /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(int family, union nethostaddr *haddr, struct sockaddr *nam) { struct sockaddr_in *inetaddr; NFSD_LOCK_DONTCARE(); switch (family) { case AF_INET: inetaddr = (struct sockaddr_in *)nam; if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef INET6 case AF_INET6: { register struct sockaddr_in6 *inet6addr1, *inet6addr2; inet6addr1 = (struct sockaddr_in6 *)nam; inet6addr2 = (struct sockaddr_in6 *)haddr->had_nam; /* XXX - should test sin6_scope_id ? */ if (inet6addr1->sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&inet6addr1->sin6_addr, &inet6addr2->sin6_addr)) return (1); break; } #endif default: break; }; return (0); } #endif /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(struct nfsrv_descript *nd, int err) { const short *defaulterrp, *errp; int e; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } e = 0; if (err <= ELAST) e = nfsrv_v2errmap[err - 1]; if (e != 0) return (e); return (NFSERR_IO); } /* * Sort the group list in increasing numerical order. * (Insertion sort by Chris Torek, who was grossed out by the bubble sort * that used to be here.) */ void nfsrvw_sort(gid_t *list, int num) { int i, j; gid_t v; /* Insertion sort. */ for (i = 1; i < num; i++) { v = list[i]; /* find correct slot for value v, moving others up */ for (j = i; --j >= 0 && v < list[j];) list[j + 1] = list[j]; list[j + 1] = v; } } /* * Helper functions for macros. */ void nfsm_srvfhtom_xx(fhandle_t *f, int v3, struct mbuf **mb, caddr_t *bpos) { u_int32_t *tl; if (v3) { tl = nfsm_build_xx(NFSX_UNSIGNED + NFSX_V3FH, mb, bpos); *tl++ = txdr_unsigned(NFSX_V3FH); bcopy(f, tl, NFSX_V3FH); } else { tl = nfsm_build_xx(NFSX_V2FH, mb, bpos); bcopy(f, tl, NFSX_V2FH); } } void nfsm_srvpostop_fh_xx(fhandle_t *f, struct mbuf **mb, caddr_t *bpos) { u_int32_t *tl; tl = nfsm_build_xx(2 * NFSX_UNSIGNED + NFSX_V3FH, mb, bpos); *tl++ = nfsrv_nfs_true; *tl++ = txdr_unsigned(NFSX_V3FH); bcopy(f, tl, NFSX_V3FH); } int nfsm_srvstrsiz_xx(int *s, int m, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; *s = fxdr_unsigned(int32_t, *tl); if (*s > m || *s <= 0) return EBADRPC; return 0; } int nfsm_srvnamesiz_xx(int *s, int m, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; NFSD_LOCK_DONTCARE(); tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; *s = fxdr_unsigned(int32_t, *tl); if (*s > m) return NFSERR_NAMETOL; if (*s <= 0) return EBADRPC; return 0; } int nfsm_srvnamesiz0_xx(int *s, int m, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; *s = fxdr_unsigned(int32_t, *tl); if (*s > m) return NFSERR_NAMETOL; if (*s < 0) return EBADRPC; return 0; } void nfsm_clget_xx(u_int32_t **tl, struct mbuf *mb, struct mbuf **mp, char **bp, char **be, caddr_t bpos) { struct mbuf *nmp; NFSD_UNLOCK_ASSERT(); if (*bp >= *be) { if (*mp == mb) (*mp)->m_len += *bp - bpos; MGET(nmp, M_WAIT, MT_DATA); MCLGET(nmp, M_WAIT); nmp->m_len = NFSMSIZ(nmp); (*mp)->m_next = nmp; *mp = nmp; *bp = mtod(*mp, caddr_t); *be = *bp + (*mp)->m_len; } *tl = (u_int32_t *)*bp; } int nfsm_srvmtofh_xx(fhandle_t *f, int v3, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; int fhlen; if (v3) { tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; fhlen = fxdr_unsigned(int, *tl); if (fhlen != 0 && fhlen != NFSX_V3FH) return EBADRPC; } else { fhlen = NFSX_V2FH; } if (fhlen != 0) { tl = nfsm_dissect_xx_nonblock(fhlen, md, dpos); if (tl == NULL) return EBADRPC; bcopy((caddr_t)tl, (caddr_t)(f), fhlen); } else { bzero((caddr_t)(f), NFSX_V3FH); } return 0; } int nfsm_srvsattr_xx(struct vattr *a, struct mbuf **md, caddr_t *dpos) { u_int32_t *tl; int toclient = 0; tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; if (*tl == nfsrv_nfs_true) { tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; (a)->va_mode = nfstov_mode(*tl); } tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; if (*tl == nfsrv_nfs_true) { tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; (a)->va_uid = fxdr_unsigned(uid_t, *tl); } tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; if (*tl == nfsrv_nfs_true) { tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; (a)->va_gid = fxdr_unsigned(gid_t, *tl); } tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; if (*tl == nfsrv_nfs_true) { tl = nfsm_dissect_xx_nonblock(2 * NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; (a)->va_size = fxdr_hyper(tl); } tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; switch (fxdr_unsigned(int, *tl)) { case NFSV3SATTRTIME_TOCLIENT: tl = nfsm_dissect_xx_nonblock(2 * NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; fxdr_nfsv3time(tl, &(a)->va_atime); toclient = 1; break; case NFSV3SATTRTIME_TOSERVER: getnanotime(&(a)->va_atime); a->va_vaflags |= VA_UTIMES_NULL; break; } tl = nfsm_dissect_xx_nonblock(NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; switch (fxdr_unsigned(int, *tl)) { case NFSV3SATTRTIME_TOCLIENT: tl = nfsm_dissect_xx_nonblock(2 * NFSX_UNSIGNED, md, dpos); if (tl == NULL) return EBADRPC; fxdr_nfsv3time(tl, &(a)->va_mtime); a->va_vaflags &= ~VA_UTIMES_NULL; break; case NFSV3SATTRTIME_TOSERVER: getnanotime(&(a)->va_mtime); if (toclient == 0) a->va_vaflags |= VA_UTIMES_NULL; break; } return 0; } diff --git a/sys/sys/mount.h b/sys/sys/mount.h index 32097532a4c1..51cff5d37e2f 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -1,779 +1,780 @@ /*- * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mount.h 8.21 (Berkeley) 5/20/95 * $FreeBSD$ */ #ifndef _SYS_MOUNT_H_ #define _SYS_MOUNT_H_ #include #include #ifdef _KERNEL #include #include #include #endif /* * NOTE: When changing statfs structure, mount structure, MNT_* flags or * MNTK_* flags also update DDB show mount command in vfs_subr.c. */ typedef struct fsid { int32_t val[2]; } fsid_t; /* filesystem id type */ /* * File identifier. * These are unique per filesystem on a single machine. */ #define MAXFIDSZ 16 struct fid { u_short fid_len; /* length of data in bytes */ u_short fid_reserved; /* force longword alignment */ char fid_data[MAXFIDSZ]; /* data (variable length) */ }; /* * filesystem statistics */ #define MFSNAMELEN 16 /* length of type name including null */ #define MNAMELEN 88 /* size of on/from name bufs */ #define STATFS_VERSION 0x20030518 /* current version number */ struct statfs { uint32_t f_version; /* structure version number */ uint32_t f_type; /* type of filesystem */ uint64_t f_flags; /* copy of mount exported flags */ uint64_t f_bsize; /* filesystem fragment size */ uint64_t f_iosize; /* optimal transfer block size */ uint64_t f_blocks; /* total data blocks in filesystem */ uint64_t f_bfree; /* free blocks in filesystem */ int64_t f_bavail; /* free blocks avail to non-superuser */ uint64_t f_files; /* total file nodes in filesystem */ int64_t f_ffree; /* free nodes avail to non-superuser */ uint64_t f_syncwrites; /* count of sync writes since mount */ uint64_t f_asyncwrites; /* count of async writes since mount */ uint64_t f_syncreads; /* count of sync reads since mount */ uint64_t f_asyncreads; /* count of async reads since mount */ uint64_t f_spare[10]; /* unused spare */ uint32_t f_namemax; /* maximum filename length */ uid_t f_owner; /* user that mounted the filesystem */ fsid_t f_fsid; /* filesystem id */ char f_charspare[80]; /* spare string space */ char f_fstypename[MFSNAMELEN]; /* filesystem type name */ char f_mntfromname[MNAMELEN]; /* mounted filesystem */ char f_mntonname[MNAMELEN]; /* directory on which mounted */ }; #ifdef _KERNEL #define OMFSNAMELEN 16 /* length of fs type name, including null */ #define OMNAMELEN (88 - 2 * sizeof(long)) /* size of on/from name bufs */ /* XXX getfsstat.2 is out of date with write and read counter changes here. */ /* XXX statfs.2 is out of date with read counter changes here. */ struct ostatfs { long f_spare2; /* placeholder */ long f_bsize; /* fundamental filesystem block size */ long f_iosize; /* optimal transfer block size */ long f_blocks; /* total data blocks in filesystem */ long f_bfree; /* free blocks in fs */ long f_bavail; /* free blocks avail to non-superuser */ long f_files; /* total file nodes in filesystem */ long f_ffree; /* free file nodes in fs */ fsid_t f_fsid; /* filesystem id */ uid_t f_owner; /* user that mounted the filesystem */ int f_type; /* type of filesystem */ int f_flags; /* copy of mount exported flags */ long f_syncwrites; /* count of sync writes since mount */ long f_asyncwrites; /* count of async writes since mount */ char f_fstypename[OMFSNAMELEN]; /* fs type name */ char f_mntonname[OMNAMELEN]; /* directory on which mounted */ long f_syncreads; /* count of sync reads since mount */ long f_asyncreads; /* count of async reads since mount */ short f_spares1; /* unused spare */ char f_mntfromname[OMNAMELEN];/* mounted filesystem */ short f_spares2; /* unused spare */ /* * XXX on machines where longs are aligned to 8-byte boundaries, there * is an unnamed int32_t here. This spare was after the apparent end * of the struct until we bit off the read counters from f_mntonname. */ long f_spare[2]; /* unused spare */ }; #define MMAXOPTIONLEN 65536 /* maximum length of a mount option */ TAILQ_HEAD(vnodelst, vnode); struct vfsoptlist; struct vfsopt; /* * Structure per mounted filesystem. Each mounted filesystem has an * array of operations and an instance record. The filesystems are * put on a doubly linked list. * * Lock reference: * m - mountlist_mtx * i - interlock * * Unmarked fields are considered stable as long as a ref is held. * */ struct mount { struct mtx mnt_mtx; /* mount structure interlock */ int mnt_gen; /* struct mount generation */ #define mnt_startzero mnt_list TAILQ_ENTRY(mount) mnt_list; /* (m) mount list */ struct vfsops *mnt_op; /* operations on fs */ struct vfsconf *mnt_vfc; /* configuration info */ struct vnode *mnt_vnodecovered; /* vnode we mounted on */ struct vnode *mnt_syncer; /* syncer vnode */ int mnt_ref; /* (i) Reference count */ struct vnodelst mnt_nvnodelist; /* (i) list of vnodes */ int mnt_nvnodelistsize; /* (i) # of vnodes */ int mnt_writeopcount; /* (i) write syscalls pending */ int mnt_kern_flag; /* (i) kernel only flags */ u_int mnt_flag; /* (i) flags shared with user */ u_int mnt_noasync; /* (i) # noasync overrides */ struct vfsoptlist *mnt_opt; /* current mount options */ struct vfsoptlist *mnt_optnew; /* new options passed to fs */ int mnt_maxsymlinklen; /* max size of short symlink */ struct statfs mnt_stat; /* cache of filesystem stats */ struct ucred *mnt_cred; /* credentials of mounter */ void * mnt_data; /* private data */ time_t mnt_time; /* last time written*/ int mnt_iosize_max; /* max size for clusters, etc */ struct netexport *mnt_export; /* export list */ struct label *mnt_label; /* MAC label for the fs */ u_int mnt_hashseed; /* Random seed for vfs_hash */ int mnt_lockref; /* (i) Lock reference count */ int mnt_secondary_writes; /* (i) # of secondary writes */ int mnt_secondary_accwrites;/* (i) secondary wr. starts */ struct thread *mnt_susp_owner; /* (i) thread owning suspension */ #define mnt_endzero mnt_gjprovider char *mnt_gjprovider; /* gjournal provider name */ struct lock mnt_explock; /* vfs_export walkers lock */ }; struct vnode *__mnt_vnode_next(struct vnode **mvp, struct mount *mp); struct vnode *__mnt_vnode_first(struct vnode **mvp, struct mount *mp); void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp); #define MNT_VNODE_FOREACH(vp, mp, mvp) \ for (vp = __mnt_vnode_first(&(mvp), (mp)); \ (vp) != NULL; vp = __mnt_vnode_next(&(mvp), (mp))) #define MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp) \ __mnt_vnode_markerfree(&(mvp), (mp)) #define MNT_VNODE_FOREACH_ABORT(mp, mvp) \ do { \ MNT_ILOCK(mp); \ MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); \ MNT_IUNLOCK(mp); \ } while (0) #define MNT_ILOCK(mp) mtx_lock(&(mp)->mnt_mtx) #define MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx) #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) #define MNT_MTX(mp) (&(mp)->mnt_mtx) #define MNT_REF(mp) (mp)->mnt_ref++ #define MNT_REL(mp) do { \ (mp)->mnt_ref--; \ if ((mp)->mnt_ref == 0) \ wakeup((mp)); \ } while (0) #endif /* _KERNEL */ /* * User specifiable flags. */ #define MNT_RDONLY 0x00000001 /* read only filesystem */ #define MNT_SYNCHRONOUS 0x00000002 /* filesystem written synchronously */ #define MNT_NOEXEC 0x00000004 /* can't exec from filesystem */ #define MNT_NOSUID 0x00000008 /* don't honor setuid bits on fs */ #define MNT_UNION 0x00000020 /* union with underlying filesystem */ #define MNT_ASYNC 0x00000040 /* filesystem written asynchronously */ #define MNT_SUIDDIR 0x00100000 /* special handling of SUID on dirs */ #define MNT_SOFTDEP 0x00200000 /* soft updates being done */ #define MNT_NOSYMFOLLOW 0x00400000 /* do not follow symlinks */ #define MNT_GJOURNAL 0x02000000 /* GEOM journal support enabled */ #define MNT_MULTILABEL 0x04000000 /* MAC support for individual objects */ #define MNT_ACLS 0x08000000 /* ACL support enabled */ #define MNT_NOATIME 0x10000000 /* disable update of file access time */ #define MNT_NOCLUSTERR 0x40000000 /* disable cluster read */ #define MNT_NOCLUSTERW 0x80000000 /* disable cluster write */ /* * NFS export related mount flags. */ #define MNT_EXRDONLY 0x00000080 /* exported read only */ #define MNT_EXPORTED 0x00000100 /* filesystem is exported */ #define MNT_DEFEXPORTED 0x00000200 /* exported to the world */ #define MNT_EXPORTANON 0x00000400 /* use anon uid mapping for everyone */ #define MNT_EXKERB 0x00000800 /* exported with Kerberos uid mapping */ #define MNT_EXPUBLIC 0x20000000 /* public export (WebNFS) */ /* * Flags set by internal operations, * but visible to the user. * XXX some of these are not quite right.. (I've never seen the root flag set) */ #define MNT_LOCAL 0x00001000 /* filesystem is stored locally */ #define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */ #define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */ #define MNT_USER 0x00008000 /* mounted by a user */ #define MNT_IGNORE 0x00800000 /* do not show entry in df */ /* * Mask of flags that are visible to statfs(). * XXX I think that this could now become (~(MNT_CMDFLAGS)) * but the 'mount' program may need changing to handle this. */ #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ MNT_NOSUID | MNT_UNION | \ MNT_ASYNC | MNT_EXRDONLY | MNT_EXPORTED | \ MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ MNT_LOCAL | MNT_USER | MNT_QUOTA | \ MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | \ MNT_NOATIME | \ MNT_NOSYMFOLLOW | MNT_IGNORE | \ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR | \ MNT_ACLS | MNT_USER) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. * XXX: These are not STATES and really should be somewhere else. * XXX: MNT_BYFSID collides with MNT_ACLS, but because MNT_ACLS is only used for * mount(2) and MNT_BYFSID is only used for unmount(2) it's harmless. */ #define MNT_UPDATE 0x00010000 /* not a real mount, just an update */ #define MNT_DELEXPORT 0x00020000 /* delete export host lists */ #define MNT_RELOAD 0x00040000 /* reload filesystem data */ #define MNT_FORCE 0x00080000 /* force unmount or readonly change */ #define MNT_SNAPSHOT 0x01000000 /* snapshot the filesystem */ #define MNT_BYFSID 0x08000000 /* specify filesystem by ID. */ #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_BYFSID) /* * Still available. */ #define MNT_SPARE_0x00000010 0x00000010 /* * Internal filesystem control flags stored in mnt_kern_flag. * * MNTK_UNMOUNT locks the mount entry so that name lookup cannot proceed * past the mount point. This keeps the subtree stable during mounts * and unmounts. * * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while * dounmount() is still waiting to lock the mountpoint. This allows * the filesystem to cancel operations that might otherwise deadlock * with the unmount attempt (used by NFS). * * MNTK_NOINSMNTQ is strict subset of MNTK_UNMOUNT. They are separated * to allow for failed unmount attempt to restore the syncer vnode for * the mount. */ #define MNTK_UNMOUNTF 0x00000001 /* forced unmount in progress */ #define MNTK_ASYNC 0x00000002 /* filtered async flag */ #define MNTK_SOFTDEP 0x00000004 /* async disabled by softdep */ #define MNTK_NOINSMNTQ 0x00000008 /* insmntque is not allowed */ #define MNTK_DRAINING 0x00000010 /* lock draining is happening */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ #define MNTK_SUSPEND2 0x04000000 /* block secondary writes */ #define MNTK_SUSPENDED 0x10000000 /* write operations are suspended */ #define MNTK_MPSAFE 0x20000000 /* Filesystem is MPSAFE. */ #define MNTK_NOKNOTE 0x80000000 /* Don't send KNOTEs from VOP hooks */ #define MNTK_LOOKUP_SHARED 0x40000000 /* FS supports shared lock lookups */ /* * Sysctl CTL_VFS definitions. * * Second level identifier specifies which filesystem. Second level * identifier VFS_VFSCONF returns information about all filesystems. * Second level identifier VFS_GENERIC is non-terminal. */ #define VFS_VFSCONF 0 /* get configured filesystems */ #define VFS_GENERIC 0 /* generic filesystem information */ /* * Third level identifiers for VFS_GENERIC are given below; third * level identifiers for specific filesystems are given in their * mount specific header files. */ #define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */ #define VFS_CONF 2 /* struct: vfsconf for filesystem given as next argument */ /* * Flags for various system call interfaces. * * waitfor flags to vfs_sync() and getfsstat() */ #define MNT_WAIT 1 /* synchronously wait for I/O to complete */ #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ #define MNT_LAZY 3 /* push data not written by filesystem syncer */ #define MNT_SUSPEND 4 /* Suspend file system after sync */ /* * Generic file handle */ struct fhandle { fsid_t fh_fsid; /* Filesystem id of mount point */ struct fid fh_fid; /* Filesys specific id */ }; typedef struct fhandle fhandle_t; /* * Old export arguments without security flavor list */ struct oexport_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ }; /* * Export arguments for local filesystem mount calls. */ #define MAXSECFLAVORS 5 struct export_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ int ex_numsecflavors; /* security flavor count */ int ex_secflavors[MAXSECFLAVORS]; /* list of security flavors */ }; /* * Structure holding information for a publicly exported filesystem * (WebNFS). Currently the specs allow just for one such filesystem. */ struct nfs_public { int np_valid; /* Do we hold valid information */ fhandle_t np_handle; /* Filehandle for pub fs (internal) */ struct mount *np_mount; /* Mountpoint of exported fs */ char *np_index; /* Index file */ }; /* * Filesystem configuration information. One of these exists for each * type of filesystem supported by the kernel. These are searched at * mount time to identify the requested filesystem. * * XXX: Never change the first two arguments! */ struct vfsconf { u_int vfc_version; /* ABI version number */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ struct vfsops *vfc_vfsops; /* filesystem operations vector */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsoptdecl *vfc_opts; /* mount options */ TAILQ_ENTRY(vfsconf) vfc_list; /* list of vfscons */ }; /* Userland version of the struct vfsconf. */ struct xvfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsconf *vfc_next; /* next in list */ }; #ifndef BURN_BRIDGES struct ovfsconf { void *vfc_vfsops; char vfc_name[32]; int vfc_index; int vfc_refcount; int vfc_flags; }; #endif /* * NB: these flags refer to IMPLEMENTATION properties, not properties of * any actual mounts; i.e., it does not make sense to change the flags. */ #define VFCF_STATIC 0x00010000 /* statically compiled into kernel */ #define VFCF_NETWORK 0x00020000 /* may get data over the network */ #define VFCF_READONLY 0x00040000 /* writes are not implemented */ #define VFCF_SYNTHETIC 0x00080000 /* data does not represent real files */ #define VFCF_LOOPBACK 0x00100000 /* aliases some other mounted FS */ #define VFCF_UNICODE 0x00200000 /* stores file names as Unicode */ #define VFCF_JAIL 0x00400000 /* can be mounted from within a jail */ #define VFCF_DELEGADMIN 0x00800000 /* supports delegated administration */ typedef uint32_t fsctlop_t; struct vfsidctl { int vc_vers; /* should be VFSIDCTL_VERS1 (below) */ fsid_t vc_fsid; /* fsid to operate on */ char vc_fstypename[MFSNAMELEN]; /* type of fs 'nfs' or '*' */ fsctlop_t vc_op; /* operation VFS_CTL_* (below) */ void *vc_ptr; /* pointer to data structure */ size_t vc_len; /* sizeof said structure */ u_int32_t vc_spare[12]; /* spare (must be zero) */ }; /* vfsidctl API version. */ #define VFS_CTL_VERS1 0x01 /* * New style VFS sysctls, do not reuse/conflict with the namespace for * private sysctls. * All "global" sysctl ops have the 33rd bit set: * 0x...1.... * Private sysctl ops should have the 33rd bit unset. */ #define VFS_CTL_QUERY 0x00010001 /* anything wrong? (vfsquery) */ #define VFS_CTL_TIMEO 0x00010002 /* set timeout for vfs notification */ #define VFS_CTL_NOLOCKS 0x00010003 /* disable file locking */ struct vfsquery { u_int32_t vq_flags; u_int32_t vq_spare[31]; }; /* vfsquery flags */ #define VQ_NOTRESP 0x0001 /* server down */ #define VQ_NEEDAUTH 0x0002 /* server bad auth */ #define VQ_LOWDISK 0x0004 /* we're low on space */ #define VQ_MOUNT 0x0008 /* new filesystem arrived */ #define VQ_UNMOUNT 0x0010 /* filesystem has left */ #define VQ_DEAD 0x0020 /* filesystem is dead, needs force unmount */ #define VQ_ASSIST 0x0040 /* filesystem needs assistance from external program */ #define VQ_NOTRESPLOCK 0x0080 /* server lockd down */ #define VQ_FLAG0100 0x0100 /* placeholder */ #define VQ_FLAG0200 0x0200 /* placeholder */ #define VQ_FLAG0400 0x0400 /* placeholder */ #define VQ_FLAG0800 0x0800 /* placeholder */ #define VQ_FLAG1000 0x1000 /* placeholder */ #define VQ_FLAG2000 0x2000 /* placeholder */ #define VQ_FLAG4000 0x4000 /* placeholder */ #define VQ_FLAG8000 0x8000 /* placeholder */ #ifdef _KERNEL /* Point a sysctl request at a vfsidctl's data. */ #define VCTLTOREQ(vc, req) \ do { \ (req)->newptr = (vc)->vc_ptr; \ (req)->newlen = (vc)->vc_len; \ (req)->newidx = 0; \ } while (0) #endif struct iovec; struct uio; #ifdef _KERNEL /* * vfs_busy specific flags and mask. */ #define MBF_NOWAIT 0x01 #define MBF_MNTLSTLOCK 0x02 #define MBF_MASK (MBF_NOWAIT | MBF_MNTLSTLOCK) #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_MOUNT); #endif extern int maxvfsconf; /* highest defined filesystem type */ extern int nfs_mount_type; /* vfc_typenum for nfs, or -1 */ TAILQ_HEAD(vfsconfhead, vfsconf); extern struct vfsconfhead vfsconf; /* * Operations supported on mounted filesystem. */ struct mount_args; struct nameidata; struct sysctl_req; struct mntarg; typedef int vfs_cmount_t(struct mntarg *ma, void *data, int flags, struct thread *td); typedef int vfs_unmount_t(struct mount *mp, int mntflags, struct thread *td); typedef int vfs_root_t(struct mount *mp, int flags, struct vnode **vpp, struct thread *td); typedef int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid, void *arg, struct thread *td); typedef int vfs_statfs_t(struct mount *mp, struct statfs *sbp, struct thread *td); typedef int vfs_sync_t(struct mount *mp, int waitfor, struct thread *td); typedef int vfs_vget_t(struct mount *mp, ino_t ino, int flags, struct vnode **vpp); typedef int vfs_fhtovp_t(struct mount *mp, struct fid *fhp, struct vnode **vpp); typedef int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); typedef int vfs_init_t(struct vfsconf *); typedef int vfs_uninit_t(struct vfsconf *); typedef int vfs_extattrctl_t(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname, struct thread *td); typedef int vfs_mount_t(struct mount *mp, struct thread *td); typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op, struct sysctl_req *req); typedef void vfs_susp_clean_t(struct mount *mp); struct vfsops { vfs_mount_t *vfs_mount; vfs_cmount_t *vfs_cmount; vfs_unmount_t *vfs_unmount; vfs_root_t *vfs_root; vfs_quotactl_t *vfs_quotactl; vfs_statfs_t *vfs_statfs; vfs_sync_t *vfs_sync; vfs_vget_t *vfs_vget; vfs_fhtovp_t *vfs_fhtovp; vfs_checkexp_t *vfs_checkexp; vfs_init_t *vfs_init; vfs_uninit_t *vfs_uninit; vfs_extattrctl_t *vfs_extattrctl; vfs_sysctl_t *vfs_sysctl; vfs_susp_clean_t *vfs_susp_clean; }; vfs_statfs_t __vfs_statfs; #define VFS_MOUNT(MP, P) (*(MP)->mnt_op->vfs_mount)(MP, P) #define VFS_UNMOUNT(MP, FORCE, P) (*(MP)->mnt_op->vfs_unmount)(MP, FORCE, P) #define VFS_ROOT(MP, FLAGS, VPP, P) \ (*(MP)->mnt_op->vfs_root)(MP, FLAGS, VPP, P) #define VFS_QUOTACTL(MP,C,U,A,P) (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A, P) #define VFS_STATFS(MP, SBP, P) __vfs_statfs((MP), (SBP), (P)) #define VFS_SYNC(MP, WAIT, P) (*(MP)->mnt_op->vfs_sync)(MP, WAIT, P) #define VFS_VGET(MP, INO, FLAGS, VPP) \ (*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP) #define VFS_FHTOVP(MP, FIDP, VPP) \ (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, VPP) #define VFS_CHECKEXP(MP, NAM, EXFLG, CRED, NUMSEC, SEC) \ (*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED, NUMSEC, SEC) #define VFS_EXTATTRCTL(MP, C, FN, NS, N, P) \ (*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N, P) #define VFS_SYSCTL(MP, OP, REQ) \ (*(MP)->mnt_op->vfs_sysctl)(MP, OP, REQ) #define VFS_SUSP_CLEAN(MP) \ ({if (*(MP)->mnt_op->vfs_susp_clean != NULL) \ (*(MP)->mnt_op->vfs_susp_clean)(MP); }) extern int mpsafe_vfs; #define VFS_NEEDSGIANT_(MP) \ (!mpsafe_vfs || ((MP) != NULL && ((MP)->mnt_kern_flag & MNTK_MPSAFE) == 0)) #define VFS_NEEDSGIANT(MP) __extension__ \ ({ \ struct mount *_mp; \ _mp = (MP); \ VFS_NEEDSGIANT_(_mp); \ }) #define VFS_LOCK_GIANT(MP) __extension__ \ ({ \ int _locked; \ struct mount *_mp; \ _mp = (MP); \ if (VFS_NEEDSGIANT_(_mp)) { \ mtx_lock(&Giant); \ _locked = 1; \ } else \ _locked = 0; \ _locked; \ }) #define VFS_UNLOCK_GIANT(locked) if ((locked)) mtx_unlock(&Giant); #define VFS_ASSERT_GIANT(MP) do \ { \ struct mount *_mp; \ _mp = (MP); \ if (VFS_NEEDSGIANT_(_mp)) \ mtx_assert(&Giant, MA_OWNED); \ } while (0) #define VFS_KNOTE_LOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 1); \ } while (0) #define VFS_KNOTE_UNLOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 0); \ } while (0) #include /* * Version numbers. */ #define VFS_VERSION_00 0x19660120 #define VFS_VERSION VFS_VERSION_00 #define VFS_SET(vfsops, fsname, flags) \ static struct vfsconf fsname ## _vfsconf = { \ .vfc_version = VFS_VERSION, \ .vfc_name = #fsname, \ .vfc_vfsops = &vfsops, \ .vfc_typenum = -1, \ .vfc_flags = flags, \ }; \ static moduledata_t fsname ## _mod = { \ #fsname, \ vfs_modevent, \ & fsname ## _vfsconf \ }; \ DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE) extern char *mountrootfsname; /* * exported vnode operations */ int dounmount(struct mount *, int, struct thread *); int kernel_mount(struct mntarg *ma, int flags); int kernel_vmount(int flags, ...); struct mntarg *mount_arg(struct mntarg *ma, const char *name, const void *val, int len); struct mntarg *mount_argb(struct mntarg *ma, int flag, const char *name); struct mntarg *mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...); struct mntarg *mount_argsu(struct mntarg *ma, const char *name, const void *val, int len); void statfs_scale_blocks(struct statfs *sf, long max_size); struct vfsconf *vfs_byname(const char *); struct vfsconf *vfs_byname_kld(const char *, struct thread *td, int *); void vfs_mount_destroy(struct mount *); void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); void vfs_freeopts(struct vfsoptlist *opts); void vfs_deleteopt(struct vfsoptlist *opts, const char *name); int vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val); int vfs_getopt(struct vfsoptlist *, const char *, void **, int *); char *vfs_getopts(struct vfsoptlist *, const char *, int *error); int vfs_copyopt(struct vfsoptlist *, const char *, void *, int); int vfs_filteropt(struct vfsoptlist *, const char **legal); int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...); int vfs_setpublicfs /* set publicly exported fs */ (struct mount *, struct netexport *, struct export_args *); void vfs_msync(struct mount *, int); int vfs_busy(struct mount *, int); int vfs_export /* process mount export info */ (struct mount *, struct export_args *); int vfs_allocate_syncvnode(struct mount *); int vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions); void vfs_getnewfsid(struct mount *); struct cdev *vfs_getrootfsid(struct mount *); struct mount *vfs_getvfs(fsid_t *); /* return vfs given fsid */ +struct mount *vfs_busyfs(fsid_t *); int vfs_modevent(module_t, int, void *); void vfs_mount_error(struct mount *, const char *, ...); void vfs_mountroot(void); /* mount our root filesystem */ void vfs_mountedfrom(struct mount *, const char *from); void vfs_ref(struct mount *); void vfs_rel(struct mount *); struct mount *vfs_mount_alloc(struct vnode *, struct vfsconf *, const char *, struct ucred *); int vfs_suser(struct mount *, struct thread *); void vfs_unbusy(struct mount *); void vfs_unmountall(void); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx mountlist_mtx; extern struct nfs_public nfs_pub; /* * Declarations for these vfs default operations are located in * kern/vfs_default.c, they should be used instead of making "dummy" * functions or casting entries in the VFS op table to "enopnotsupp()". */ vfs_root_t vfs_stdroot; vfs_quotactl_t vfs_stdquotactl; vfs_statfs_t vfs_stdstatfs; vfs_sync_t vfs_stdsync; vfs_sync_t vfs_stdnosync; vfs_vget_t vfs_stdvget; vfs_fhtovp_t vfs_stdfhtovp; vfs_checkexp_t vfs_stdcheckexp; vfs_init_t vfs_stdinit; vfs_uninit_t vfs_stduninit; vfs_extattrctl_t vfs_stdextattrctl; vfs_sysctl_t vfs_stdsysctl; #else /* !_KERNEL */ #include struct stat; __BEGIN_DECLS int fhopen(const struct fhandle *, int); int fhstat(const struct fhandle *, struct stat *); int fhstatfs(const struct fhandle *, struct statfs *); int fstatfs(int, struct statfs *); int getfh(const char *, fhandle_t *); int getfsstat(struct statfs *, long, int); int getmntinfo(struct statfs **, int); int lgetfh(const char *, fhandle_t *); int mount(const char *, const char *, int, void *); int nmount(struct iovec *, unsigned int, int); int statfs(const char *, struct statfs *); int unmount(const char *, int); /* C library stuff */ int getvfsbyname(const char *, struct xvfsconf *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_MOUNT_H_ */